All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 00/24] Support Host Trap Sampling for MI200
@ 2023-11-03 13:11 James Zhu
  2023-11-03 13:11 ` [PATCH 01/24] drm/amdkfd/kfd_ioctl: add pc sampling support James Zhu
                   ` (24 more replies)
  0 siblings, 25 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

PC sampling is a form of software profiling in which the threads of an
application are periodically interrupted and the program counter of the
instruction each thread is about to execute is saved for profiling.

David Yat Sin (5):
  drm/amdkfd/kfd_ioctl: add pc sampling support
  drm/amdkfd: add pc sampling support
  drm/amdkfd: enable pc sampling query
  drm/amdkfd: enable pc sampling create
  drm/amdkfd: add pc sampling capability check

James Zhu (19):
  drm/amdkfd: add pc sampling mutex
  drm/amdkfd: add trace_id return
  drm/amdkfd: check pcs_enrty valid
  drm/amdkfd: enable pc sampling destroy
  drm/amdkfd: add interface to trigger pc sampling trap
  drm/amdkfd: trigger pc sampling trap for gfx v9
  drm/amdkfd/gfx9: enable host trap
  drm/amdgpu: use trapID 4 for host trap
  drm/amdgpu: add sq host trap status check
  drm/amdkfd: trigger pc sampling trap for arcturus
  drm/amdkfd: trigger pc sampling trap for aldebaran
  drm/amdkfd: use bit operation set debug trap
  drm/amdkfd: add setting trap pc sampling flag
  drm/amdkfd: enable pc sampling start
  drm/amdkfd: enable pc sampling stop
  drm/amdkfd: enable pc sampling work to trigger trap
  drm/amdkfd: add queue remapping
  drm/amdkfd: add pc sampling release when process release
  drm/amdkfd: bump kfd ioctl minor version for pc sampling availability

 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |   11 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |   14 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |   72 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |    7 +
 drivers/gpu/drm/amd/amdkfd/Makefile           |    3 +-
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 2106 +++++++++--------
 .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm |   29 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |   44 +
 drivers/gpu/drm/amd/amdkfd/kfd_device.c       |   17 +
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |   11 +
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |    5 +
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c  |  348 +++
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h  |   36 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   43 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   32 +-
 .../amd/include/asic_reg/gc/gc_9_0_offset.h   |    2 +
 .../amd/include/asic_reg/gc/gc_9_0_sh_mask.h  |    5 +
 .../gpu/drm/amd/include/kgd_kfd_interface.h   |    6 +
 include/uapi/linux/kfd_ioctl.h                |   60 +-
 19 files changed, 1792 insertions(+), 1059 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h

-- 
2.25.1


^ permalink raw reply	[flat|nested] 80+ messages in thread

* [PATCH 01/24] drm/amdkfd/kfd_ioctl: add pc sampling support
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-22 21:14   ` Felix Kuehling
  2023-11-27 19:11   ` Alex Deucher
  2023-11-03 13:11 ` [PATCH 02/24] drm/amdkfd: " James Zhu
                   ` (23 subsequent siblings)
  24 siblings, 2 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

From: David Yat Sin <david.yatsin@amd.com>

Add pc sampling support in kfd_ioctl.

Co-developed-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: David Yat Sin <david.yatsin@amd.com>
---
 include/uapi/linux/kfd_ioctl.h | 57 +++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index f0ed68974c54..5202e29c9560 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -1446,6 +1446,58 @@ struct kfd_ioctl_dbg_trap_args {
 	};
 };
 
+/**
+ * enum kfd_ioctl_pc_sample_op - PC Sampling ioctl operations
+ *
+ * @KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES: Query device PC Sampling capabilities
+ * @KFD_IOCTL_PCS_OP_CREATE:             Register this process with a per-device PC sampler instance
+ * @KFD_IOCTL_PCS_OP_DESTROY:            Unregister from a previously registered PC sampler instance
+ * @KFD_IOCTL_PCS_OP_START:              Process begins taking samples from a previously registered PC sampler instance
+ * @KFD_IOCTL_PCS_OP_STOP:               Process stops taking samples from a previously registered PC sampler instance
+ */
+enum kfd_ioctl_pc_sample_op {
+	KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES,
+	KFD_IOCTL_PCS_OP_CREATE,
+	KFD_IOCTL_PCS_OP_DESTROY,
+	KFD_IOCTL_PCS_OP_START,
+	KFD_IOCTL_PCS_OP_STOP,
+};
+
+/* Values have to be a power of 2 */
+#define KFD_IOCTL_PCS_FLAG_POWER_OF_2 0x00000001
+
+enum kfd_ioctl_pc_sample_method {
+	KFD_IOCTL_PCS_METHOD_HOSTTRAP = 1,
+	KFD_IOCTL_PCS_METHOD_STOCHASTIC,
+};
+
+enum kfd_ioctl_pc_sample_type {
+	KFD_IOCTL_PCS_TYPE_TIME_US,
+	KFD_IOCTL_PCS_TYPE_CLOCK_CYCLES,
+	KFD_IOCTL_PCS_TYPE_INSTRUCTIONS
+};
+
+struct kfd_pc_sample_info {
+	__u64 value;         /* [IN] if PCS_TYPE_TIME_US: sample interval in us
+	                      * if PCS_TYPE_CLOCK_CYCLES: sample interval in graphics core clk cycles
+	                      * if PCS_TYPE_INSTRUCTIONS: sample interval in instructions issued by
+	                      * graphics compute units
+	                      */
+	__u64 value_min;     /* [OUT] */
+	__u64 value_max;     /* [OUT] */
+	__u64 flags;         /* [OUT] indicate potential restrictions e.g FLAG_POWER_OF_2 */
+	__u32 method;        /* [IN/OUT] kfd_ioctl_pc_sample_method */
+	__u32 type;          /* [IN/OUT] kfd_ioctl_pc_sample_type */
+};
+
+struct kfd_ioctl_pc_sample_args {
+	__u64 sample_info_ptr;   /* array of kfd_pc_sample_info */
+	__u32 num_sample_info;
+	__u32 op;                /* kfd_ioctl_pc_sample_op */
+	__u32 gpu_id;
+	__u32 trace_id;
+};
+
 #define AMDKFD_IOCTL_BASE 'K'
 #define AMDKFD_IO(nr)			_IO(AMDKFD_IOCTL_BASE, nr)
 #define AMDKFD_IOR(nr, type)		_IOR(AMDKFD_IOCTL_BASE, nr, type)
@@ -1566,7 +1618,10 @@ struct kfd_ioctl_dbg_trap_args {
 #define AMDKFD_IOC_DBG_TRAP			\
 		AMDKFD_IOWR(0x26, struct kfd_ioctl_dbg_trap_args)
 
+#define AMDKFD_IOC_PC_SAMPLE		\
+		AMDKFD_IOWR(0x27, struct kfd_ioctl_pc_sample_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x27
+#define AMDKFD_COMMAND_END		0x28
 
 #endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 02/24] drm/amdkfd: add pc sampling support
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
  2023-11-03 13:11 ` [PATCH 01/24] drm/amdkfd/kfd_ioctl: add pc sampling support James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-03 13:11 ` [PATCH 03/24] drm/amdkfd: enable pc sampling query James Zhu
                   ` (22 subsequent siblings)
  24 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

From: David Yat Sin <david.yatsin@amd.com>

Add pc sampling functions in amdkfd.

Co-developed-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: David Yat Sin <david.yatsin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/Makefile          |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c     | 36 +++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 78 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h | 34 +++++++++
 4 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index a5ae7bcf44eb..790fd028a681 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -57,7 +57,8 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_int_process_v11.o \
 		$(AMDKFD_PATH)/kfd_smi_events.o \
 		$(AMDKFD_PATH)/kfd_crat.o \
-		$(AMDKFD_PATH)/kfd_debug.o
+		$(AMDKFD_PATH)/kfd_debug.o \
+		$(AMDKFD_PATH)/kfd_pc_sampling.o
 
 ifneq ($(CONFIG_DEBUG_FS),)
 AMDKFD_FILES += $(AMDKFD_PATH)/kfd_debugfs.o
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 06988cf1db51..b00390e451bf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -41,6 +41,7 @@
 #include "kfd_priv.h"
 #include "kfd_device_queue_manager.h"
 #include "kfd_svm.h"
+#include "kfd_pc_sampling.h"
 #include "amdgpu_amdkfd.h"
 #include "kfd_smi_events.h"
 #include "amdgpu_dma_buf.h"
@@ -1750,6 +1751,38 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
 }
 #endif
 
+static int kfd_ioctl_pc_sample(struct file *filep,
+				   struct kfd_process *p, void __user *data)
+{
+	struct kfd_ioctl_pc_sample_args *args = data;
+	struct kfd_process_device *pdd;
+	int ret;
+
+	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+		pr_err("PC Sampling does not support sched_policy %i", sched_policy);
+		return -EINVAL;
+	}
+
+	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+
+	if (!pdd) {
+		pr_debug("could not find gpu id 0x%x.", args->gpu_id);
+		ret = -EINVAL;
+	} else {
+		pdd = kfd_bind_process_to_device(pdd->dev, p);
+		if (IS_ERR(pdd)) {
+			pr_debug("failed to bind process %p with gpu id 0x%x", p, args->gpu_id);
+			ret = -ESRCH;
+		} else {
+			ret = kfd_pc_sample(pdd, args);
+		}
+	}
+	mutex_unlock(&p->mutex);
+
+	return ret;
+}
+
 static int criu_checkpoint_process(struct kfd_process *p,
 			     uint8_t __user *user_priv_data,
 			     uint64_t *priv_offset)
@@ -3224,6 +3257,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP,
 			kfd_ioctl_set_debug_trap, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_PC_SAMPLE,
+			kfd_ioctl_pc_sample, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
new file mode 100644
index 000000000000..a7e78ff42d07
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "kfd_priv.h"
+#include "amdgpu_amdkfd.h"
+#include "kfd_pc_sampling.h"
+
+static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
+					struct kfd_ioctl_pc_sample_args __user *user_args)
+{
+	return -EINVAL;
+}
+
+static int kfd_pc_sample_start(struct kfd_process_device *pdd)
+{
+	return -EINVAL;
+}
+
+static int kfd_pc_sample_stop(struct kfd_process_device *pdd)
+{
+	return -EINVAL;
+
+}
+
+static int kfd_pc_sample_create(struct kfd_process_device *pdd,
+					struct kfd_ioctl_pc_sample_args __user *user_args)
+{
+	return -EINVAL;
+}
+
+static int kfd_pc_sample_destroy(struct kfd_process_device *pdd, uint32_t trace_id)
+{
+	return -EINVAL;
+
+}
+
+int kfd_pc_sample(struct kfd_process_device *pdd,
+					struct kfd_ioctl_pc_sample_args __user *args)
+{
+	switch (args->op) {
+	case KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES:
+		return kfd_pc_sample_query_cap(pdd, args);
+
+	case KFD_IOCTL_PCS_OP_CREATE:
+		return kfd_pc_sample_create(pdd, args);
+
+	case KFD_IOCTL_PCS_OP_DESTROY:
+		return kfd_pc_sample_destroy(pdd, args->trace_id);
+
+	case KFD_IOCTL_PCS_OP_START:
+		return kfd_pc_sample_start(pdd);
+
+	case KFD_IOCTL_PCS_OP_STOP:
+		return kfd_pc_sample_stop(pdd);
+	}
+
+	return -EINVAL;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
new file mode 100644
index 000000000000..4eeded4ea5b6
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef KFD_PC_SAMPLING_H_
+#define KFD_PC_SAMPLING_H_
+
+#include "amdgpu.h"
+#include "kfd_priv.h"
+
+int kfd_pc_sample(struct kfd_process_device *pdd,
+					struct kfd_ioctl_pc_sample_args __user *args);
+
+#endif /* KFD_PC_SAMPLING_H_ */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 03/24] drm/amdkfd: enable pc sampling query
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
  2023-11-03 13:11 ` [PATCH 01/24] drm/amdkfd/kfd_ioctl: add pc sampling support James Zhu
  2023-11-03 13:11 ` [PATCH 02/24] drm/amdkfd: " James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-10 19:04   ` Yat Sin, David
  2023-11-20 15:34   ` [PATCH v2 " James Zhu
  2023-11-03 13:11 ` [PATCH 04/24] drm/amdkfd: add pc sampling mutex James Zhu
                   ` (21 subsequent siblings)
  24 siblings, 2 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

From: David Yat Sin <david.yatsin@amd.com>

Enable the pc sampling query operation, which reports the sampling
capabilities supported by a device.

Co-developed-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: David Yat Sin <david.yatsin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 54 +++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index a7e78ff42d07..49fecbc7013e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -25,10 +25,62 @@
 #include "amdgpu_amdkfd.h"
 #include "kfd_pc_sampling.h"
 
+struct supported_pc_sample_info {
+	uint32_t ip_version;
+	const struct kfd_pc_sample_info *sample_info;
+};
+
+const struct kfd_pc_sample_info sample_info_hosttrap_9_0_0 = {
+	0, 1, ~0ULL, 0, KFD_IOCTL_PCS_METHOD_HOSTTRAP, KFD_IOCTL_PCS_TYPE_TIME_US };
+
+struct supported_pc_sample_info supported_formats[] = {
+	{ IP_VERSION(9, 4, 1), &sample_info_hosttrap_9_0_0 },
+	{ IP_VERSION(9, 4, 2), &sample_info_hosttrap_9_0_0 },
+};
+
 static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
 					struct kfd_ioctl_pc_sample_args __user *user_args)
 {
-	return -EINVAL;
+	uint64_t sample_offset;
+	int num_method = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(supported_formats); i++)
+		if (KFD_GC_VERSION(pdd->dev) == supported_formats[i].ip_version)
+			num_method++;
+
+	if (!num_method) {
+		pr_debug("PC Sampling not supported on GC_HWIP:0x%x.",
+			pdd->dev->adev->ip_versions[GC_HWIP][0]);
+		return -EOPNOTSUPP;
+	}
+
+	if (!user_args->sample_info_ptr) {
+		user_args->num_sample_info = num_method;
+		return 0;
+	}
+
+	if (user_args->num_sample_info < num_method) {
+		user_args->num_sample_info = num_method;
+		pr_debug("Sample info buffer is not large enough, "
+			 "ASIC requires space for %d kfd_pc_sample_info entries.", num_method);
+		return -ENOSPC;
+	}
+
+	sample_offset = user_args->sample_info_ptr;
+	for (i = 0; i < ARRAY_SIZE(supported_formats); i++) {
+		if (KFD_GC_VERSION(pdd->dev) == supported_formats[i].ip_version) {
+			int ret = copy_to_user((void __user *) sample_offset,
+				supported_formats[i].sample_info, sizeof(struct kfd_pc_sample_info));
+			if (ret) {
+				pr_debug("Failed to copy PC sampling info to user.");
+				return -EFAULT;
+			}
+			sample_offset += sizeof(struct kfd_pc_sample_info);
+		}
+	}
+
+	return 0;
 }
 
 static int kfd_pc_sample_start(struct kfd_process_device *pdd)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 04/24] drm/amdkfd: add pc sampling mutex
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (2 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 03/24] drm/amdkfd: enable pc sampling query James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-03 13:11 ` [PATCH 05/24] drm/amdkfd: enable pc sampling create James Zhu
                   ` (20 subsequent siblings)
  24 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Add a per-node pc sampling mutex; initialize it in node init and destroy
it in node cleanup.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 12 ++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h   |  7 +++++++
 2 files changed, 19 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0a9cf9dfc224..0e24e011f66b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -533,6 +533,16 @@ static void kfd_smi_init(struct kfd_node *dev)
 	spin_lock_init(&dev->smi_lock);
 }
 
+static void kfd_pc_sampling_init(struct kfd_node *dev)
+{
+	mutex_init(&dev->pcs_data.mutex);
+}
+
+static void kfd_pc_sampling_exit(struct kfd_node *dev)
+{
+	mutex_destroy(&dev->pcs_data.mutex);
+}
+
 static int kfd_init_node(struct kfd_node *node)
 {
 	int err = -1;
@@ -563,6 +573,7 @@ static int kfd_init_node(struct kfd_node *node)
 	}
 
 	kfd_smi_init(node);
+	kfd_pc_sampling_init(node);
 
 	return 0;
 
@@ -593,6 +604,7 @@ static void kfd_cleanup_nodes(struct kfd_dev *kfd, unsigned int num_nodes)
 		kfd_topology_remove_device(knode);
 		if (knode->gws)
 			amdgpu_amdkfd_free_gws(knode->adev, knode->gws);
+		kfd_pc_sampling_exit(knode);
 		kfree(knode);
 		kfd->nodes[i] = NULL;
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 9cc32f577e38..4a0b66189c67 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -256,6 +256,11 @@ struct kfd_vmid_info {
 
 struct kfd_dev;
 
+/* Per device PC Sampling data */
+struct kfd_dev_pc_sampling {
+	struct mutex mutex;
+};
+
 struct kfd_node {
 	unsigned int node_id;
 	struct amdgpu_device *adev;     /* Duplicated here along with keeping
@@ -309,6 +314,8 @@ struct kfd_node {
 	struct kfd_local_mem_info local_mem_info;
 
 	struct kfd_dev *kfd;
+
+	struct kfd_dev_pc_sampling pcs_data;
 };
 
 struct kfd_dev {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 05/24] drm/amdkfd: enable pc sampling create
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (3 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 04/24] drm/amdkfd: add pc sampling mutex James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-22 21:51   ` Felix Kuehling
  2023-11-03 13:11 ` [PATCH 06/24] drm/amdkfd: add trace_id return James Zhu
                   ` (19 subsequent siblings)
  24 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

From: David Yat Sin <david.yatsin@amd.com>

Enable pc sampling create.

Co-developed-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: David Yat Sin <david.yatsin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 54 +++++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h        | 10 ++++
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index 49fecbc7013e..f0d910ee730c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -97,7 +97,59 @@ static int kfd_pc_sample_stop(struct kfd_process_device *pdd)
 static int kfd_pc_sample_create(struct kfd_process_device *pdd,
 					struct kfd_ioctl_pc_sample_args __user *user_args)
 {
-	return -EINVAL;
+	struct kfd_pc_sample_info *supported_format = NULL;
+	struct kfd_pc_sample_info user_info;
+	int ret;
+	int i;
+
+	if (user_args->num_sample_info != 1)
+		return -EINVAL;
+
+	ret = copy_from_user(&user_info, (void __user *) user_args->sample_info_ptr,
+				sizeof(struct kfd_pc_sample_info));
+	if (ret) {
+		pr_debug("Failed to copy PC sampling info from user\n");
+		return -EFAULT;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(supported_formats); i++) {
+		if (KFD_GC_VERSION(pdd->dev) == supported_formats[i].ip_version
+			&& user_info.method == supported_formats[i].sample_info->method
+			&& user_info.type == supported_formats[i].sample_info->type
+			&& user_info.value <= supported_formats[i].sample_info->value_max
+			&& user_info.value >= supported_formats[i].sample_info->value_min) {
+			supported_format =
+				(struct kfd_pc_sample_info *)supported_formats[i].sample_info;
+			break;
+		}
+	}
+
+	if (!supported_format) {
+		pr_debug("Sampling format is not supported!");
+		return -EOPNOTSUPP;
+	}
+
+	mutex_lock(&pdd->dev->pcs_data.mutex);
+	if (pdd->dev->pcs_data.hosttrap_entry.base.use_count &&
+		memcmp(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
+				&user_info, sizeof(user_info))) {
+		ret = copy_to_user((void __user *) user_args->sample_info_ptr,
+			&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
+			sizeof(struct kfd_pc_sample_info));
+		mutex_unlock(&pdd->dev->pcs_data.mutex);
+		return ret ? ret : -EEXIST;
+	}
+
+	/* TODO: add trace_id return */
+
+	if (!pdd->dev->pcs_data.hosttrap_entry.base.use_count)
+		memcpy(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
+				&user_info, sizeof(user_info));
+
+	pdd->dev->pcs_data.hosttrap_entry.base.use_count++;
+	mutex_unlock(&pdd->dev->pcs_data.mutex);
+
+	return 0;
 }
 
 static int kfd_pc_sample_destroy(struct kfd_process_device *pdd, uint32_t trace_id)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 4a0b66189c67..81c925fb2952 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -256,9 +256,19 @@ struct kfd_vmid_info {
 
 struct kfd_dev;
 
+struct kfd_dev_pc_sampling_data {
+	uint32_t use_count;         /* Num of PC sampling sessions */
+	struct kfd_pc_sample_info pc_sample_info;
+};
+
+struct kfd_dev_pcs_hosttrap {
+	struct kfd_dev_pc_sampling_data base;
+};
+
 /* Per device PC Sampling data */
 struct kfd_dev_pc_sampling {
 	struct mutex mutex;
+	struct kfd_dev_pcs_hosttrap hosttrap_entry;
 };
 
 struct kfd_node {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 06/24] drm/amdkfd: add trace_id return
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (4 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 05/24] drm/amdkfd: enable pc sampling create James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-22 21:56   ` Felix Kuehling
  2023-11-22 22:21   ` Felix Kuehling
  2023-11-03 13:11 ` [PATCH 07/24] drm/amdkfd: check pcs_enrty valid James Zhu
                   ` (18 subsequent siblings)
  24 siblings, 2 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Return a trace_id for each new pc sampling session created on a device.
Use an IDR to quickly look up the pc_sampling_entry by trace_id.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c      |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 20 +++++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  6 ++++++
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0e24e011f66b..bcaeedac8fe0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -536,10 +536,12 @@ static void kfd_smi_init(struct kfd_node *dev)
 static void kfd_pc_sampling_init(struct kfd_node *dev)
 {
 	mutex_init(&dev->pcs_data.mutex);
+	idr_init_base(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr, 1);
 }
 
 static void kfd_pc_sampling_exit(struct kfd_node *dev)
 {
+	idr_destroy(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr);
 	mutex_destroy(&dev->pcs_data.mutex);
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index f0d910ee730c..4c9fc48e1a6a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -99,6 +99,7 @@ static int kfd_pc_sample_create(struct kfd_process_device *pdd,
 {
 	struct kfd_pc_sample_info *supported_format = NULL;
 	struct kfd_pc_sample_info user_info;
+	struct pc_sampling_entry *pcs_entry;
 	int ret;
 	int i;
 
@@ -140,7 +141,19 @@ static int kfd_pc_sample_create(struct kfd_process_device *pdd,
 		return ret ? ret : -EEXIST;
 	}
 
-	/* TODO: add trace_id return */
+	pcs_entry = kvzalloc(sizeof(*pcs_entry), GFP_KERNEL);
+	if (!pcs_entry) {
+		mutex_unlock(&pdd->dev->pcs_data.mutex);
+		return -ENOMEM;
+	}
+
+	i = idr_alloc_cyclic(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr,
+				pcs_entry, 1, 0, GFP_KERNEL);
+	if (i < 0) {
+		mutex_unlock(&pdd->dev->pcs_data.mutex);
+		kvfree(pcs_entry);
+		return i;
+	}
 
 	if (!pdd->dev->pcs_data.hosttrap_entry.base.use_count)
 		memcpy(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
@@ -149,6 +162,11 @@ static int kfd_pc_sample_create(struct kfd_process_device *pdd,
 	pdd->dev->pcs_data.hosttrap_entry.base.use_count++;
 	mutex_unlock(&pdd->dev->pcs_data.mutex);
 
+	pcs_entry->pdd = pdd;
+	user_args->trace_id = (uint32_t)i;
+
+	pr_debug("alloc pcs_entry = %p, trace_id = 0x%x on gpu 0x%x", pcs_entry, i, pdd->dev->id);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 81c925fb2952..642558026d16 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -258,6 +258,7 @@ struct kfd_dev;
 
 struct kfd_dev_pc_sampling_data {
 	uint32_t use_count;         /* Num of PC sampling sessions */
+	struct idr pc_sampling_idr;
 	struct kfd_pc_sample_info pc_sample_info;
 };
 
@@ -743,6 +744,11 @@ enum kfd_pdd_bound {
  */
 #define SDMA_ACTIVITY_DIVISOR  100
 
+struct pc_sampling_entry {
+	bool enabled;
+	struct kfd_process_device *pdd;
+};
+
 /* Data that is per-process-per device. */
 struct kfd_process_device {
 	/* The device that owns this data. */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 07/24] drm/amdkfd: check pcs_enrty valid
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (5 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 06/24] drm/amdkfd: add trace_id return James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-10 19:09   ` Yat Sin, David
                     ` (2 more replies)
  2023-11-03 13:11 ` [PATCH 08/24] drm/amdkfd: enable pc sampling destroy James Zhu
                   ` (17 subsequent siblings)
  24 siblings, 3 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Check that pcs_entry is valid for the pc sampling ioctl.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 30 ++++++++++++++++++--
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index 4c9fc48e1a6a..36366c8847de 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -179,6 +179,21 @@ static int kfd_pc_sample_destroy(struct kfd_process_device *pdd, uint32_t trace_
 int kfd_pc_sample(struct kfd_process_device *pdd,
 					struct kfd_ioctl_pc_sample_args __user *args)
 {
+	struct pc_sampling_entry *pcs_entry;
+
+	if (args->op != KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES &&
+		args->op != KFD_IOCTL_PCS_OP_CREATE) {
+
+		mutex_lock(&pdd->dev->pcs_data.mutex);
+		pcs_entry = idr_find(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr,
+				args->trace_id);
+		mutex_unlock(&pdd->dev->pcs_data.mutex);
+
+		if (!pcs_entry ||
+			pcs_entry->pdd != pdd)
+			return -EINVAL;
+	}
+
 	switch (args->op) {
 	case KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES:
 		return kfd_pc_sample_query_cap(pdd, args);
@@ -187,13 +202,22 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
 		return kfd_pc_sample_create(pdd, args);
 
 	case KFD_IOCTL_PCS_OP_DESTROY:
-		return kfd_pc_sample_destroy(pdd, args->trace_id);
+		if (pcs_entry->enabled)
+			return -EBUSY;
+		else
+			return kfd_pc_sample_destroy(pdd, args->trace_id);
 
 	case KFD_IOCTL_PCS_OP_START:
-		return kfd_pc_sample_start(pdd);
+		if (pcs_entry->enabled)
+			return -EALREADY;
+		else
+			return kfd_pc_sample_start(pdd);
 
 	case KFD_IOCTL_PCS_OP_STOP:
-		return kfd_pc_sample_stop(pdd);
+		if (!pcs_entry->enabled)
+			return -EALREADY;
+		else
+			return kfd_pc_sample_stop(pdd);
 	}
 
 	return -EINVAL;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 08/24] drm/amdkfd: enable pc sampling destroy
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (6 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 07/24] drm/amdkfd: check pcs_enrty valid James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-03 13:11 ` [PATCH 09/24] drm/amdkfd: add interface to trigger pc sampling trap James Zhu
                   ` (16 subsequent siblings)
  24 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

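Enable pc sampling destroy: remove the trace entry from the idr,
decrement the use count (clearing the stored pc sample info once
the count reaches zero), and free the entry.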

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index 36366c8847de..60b29b245db5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -170,10 +170,24 @@ static int kfd_pc_sample_create(struct kfd_process_device *pdd,
 	return 0;
 }
 
-static int kfd_pc_sample_destroy(struct kfd_process_device *pdd, uint32_t trace_id)
+static int kfd_pc_sample_destroy(struct kfd_process_device *pdd, uint32_t trace_id,
+					struct pc_sampling_entry *pcs_entry)
 {
-	return -EINVAL;
+	pr_debug("free pcs_entry = %p, trace_id = 0x%x on gpu 0x%x",
+		pcs_entry, trace_id, pdd->dev->id);
+
+	mutex_lock(&pdd->dev->pcs_data.mutex);
+	pdd->dev->pcs_data.hosttrap_entry.base.use_count--;
+	idr_remove(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr, trace_id);
 
+	if (!pdd->dev->pcs_data.hosttrap_entry.base.use_count)
+		memset(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info, 0x0,
+			sizeof(struct kfd_pc_sample_info));
+	mutex_unlock(&pdd->dev->pcs_data.mutex);
+
+	kvfree(pcs_entry);
+
+	return 0;
 }
 
 int kfd_pc_sample(struct kfd_process_device *pdd,
@@ -205,7 +219,7 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
 		if (pcs_entry->enabled)
 			return -EBUSY;
 		else
-			return kfd_pc_sample_destroy(pdd, args->trace_id);
+			return kfd_pc_sample_destroy(pdd, args->trace_id, pcs_entry);
 
 	case KFD_IOCTL_PCS_OP_START:
 		if (pcs_entry->enabled)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 09/24] drm/amdkfd: add interface to trigger pc sampling trap
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (7 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 08/24] drm/amdkfd: enable pc sampling destroy James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-03 13:11 ` [PATCH 10/24] drm/amdkfd: trigger pc sampling trap for gfx v9 James Zhu
                   ` (15 subsequent siblings)
  24 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Add interface to trigger pc sampling trap.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 6d094cf3587d..05b0255aca37 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -33,6 +33,7 @@
 #include <linux/dma-fence.h>
 #include "amdgpu_irq.h"
 #include "amdgpu_gfx.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 struct pci_dev;
 struct amdgpu_device;
@@ -318,6 +319,11 @@ struct kfd2kgd_calls {
 	void (*program_trap_handler_settings)(struct amdgpu_device *adev,
 			uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
 			uint32_t inst);
+	uint32_t (*trigger_pc_sample_trap)(struct amdgpu_device *adev,
+					uint32_t vmid,
+					uint32_t *target_simd,
+					uint32_t *target_wave_slot,
+					enum kfd_ioctl_pc_sample_method method);
 };
 
 #endif	/* KGD_KFD_INTERFACE_H_INCLUDED */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 10/24] drm/amdkfd: trigger pc sampling trap for gfx v9
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (8 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 09/24] drm/amdkfd: add interface to trigger pc sampling trap James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-10 19:08   ` Yat Sin, David
  2023-11-20 16:05   ` [PATCH v2 " James Zhu
  2023-11-03 13:11 ` [PATCH 11/24] drm/amdkfd/gfx9: enable host trap James Zhu
                   ` (14 subsequent siblings)
  24 siblings, 2 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Implement trigger pc sampling trap for gfx v9.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 35 +++++++++++++++++++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  7 ++++
 2 files changed, 42 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 51011e8ee90d..723fef2d45d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -1146,6 +1146,41 @@ void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
 	kgd_gfx_v9_unlock_srbm(adev, inst);
 }
 
+uint32_t kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,
+					    uint32_t vmid,
+					    uint32_t max_wave_slot,
+					    uint32_t max_simd,
+					    uint32_t *target_simd,
+					    uint32_t *target_wave_slot,
+					    enum kfd_ioctl_pc_sample_method method)
+{
+	if (method == KFD_IOCTL_PCS_METHOD_HOSTTRAP) {
+		uint32_t value = 0;
+
+		value = REG_SET_FIELD(value, SQ_CMD, CMD, SQ_IND_CMD_CMD_TRAP);
+		value = REG_SET_FIELD(value, SQ_CMD, MODE, SQ_IND_CMD_MODE_SINGLE);
+
+		/* select *target_simd */
+		value = REG_SET_FIELD(value, SQ_CMD, SIMD_ID, *target_simd);
+		/* select *target_wave_slot */
+		value = REG_SET_FIELD(value, SQ_CMD, WAVE_ID, (*target_wave_slot)++);
+
+		mutex_lock(&adev->grbm_idx_mutex);
+		amdgpu_gfx_select_se_sh(adev, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0);
+		WREG32_SOC15(GC, 0, mmSQ_CMD, value);
+		mutex_unlock(&adev->grbm_idx_mutex);
+
+		*target_wave_slot %= max_wave_slot;
+		if (!(*target_wave_slot)) {
+			(*target_simd)++;
+			*target_simd %= max_simd;
+		}
+	} else {
+		pr_debug("PC Sampling method %d not supported.", method);
+	}
+	return 0;
+}
+
 const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
 	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index ce424615f59b..b47b926891a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -101,3 +101,10 @@ void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
 					       uint32_t grace_period,
 					       uint32_t *reg_offset,
 					       uint32_t *reg_data);
+uint32_t kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,
+					    uint32_t vmid,
+					    uint32_t max_wave_slot,
+					    uint32_t max_simd,
+					    uint32_t *target_simd,
+					    uint32_t *target_wave_slot,
+					    enum kfd_ioctl_pc_sample_method method);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 11/24] drm/amdkfd/gfx9: enable host trap
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (9 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 10/24] drm/amdkfd: trigger pc sampling trap for gfx v9 James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-03 13:11 ` [PATCH 12/24] drm/amdgpu: use trapID 4 for " James Zhu
                   ` (13 subsequent siblings)
  24 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

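Enable host trap in the gfx9 first-level trap handler. When a host
trap is requested, check the host trap enable bit (bit 1) of the
first-level TMA enable flags: exit the trap if the bit is clear,
otherwise continue to the second-level trap handler.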

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 63 +++++++++++--------
 .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 24 ++++---
 2 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index d7cd5fa313ff..c16595680faa 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -274,14 +274,14 @@ static const uint32_t cwsr_trap_gfx8_hex[] = {
 
 
 static const uint32_t cwsr_trap_gfx9_hex[] = {
-	0xbf820001, 0xbf820258,
+	0xbf820001, 0xbf82025e,
 	0xb8f8f802, 0x8978ff78,
 	0x00020006, 0xb8fbf803,
 	0x866eff78, 0x00002000,
 	0xbf840009, 0x866eff6d,
 	0x00ff0000, 0xbf85001e,
 	0x866eff7b, 0x00000400,
-	0xbf850055, 0xbf8e0010,
+	0xbf85005b, 0xbf8e0010,
 	0xb8fbf803, 0xbf82fffa,
 	0x866eff7b, 0x03c00900,
 	0xbf850015, 0x866eff7b,
@@ -294,7 +294,7 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
 	0xbf850007, 0xb8eef801,
 	0x866eff6e, 0x00000800,
 	0xbf850003, 0x866eff7b,
-	0x00000400, 0xbf85003a,
+	0x00000400, 0xbf850040,
 	0xb8faf807, 0x867aff7a,
 	0x001f8000, 0x8e7a8b7a,
 	0x8977ff77, 0xfc000000,
@@ -303,13 +303,16 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
 	0xb8fbf813, 0x8efa887a,
 	0xbf0d8f7b, 0xbf840002,
 	0x877bff7b, 0xffff0000,
-	0xc0031bbd, 0x00000010,
-	0xbf8cc07f, 0x8e6e976e,
-	0x8977ff77, 0x00800000,
-	0x87776e77, 0xc0071bbd,
-	0x00000000, 0xbf8cc07f,
+	0xc0031c3d, 0x00000010,
+	0xc0071bbd, 0x00000000,
 	0xc0071ebd, 0x00000008,
-	0xbf8cc07f, 0x86ee6e6e,
+	0xbf8cc07f, 0x8671ff6d,
+	0x01000000, 0xbf840004,
+	0x92f1ff70, 0x00010001,
+	0xbf840016, 0xbf820005,
+	0x86708170, 0x8e709770,
+	0x8977ff77, 0x00800000,
+	0x87777077, 0x86ee6e6e,
 	0xbf840001, 0xbe801d6e,
 	0x866eff6d, 0x01ff0000,
 	0xbf850005, 0x8778ff78,
@@ -1098,14 +1101,14 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 };
 
 static const uint32_t cwsr_trap_arcturus_hex[] = {
-	0xbf820001, 0xbf8202d4,
+	0xbf820001, 0xbf8202da,
 	0xb8f8f802, 0x8978ff78,
 	0x00020006, 0xb8fbf803,
 	0x866eff78, 0x00002000,
 	0xbf840009, 0x866eff6d,
 	0x00ff0000, 0xbf85001e,
 	0x866eff7b, 0x00000400,
-	0xbf850055, 0xbf8e0010,
+	0xbf85005b, 0xbf8e0010,
 	0xb8fbf803, 0xbf82fffa,
 	0x866eff7b, 0x03c00900,
 	0xbf850015, 0x866eff7b,
@@ -1118,7 +1121,7 @@ static const uint32_t cwsr_trap_arcturus_hex[] = {
 	0xbf850007, 0xb8eef801,
 	0x866eff6e, 0x00000800,
 	0xbf850003, 0x866eff7b,
-	0x00000400, 0xbf85003a,
+	0x00000400, 0xbf850040,
 	0xb8faf807, 0x867aff7a,
 	0x001f8000, 0x8e7a8b7a,
 	0x8977ff77, 0xfc000000,
@@ -1127,13 +1130,16 @@ static const uint32_t cwsr_trap_arcturus_hex[] = {
 	0xb8fbf813, 0x8efa887a,
 	0xbf0d8f7b, 0xbf840002,
 	0x877bff7b, 0xffff0000,
-	0xc0031bbd, 0x00000010,
-	0xbf8cc07f, 0x8e6e976e,
-	0x8977ff77, 0x00800000,
-	0x87776e77, 0xc0071bbd,
-	0x00000000, 0xbf8cc07f,
+	0xc0031c3d, 0x00000010,
+	0xc0071bbd, 0x00000000,
 	0xc0071ebd, 0x00000008,
-	0xbf8cc07f, 0x86ee6e6e,
+	0xbf8cc07f, 0x8671ff6d,
+	0x01000000, 0xbf840004,
+	0x92f1ff70, 0x00010001,
+	0xbf840016, 0xbf820005,
+	0x86708170, 0x8e709770,
+	0x8977ff77, 0x00800000,
+	0x87777077, 0x86ee6e6e,
 	0xbf840001, 0xbe801d6e,
 	0x866eff6d, 0x01ff0000,
 	0xbf850005, 0x8778ff78,
@@ -1578,14 +1584,14 @@ static const uint32_t cwsr_trap_arcturus_hex[] = {
 };
 
 static const uint32_t cwsr_trap_aldebaran_hex[] = {
-	0xbf820001, 0xbf8202df,
+	0xbf820001, 0xbf8202e5,
 	0xb8f8f802, 0x8978ff78,
 	0x00020006, 0xb8fbf803,
 	0x866eff78, 0x00002000,
 	0xbf840009, 0x866eff6d,
 	0x00ff0000, 0xbf85001e,
 	0x866eff7b, 0x00000400,
-	0xbf850055, 0xbf8e0010,
+	0xbf85005b, 0xbf8e0010,
 	0xb8fbf803, 0xbf82fffa,
 	0x866eff7b, 0x03c00900,
 	0xbf850015, 0x866eff7b,
@@ -1598,7 +1604,7 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = {
 	0xbf850007, 0xb8eef801,
 	0x866eff6e, 0x00000800,
 	0xbf850003, 0x866eff7b,
-	0x00000400, 0xbf85003a,
+	0x00000400, 0xbf850040,
 	0xb8faf807, 0x867aff7a,
 	0x001f8000, 0x8e7a8b7a,
 	0x8977ff77, 0xfc000000,
@@ -1607,13 +1613,16 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = {
 	0xb8fbf813, 0x8efa887a,
 	0xbf0d8f7b, 0xbf840002,
 	0x877bff7b, 0xffff0000,
-	0xc0031bbd, 0x00000010,
-	0xbf8cc07f, 0x8e6e976e,
-	0x8977ff77, 0x00800000,
-	0x87776e77, 0xc0071bbd,
-	0x00000000, 0xbf8cc07f,
+	0xc0031c3d, 0x00000010,
+	0xc0071bbd, 0x00000000,
 	0xc0071ebd, 0x00000008,
-	0xbf8cc07f, 0x86ee6e6e,
+	0xbf8cc07f, 0x8671ff6d,
+	0x01000000, 0xbf840004,
+	0x92f1ff70, 0x00010001,
+	0xbf840016, 0xbf820005,
+	0x86708170, 0x8e709770,
+	0x8977ff77, 0x00800000,
+	0x87777077, 0x86ee6e6e,
 	0xbf840001, 0xbe801d6e,
 	0x866eff6d, 0x01ff0000,
 	0xbf850005, 0x8778ff78,
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
index e506411ad28a..6880340c25af 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -104,6 +104,10 @@ var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK	= 0x1F8000
 
 var SQ_WAVE_MODE_DEBUG_EN_MASK		=   0x800
 
+var TMA_HOST_TRAP_EN_SHIFT               =   1
+var TMA_HOST_TRAP_EN_SIZE                =   1
+var TMA_HOST_TRAP_EN_BFE                 =   (TMA_HOST_TRAP_EN_SHIFT | (TMA_HOST_TRAP_EN_SIZE << 16))
+
 var TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT	=   26			// bits [31:26] unused by SPI debug data
 var TTMP_SAVE_RCNT_FIRST_REPLAY_MASK	=   0xFC000000
 var TTMP_DEBUG_TRAP_ENABLED_SHIFT	=   23
@@ -288,17 +292,21 @@ L_FETCH_2ND_TRAP:
     s_or_b32        ttmp15, ttmp15, 0xFFFF0000
 L_NO_SIGN_EXTEND_TMA:
 
-    s_load_dword    ttmp2, [ttmp14, ttmp15], 0x10 glc:1 // debug trap enabled flag
-    s_waitcnt       lgkmcnt(0)
-    s_lshl_b32      ttmp2, ttmp2, TTMP_DEBUG_TRAP_ENABLED_SHIFT
-    s_andn2_b32     s_save_ib_sts, s_save_ib_sts, TTMP_DEBUG_TRAP_ENABLED_MASK
-    s_or_b32        s_save_ib_sts, s_save_ib_sts, ttmp2
-
+    s_load_dword    ttmp4, [ttmp14, ttmp15], 0x10 glc:1 // enable flags from 1st level TMA
     s_load_dwordx2  [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA
-    s_waitcnt       lgkmcnt(0)
     s_load_dwordx2  [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA
     s_waitcnt       lgkmcnt(0)
-
+    s_and_b32       ttmp5, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK // host trap request
+    s_cbranch_scc0  L_NOT_HT
+    s_bfe_u32       ttmp5, ttmp4, TMA_HOST_TRAP_EN_BFE // extract host_trap_en to ttmp5[0]
+    s_cbranch_scc0  L_EXIT_TRAP // HT requested, but host traps not enabled
+    s_branch        L_GOTO_2ND_TRAP
+L_NOT_HT:
+    s_and_b32       ttmp4, ttmp4, 0x1 // debug_enable bit left over
+    s_lshl_b32      ttmp4, ttmp4, TTMP_DEBUG_TRAP_ENABLED_SHIFT
+    s_andn2_b32     s_save_ib_sts, s_save_ib_sts, TTMP_DEBUG_TRAP_ENABLED_MASK
+    s_or_b32        s_save_ib_sts, s_save_ib_sts, ttmp4
+L_GOTO_2ND_TRAP:
     s_and_b64       [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
     s_cbranch_scc0  L_NO_NEXT_TRAP // second-level trap handler not been set
     s_setpc_b64     [ttmp2, ttmp3] // jump to second-level trap handler
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 12/24] drm/amdgpu: use trapID 4 for host trap
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (10 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 11/24] drm/amdkfd/gfx9: enable host trap James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-20 16:08   ` [PATCH v2 " James Zhu
  2023-11-03 13:11 ` [PATCH 13/24] drm/amdgpu: add sq host trap status check James Zhu
                   ` (12 subsequent siblings)
  24 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

TRAPSTS.HOST_TRAP does not work pre-MI300, so use
TTMP1 (bit 24: HT, bits 16-23: trapID) to identify
the host trap.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |    2 +
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 2117 +++++++++--------
 .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm |    5 +
 3 files changed, 1070 insertions(+), 1054 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 723fef2d45d6..740d8a0c9252 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -1164,6 +1164,8 @@ uint32_t kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,
 		value = REG_SET_FIELD(value, SQ_CMD, SIMD_ID, *target_simd);
 		/* select *target_wave_slot */
 		value = REG_SET_FIELD(value, SQ_CMD, WAVE_ID, (*target_wave_slot)++);
+		/* set TrapID 4 for HOSTTRAP */
+		value = REG_SET_FIELD(value, SQ_CMD, DATA, 0x4);
 
 		mutex_lock(&adev->grbm_idx_mutex);
 		amdgpu_gfx_select_se_sh(adev, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0);
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index c16595680faa..8e55bb0bb0b8 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -274,155 +274,263 @@ static const uint32_t cwsr_trap_gfx8_hex[] = {
 
 
 static const uint32_t cwsr_trap_gfx9_hex[] = {
-	0xbf820001, 0xbf82025e,
+	0xbf820001, 0xbf820263,
 	0xb8f8f802, 0x8978ff78,
 	0x00020006, 0xb8fbf803,
 	0x866eff78, 0x00002000,
 	0xbf840009, 0x866eff6d,
-	0x00ff0000, 0xbf85001e,
+	0x00ff0000, 0xbf850023,
 	0x866eff7b, 0x00000400,
-	0xbf85005b, 0xbf8e0010,
+	0xbf850060, 0xbf8e0010,
 	0xb8fbf803, 0xbf82fffa,
 	0x866eff7b, 0x03c00900,
-	0xbf850015, 0x866eff7b,
-	0x000071ff, 0xbf840008,
-	0x866fff7b, 0x00007080,
-	0xbf840001, 0xbeee1a87,
-	0xb8eff801, 0x8e6e8c6e,
-	0x866e6f6e, 0xbf85000a,
-	0x866eff6d, 0x00ff0000,
-	0xbf850007, 0xb8eef801,
-	0x866eff6e, 0x00000800,
-	0xbf850003, 0x866eff7b,
-	0x00000400, 0xbf850040,
-	0xb8faf807, 0x867aff7a,
-	0x001f8000, 0x8e7a8b7a,
-	0x8977ff77, 0xfc000000,
-	0x87777a77, 0xba7ff807,
-	0x00000000, 0xb8faf812,
-	0xb8fbf813, 0x8efa887a,
-	0xbf0d8f7b, 0xbf840002,
-	0x877bff7b, 0xffff0000,
-	0xc0031c3d, 0x00000010,
-	0xc0071bbd, 0x00000000,
-	0xc0071ebd, 0x00000008,
-	0xbf8cc07f, 0x8671ff6d,
-	0x01000000, 0xbf840004,
-	0x92f1ff70, 0x00010001,
-	0xbf840016, 0xbf820005,
-	0x86708170, 0x8e709770,
-	0x8977ff77, 0x00800000,
-	0x87777077, 0x86ee6e6e,
-	0xbf840001, 0xbe801d6e,
-	0x866eff6d, 0x01ff0000,
-	0xbf850005, 0x8778ff78,
-	0x00002000, 0x80ec886c,
-	0x82ed806d, 0xbf820005,
-	0x866eff6d, 0x01000000,
-	0xbf850002, 0x806c846c,
-	0x826d806d, 0x866dff6d,
-	0x0000ffff, 0x8f7a8b77,
+	0xbf85001a, 0x866eff6d,
+	0x01ff0000, 0xbf06ff6e,
+	0x01040000, 0xbf850015,
+	0x866eff7b, 0x000071ff,
+	0xbf840008, 0x866fff7b,
+	0x00007080, 0xbf840001,
+	0xbeee1a87, 0xb8eff801,
+	0x8e6e8c6e, 0x866e6f6e,
+	0xbf85000a, 0x866eff6d,
+	0x00ff0000, 0xbf850007,
+	0xb8eef801, 0x866eff6e,
+	0x00000800, 0xbf850003,
+	0x866eff7b, 0x00000400,
+	0xbf850040, 0xb8faf807,
 	0x867aff7a, 0x001f8000,
-	0xb97af807, 0x86fe7e7e,
-	0x86ea6a6a, 0x8f6e8378,
-	0xb96ee0c2, 0xbf800002,
-	0xb9780002, 0xbe801f6c,
+	0x8e7a8b7a, 0x8977ff77,
+	0xfc000000, 0x87777a77,
+	0xba7ff807, 0x00000000,
+	0xb8faf812, 0xb8fbf813,
+	0x8efa887a, 0xbf0d8f7b,
+	0xbf840002, 0x877bff7b,
+	0xffff0000, 0xc0031c3d,
+	0x00000010, 0xc0071bbd,
+	0x00000000, 0xc0071ebd,
+	0x00000008, 0xbf8cc07f,
+	0x8671ff6d, 0x01000000,
+	0xbf840004, 0x92f1ff70,
+	0x00010001, 0xbf840016,
+	0xbf820005, 0x86708170,
+	0x8e709770, 0x8977ff77,
+	0x00800000, 0x87777077,
+	0x86ee6e6e, 0xbf840001,
+	0xbe801d6e, 0x866eff6d,
+	0x01ff0000, 0xbf850005,
+	0x8778ff78, 0x00002000,
+	0x80ec886c, 0x82ed806d,
+	0xbf820005, 0x866eff6d,
+	0x01000000, 0xbf850002,
+	0x806c846c, 0x826d806d,
 	0x866dff6d, 0x0000ffff,
-	0xbefa0080, 0xb97a0283,
-	0xb8faf807, 0x867aff7a,
-	0x001f8000, 0x8e7a8b7a,
-	0x8977ff77, 0xfc000000,
-	0x87777a77, 0xba7ff807,
-	0x00000000, 0xbeee007e,
-	0xbeef007f, 0xbefe0180,
-	0xbf900004, 0x877a8478,
-	0xb97af802, 0xbf8e0002,
-	0xbf88fffe, 0xb8fa2a05,
-	0x807a817a, 0x8e7a8a7a,
-	0xb8fb1605, 0x807b817b,
-	0x8e7b867b, 0x807a7b7a,
-	0x807a7e7a, 0x827b807f,
-	0x867bff7b, 0x0000ffff,
-	0xc04b1c3d, 0x00000050,
-	0xbf8cc07f, 0xc04b1d3d,
-	0x00000060, 0xbf8cc07f,
-	0xc0431e7d, 0x00000074,
-	0xbf8cc07f, 0xbef4007e,
-	0x8675ff7f, 0x0000ffff,
-	0x8775ff75, 0x00040000,
-	0xbef60080, 0xbef700ff,
-	0x00807fac, 0xbef1007c,
-	0xbef00080, 0xb8f02a05,
-	0x80708170, 0x8e708a70,
-	0xb8fa1605, 0x807a817a,
-	0x8e7a867a, 0x80707a70,
-	0xbef60084, 0xbef600ff,
-	0x01000000, 0xbefe007c,
-	0xbefc0070, 0xc0611c7a,
-	0x0000007c, 0xbf8cc07f,
-	0x80708470, 0xbefc007e,
+	0x8f7a8b77, 0x867aff7a,
+	0x001f8000, 0xb97af807,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e8378, 0xb96ee0c2,
+	0xbf800002, 0xb9780002,
+	0xbe801f6c, 0x866dff6d,
+	0x0000ffff, 0xbefa0080,
+	0xb97a0283, 0xb8faf807,
+	0x867aff7a, 0x001f8000,
+	0x8e7a8b7a, 0x8977ff77,
+	0xfc000000, 0x87777a77,
+	0xba7ff807, 0x00000000,
+	0xbeee007e, 0xbeef007f,
+	0xbefe0180, 0xbf900004,
+	0x877a8478, 0xb97af802,
+	0xbf8e0002, 0xbf88fffe,
+	0xb8fa2a05, 0x807a817a,
+	0x8e7a8a7a, 0xb8fb1605,
+	0x807b817b, 0x8e7b867b,
+	0x807a7b7a, 0x807a7e7a,
+	0x827b807f, 0x867bff7b,
+	0x0000ffff, 0xc04b1c3d,
+	0x00000050, 0xbf8cc07f,
+	0xc04b1d3d, 0x00000060,
+	0xbf8cc07f, 0xc0431e7d,
+	0x00000074, 0xbf8cc07f,
+	0xbef4007e, 0x8675ff7f,
+	0x0000ffff, 0x8775ff75,
+	0x00040000, 0xbef60080,
+	0xbef700ff, 0x00807fac,
+	0xbef1007c, 0xbef00080,
+	0xb8f02a05, 0x80708170,
+	0x8e708a70, 0xb8fa1605,
+	0x807a817a, 0x8e7a867a,
+	0x80707a70, 0xbef60084,
+	0xbef600ff, 0x01000000,
 	0xbefe007c, 0xbefc0070,
-	0xc0611b3a, 0x0000007c,
+	0xc0611c7a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611b7a,
+	0xbefc0070, 0xc0611b3a,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611bba, 0x0000007c,
+	0xc0611b7a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611bfa,
+	0xbefc0070, 0xc0611bba,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611e3a, 0x0000007c,
-	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0xb8fbf803,
-	0xbefe007c, 0xbefc0070,
-	0xc0611efa, 0x0000007c,
+	0xc0611bfa, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611a3a,
+	0xbefc0070, 0xc0611e3a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8fbf803, 0xbefe007c,
+	0xbefc0070, 0xc0611efa,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611a7a, 0x0000007c,
-	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0xb8f1f801,
-	0xbefe007c, 0xbefc0070,
-	0xc0611c7a, 0x0000007c,
+	0xc0611a3a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0x867aff7f,
-	0x04000000, 0xbeef0080,
-	0x876f6f7a, 0xb8f02a05,
+	0xbefc007e, 0xbefe007c,
+	0xbefc0070, 0xc0611a7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8f1f801, 0xbefe007c,
+	0xbefc0070, 0xc0611c7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0x867aff7f, 0x04000000,
+	0xbeef0080, 0x876f6f7a,
+	0xb8f02a05, 0x80708170,
+	0x8e708a70, 0xb8fb1605,
+	0x807b817b, 0x8e7b847b,
+	0x8e76827b, 0xbef600ff,
+	0x01000000, 0xbef20174,
+	0x80747074, 0x82758075,
+	0xbefc0080, 0xbf800000,
+	0xbe802b00, 0xbe822b02,
+	0xbe842b04, 0xbe862b06,
+	0xbe882b08, 0xbe8a2b0a,
+	0xbe8c2b0c, 0xbe8e2b0e,
+	0xc06b003a, 0x00000000,
+	0xbf8cc07f, 0xc06b013a,
+	0x00000010, 0xbf8cc07f,
+	0xc06b023a, 0x00000020,
+	0xbf8cc07f, 0xc06b033a,
+	0x00000030, 0xbf8cc07f,
+	0x8074c074, 0x82758075,
+	0x807c907c, 0xbf0a7b7c,
+	0xbf85ffe7, 0xbef40172,
+	0xbef00080, 0xbefe00c1,
+	0xbeff00c1, 0xbee80080,
+	0xbee90080, 0xbef600ff,
+	0x01000000, 0x867aff78,
+	0x00400000, 0xbf850003,
+	0xb8faf803, 0x897a7aff,
+	0x10000000, 0xbf85004d,
+	0xbe840080, 0xd2890000,
+	0x00000900, 0x80048104,
+	0xd2890001, 0x00000900,
+	0x80048104, 0xd2890002,
+	0x00000900, 0x80048104,
+	0xd2890003, 0x00000900,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000901,
+	0x80048104, 0xd2890001,
+	0x00000901, 0x80048104,
+	0xd2890002, 0x00000901,
+	0x80048104, 0xd2890003,
+	0x00000901, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000902, 0x80048104,
+	0xd2890001, 0x00000902,
+	0x80048104, 0xd2890002,
+	0x00000902, 0x80048104,
+	0xd2890003, 0x00000902,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000903,
+	0x80048104, 0xd2890001,
+	0x00000903, 0x80048104,
+	0xd2890002, 0x00000903,
+	0x80048104, 0xd2890003,
+	0x00000903, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbf820008, 0xe0724000,
+	0x701d0000, 0xe0724100,
+	0x701d0100, 0xe0724200,
+	0x701d0200, 0xe0724300,
+	0x701d0300, 0xbefe00c1,
+	0xbeff00c1, 0xb8fb4306,
+	0x867bc17b, 0xbf840063,
+	0xbf8a0000, 0x867aff6f,
+	0x04000000, 0xbf84005f,
+	0x8e7b867b, 0x8e7b827b,
+	0xbef6007b, 0xb8f02a05,
 	0x80708170, 0x8e708a70,
-	0xb8fb1605, 0x807b817b,
-	0x8e7b847b, 0x8e76827b,
+	0xb8fa1605, 0x807a817a,
+	0x8e7a867a, 0x80707a70,
+	0x8070ff70, 0x00000080,
 	0xbef600ff, 0x01000000,
-	0xbef20174, 0x80747074,
-	0x82758075, 0xbefc0080,
-	0xbf800000, 0xbe802b00,
-	0xbe822b02, 0xbe842b04,
-	0xbe862b06, 0xbe882b08,
-	0xbe8a2b0a, 0xbe8c2b0c,
-	0xbe8e2b0e, 0xc06b003a,
-	0x00000000, 0xbf8cc07f,
-	0xc06b013a, 0x00000010,
-	0xbf8cc07f, 0xc06b023a,
-	0x00000020, 0xbf8cc07f,
-	0xc06b033a, 0x00000030,
-	0xbf8cc07f, 0x8074c074,
-	0x82758075, 0x807c907c,
-	0xbf0a7b7c, 0xbf85ffe7,
-	0xbef40172, 0xbef00080,
-	0xbefe00c1, 0xbeff00c1,
-	0xbee80080, 0xbee90080,
+	0xbefc0080, 0xd28c0002,
+	0x000100c1, 0xd28d0003,
+	0x000204c1, 0x867aff78,
+	0x00400000, 0xbf850003,
+	0xb8faf803, 0x897a7aff,
+	0x10000000, 0xbf850030,
+	0x24040682, 0xd86e4000,
+	0x00000002, 0xbf8cc07f,
+	0xbe840080, 0xd2890000,
+	0x00000900, 0x80048104,
+	0xd2890001, 0x00000900,
+	0x80048104, 0xd2890002,
+	0x00000900, 0x80048104,
+	0xd2890003, 0x00000900,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000901,
+	0x80048104, 0xd2890001,
+	0x00000901, 0x80048104,
+	0xd2890002, 0x00000901,
+	0x80048104, 0xd2890003,
+	0x00000901, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0x680404ff, 0x00000200,
+	0xd0c9006a, 0x0000f702,
+	0xbf87ffd2, 0xbf820015,
+	0xd1060002, 0x00011103,
+	0x7e0602ff, 0x00000200,
+	0xbefc00ff, 0x00010000,
+	0xbe800077, 0x8677ff77,
+	0xff7fffff, 0x8777ff77,
+	0x00058000, 0xd8ec0000,
+	0x00000002, 0xbf8cc07f,
+	0xe0765000, 0x701d0002,
+	0x68040702, 0xd0c9006a,
+	0x0000f702, 0xbf87fff7,
+	0xbef70000, 0xbef000ff,
+	0x00000400, 0xbefe00c1,
+	0xbeff00c1, 0xb8fb2a05,
+	0x807b817b, 0x8e7b827b,
 	0xbef600ff, 0x01000000,
+	0xbefc0084, 0xbf0a7b7c,
+	0xbf84006d, 0xbf11017c,
+	0x807bff7b, 0x00001000,
 	0x867aff78, 0x00400000,
 	0xbf850003, 0xb8faf803,
 	0x897a7aff, 0x10000000,
-	0xbf85004d, 0xbe840080,
+	0xbf850051, 0xbe840080,
 	0xd2890000, 0x00000900,
 	0x80048104, 0xd2890001,
 	0x00000900, 0x80048104,
@@ -460,224 +568,119 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
 	0x80048104, 0xc069003a,
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbf820008,
+	0xbf84ffee, 0x807c847c,
+	0xbf0a7b7c, 0xbf85ffb1,
+	0xbf9c0000, 0xbf820012,
+	0x7e000300, 0x7e020301,
+	0x7e040302, 0x7e060303,
 	0xe0724000, 0x701d0000,
 	0xe0724100, 0x701d0100,
 	0xe0724200, 0x701d0200,
 	0xe0724300, 0x701d0300,
+	0x807c847c, 0x8070ff70,
+	0x00000400, 0xbf0a7b7c,
+	0xbf85ffef, 0xbf9c0000,
+	0xbf8200c7, 0xbef4007e,
+	0x8675ff7f, 0x0000ffff,
+	0x8775ff75, 0x00040000,
+	0xbef60080, 0xbef700ff,
+	0x00807fac, 0x866eff7f,
+	0x04000000, 0xbf84001e,
 	0xbefe00c1, 0xbeff00c1,
-	0xb8fb4306, 0x867bc17b,
-	0xbf840063, 0xbf8a0000,
-	0x867aff6f, 0x04000000,
-	0xbf84005f, 0x8e7b867b,
-	0x8e7b827b, 0xbef6007b,
-	0xb8f02a05, 0x80708170,
-	0x8e708a70, 0xb8fa1605,
-	0x807a817a, 0x8e7a867a,
-	0x80707a70, 0x8070ff70,
+	0xb8ef4306, 0x866fc16f,
+	0xbf840019, 0x8e6f866f,
+	0x8e6f826f, 0xbef6006f,
+	0xb8f82a05, 0x80788178,
+	0x8e788a78, 0xb8ee1605,
+	0x806e816e, 0x8e6e866e,
+	0x80786e78, 0x8078ff78,
 	0x00000080, 0xbef600ff,
 	0x01000000, 0xbefc0080,
-	0xd28c0002, 0x000100c1,
-	0xd28d0003, 0x000204c1,
-	0x867aff78, 0x00400000,
-	0xbf850003, 0xb8faf803,
-	0x897a7aff, 0x10000000,
-	0xbf850030, 0x24040682,
-	0xd86e4000, 0x00000002,
-	0xbf8cc07f, 0xbe840080,
-	0xd2890000, 0x00000900,
-	0x80048104, 0xd2890001,
-	0x00000900, 0x80048104,
-	0xd2890002, 0x00000900,
-	0x80048104, 0xd2890003,
-	0x00000900, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbe840080, 0xd2890000,
-	0x00000901, 0x80048104,
-	0xd2890001, 0x00000901,
-	0x80048104, 0xd2890002,
-	0x00000901, 0x80048104,
-	0xd2890003, 0x00000901,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0x680404ff,
-	0x00000200, 0xd0c9006a,
-	0x0000f702, 0xbf87ffd2,
-	0xbf820015, 0xd1060002,
-	0x00011103, 0x7e0602ff,
-	0x00000200, 0xbefc00ff,
-	0x00010000, 0xbe800077,
-	0x8677ff77, 0xff7fffff,
-	0x8777ff77, 0x00058000,
-	0xd8ec0000, 0x00000002,
-	0xbf8cc07f, 0xe0765000,
-	0x701d0002, 0x68040702,
-	0xd0c9006a, 0x0000f702,
-	0xbf87fff7, 0xbef70000,
-	0xbef000ff, 0x00000400,
+	0xe0510000, 0x781d0000,
+	0xe0510100, 0x781d0000,
+	0x807cff7c, 0x00000200,
+	0x8078ff78, 0x00000200,
+	0xbf0a6f7c, 0xbf85fff6,
 	0xbefe00c1, 0xbeff00c1,
-	0xb8fb2a05, 0x807b817b,
-	0x8e7b827b, 0xbef600ff,
-	0x01000000, 0xbefc0084,
-	0xbf0a7b7c, 0xbf84006d,
-	0xbf11017c, 0x807bff7b,
-	0x00001000, 0x867aff78,
-	0x00400000, 0xbf850003,
-	0xb8faf803, 0x897a7aff,
-	0x10000000, 0xbf850051,
-	0xbe840080, 0xd2890000,
-	0x00000900, 0x80048104,
-	0xd2890001, 0x00000900,
-	0x80048104, 0xd2890002,
-	0x00000900, 0x80048104,
-	0xd2890003, 0x00000900,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000901,
-	0x80048104, 0xd2890001,
-	0x00000901, 0x80048104,
-	0xd2890002, 0x00000901,
-	0x80048104, 0xd2890003,
-	0x00000901, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbe840080, 0xd2890000,
-	0x00000902, 0x80048104,
-	0xd2890001, 0x00000902,
-	0x80048104, 0xd2890002,
-	0x00000902, 0x80048104,
-	0xd2890003, 0x00000902,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000903,
-	0x80048104, 0xd2890001,
-	0x00000903, 0x80048104,
-	0xd2890002, 0x00000903,
-	0x80048104, 0xd2890003,
-	0x00000903, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0x807c847c, 0xbf0a7b7c,
-	0xbf85ffb1, 0xbf9c0000,
-	0xbf820012, 0x7e000300,
-	0x7e020301, 0x7e040302,
-	0x7e060303, 0xe0724000,
-	0x701d0000, 0xe0724100,
-	0x701d0100, 0xe0724200,
-	0x701d0200, 0xe0724300,
-	0x701d0300, 0x807c847c,
-	0x8070ff70, 0x00000400,
-	0xbf0a7b7c, 0xbf85ffef,
-	0xbf9c0000, 0xbf8200c7,
-	0xbef4007e, 0x8675ff7f,
-	0x0000ffff, 0x8775ff75,
-	0x00040000, 0xbef60080,
-	0xbef700ff, 0x00807fac,
-	0x866eff7f, 0x04000000,
-	0xbf84001e, 0xbefe00c1,
-	0xbeff00c1, 0xb8ef4306,
-	0x866fc16f, 0xbf840019,
-	0x8e6f866f, 0x8e6f826f,
-	0xbef6006f, 0xb8f82a05,
+	0xbef600ff, 0x01000000,
+	0xb8ef2a05, 0x806f816f,
+	0x8e6f826f, 0x806fff6f,
+	0x00008000, 0xbef80080,
+	0xbeee0078, 0x8078ff78,
+	0x00000400, 0xbefc0084,
+	0xbf11087c, 0xe0524000,
+	0x781d0000, 0xe0524100,
+	0x781d0100, 0xe0524200,
+	0x781d0200, 0xe0524300,
+	0x781d0300, 0xbf8c0f70,
+	0x7e000300, 0x7e020301,
+	0x7e040302, 0x7e060303,
+	0x807c847c, 0x8078ff78,
+	0x00000400, 0xbf0a6f7c,
+	0xbf85ffee, 0xbf9c0000,
+	0xe0524000, 0x6e1d0000,
+	0xe0524100, 0x6e1d0100,
+	0xe0524200, 0x6e1d0200,
+	0xe0524300, 0x6e1d0300,
+	0xbf8c0f70, 0xb8f82a05,
 	0x80788178, 0x8e788a78,
 	0xb8ee1605, 0x806e816e,
 	0x8e6e866e, 0x80786e78,
-	0x8078ff78, 0x00000080,
-	0xbef600ff, 0x01000000,
-	0xbefc0080, 0xe0510000,
-	0x781d0000, 0xe0510100,
-	0x781d0000, 0x807cff7c,
-	0x00000200, 0x8078ff78,
-	0x00000200, 0xbf0a6f7c,
-	0xbf85fff6, 0xbefe00c1,
-	0xbeff00c1, 0xbef600ff,
-	0x01000000, 0xb8ef2a05,
-	0x806f816f, 0x8e6f826f,
-	0x806fff6f, 0x00008000,
-	0xbef80080, 0xbeee0078,
-	0x8078ff78, 0x00000400,
-	0xbefc0084, 0xbf11087c,
-	0xe0524000, 0x781d0000,
-	0xe0524100, 0x781d0100,
-	0xe0524200, 0x781d0200,
-	0xe0524300, 0x781d0300,
-	0xbf8c0f70, 0x7e000300,
-	0x7e020301, 0x7e040302,
-	0x7e060303, 0x807c847c,
-	0x8078ff78, 0x00000400,
-	0xbf0a6f7c, 0xbf85ffee,
-	0xbf9c0000, 0xe0524000,
-	0x6e1d0000, 0xe0524100,
-	0x6e1d0100, 0xe0524200,
-	0x6e1d0200, 0xe0524300,
-	0x6e1d0300, 0xbf8c0f70,
+	0x80f8c078, 0xb8ef1605,
+	0x806f816f, 0x8e6f846f,
+	0x8e76826f, 0xbef600ff,
+	0x01000000, 0xbefc006f,
+	0xc031003a, 0x00000078,
+	0x80f8c078, 0xbf8cc07f,
+	0x80fc907c, 0xbf800000,
+	0xbe802d00, 0xbe822d02,
+	0xbe842d04, 0xbe862d06,
+	0xbe882d08, 0xbe8a2d0a,
+	0xbe8c2d0c, 0xbe8e2d0e,
+	0xbf06807c, 0xbf84fff0,
 	0xb8f82a05, 0x80788178,
 	0x8e788a78, 0xb8ee1605,
 	0x806e816e, 0x8e6e866e,
-	0x80786e78, 0x80f8c078,
-	0xb8ef1605, 0x806f816f,
-	0x8e6f846f, 0x8e76826f,
+	0x80786e78, 0xbef60084,
 	0xbef600ff, 0x01000000,
-	0xbefc006f, 0xc031003a,
-	0x00000078, 0x80f8c078,
-	0xbf8cc07f, 0x80fc907c,
-	0xbf800000, 0xbe802d00,
-	0xbe822d02, 0xbe842d04,
-	0xbe862d06, 0xbe882d08,
-	0xbe8a2d0a, 0xbe8c2d0c,
-	0xbe8e2d0e, 0xbf06807c,
-	0xbf84fff0, 0xb8f82a05,
-	0x80788178, 0x8e788a78,
-	0xb8ee1605, 0x806e816e,
-	0x8e6e866e, 0x80786e78,
-	0xbef60084, 0xbef600ff,
-	0x01000000, 0xc0211bfa,
+	0xc0211bfa, 0x00000078,
+	0x80788478, 0xc0211b3a,
 	0x00000078, 0x80788478,
-	0xc0211b3a, 0x00000078,
-	0x80788478, 0xc0211b7a,
+	0xc0211b7a, 0x00000078,
+	0x80788478, 0xc0211c3a,
 	0x00000078, 0x80788478,
-	0xc0211c3a, 0x00000078,
-	0x80788478, 0xc0211c7a,
+	0xc0211c7a, 0x00000078,
+	0x80788478, 0xc0211eba,
 	0x00000078, 0x80788478,
-	0xc0211eba, 0x00000078,
-	0x80788478, 0xc0211efa,
+	0xc0211efa, 0x00000078,
+	0x80788478, 0xc0211a3a,
 	0x00000078, 0x80788478,
-	0xc0211a3a, 0x00000078,
-	0x80788478, 0xc0211a7a,
+	0xc0211a7a, 0x00000078,
+	0x80788478, 0xc0211cfa,
 	0x00000078, 0x80788478,
-	0xc0211cfa, 0x00000078,
-	0x80788478, 0xbf8cc07f,
-	0xbefc006f, 0xbefe0070,
-	0xbeff0071, 0x866f7bff,
-	0x000003ff, 0xb96f4803,
-	0x866f7bff, 0xfffff800,
-	0x8f6f8b6f, 0xb96fa2c3,
-	0xb973f801, 0xb8ee2a05,
-	0x806e816e, 0x8e6e8a6e,
-	0xb8ef1605, 0x806f816f,
-	0x8e6f866f, 0x806e6f6e,
-	0x806e746e, 0x826f8075,
-	0x866fff6f, 0x0000ffff,
-	0xc00b1c37, 0x00000050,
-	0xc00b1d37, 0x00000060,
-	0xc0031e77, 0x00000074,
-	0xbf8cc07f, 0x8f6e8b77,
-	0x866eff6e, 0x001f8000,
-	0xb96ef807, 0x866dff6d,
-	0x0000ffff, 0x86fe7e7e,
-	0x86ea6a6a, 0x8f6e837a,
-	0xb96ee0c2, 0xbf800002,
-	0xb97a0002, 0xbf8a0000,
-	0xbe801f6c, 0xbf810000,
+	0xbf8cc07f, 0xbefc006f,
+	0xbefe0070, 0xbeff0071,
+	0x866f7bff, 0x000003ff,
+	0xb96f4803, 0x866f7bff,
+	0xfffff800, 0x8f6f8b6f,
+	0xb96fa2c3, 0xb973f801,
+	0xb8ee2a05, 0x806e816e,
+	0x8e6e8a6e, 0xb8ef1605,
+	0x806f816f, 0x8e6f866f,
+	0x806e6f6e, 0x806e746e,
+	0x826f8075, 0x866fff6f,
+	0x0000ffff, 0xc00b1c37,
+	0x00000050, 0xc00b1d37,
+	0x00000060, 0xc0031e77,
+	0x00000074, 0xbf8cc07f,
+	0x8f6e8b77, 0x866eff6e,
+	0x001f8000, 0xb96ef807,
+	0x866dff6d, 0x0000ffff,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e837a, 0xb96ee0c2,
+	0xbf800002, 0xb97a0002,
+	0xbf8a0000, 0xbe801f6c,
+	0xbf810000, 0x00000000,
 };
 
 static const uint32_t cwsr_trap_nv1x_hex[] = {
@@ -1101,219 +1104,159 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 };
 
 static const uint32_t cwsr_trap_arcturus_hex[] = {
-	0xbf820001, 0xbf8202da,
+	0xbf820001, 0xbf8202df,
 	0xb8f8f802, 0x8978ff78,
 	0x00020006, 0xb8fbf803,
 	0x866eff78, 0x00002000,
 	0xbf840009, 0x866eff6d,
-	0x00ff0000, 0xbf85001e,
+	0x00ff0000, 0xbf850023,
 	0x866eff7b, 0x00000400,
-	0xbf85005b, 0xbf8e0010,
+	0xbf850060, 0xbf8e0010,
 	0xb8fbf803, 0xbf82fffa,
 	0x866eff7b, 0x03c00900,
-	0xbf850015, 0x866eff7b,
-	0x000071ff, 0xbf840008,
-	0x866fff7b, 0x00007080,
-	0xbf840001, 0xbeee1a87,
-	0xb8eff801, 0x8e6e8c6e,
-	0x866e6f6e, 0xbf85000a,
-	0x866eff6d, 0x00ff0000,
-	0xbf850007, 0xb8eef801,
-	0x866eff6e, 0x00000800,
-	0xbf850003, 0x866eff7b,
-	0x00000400, 0xbf850040,
-	0xb8faf807, 0x867aff7a,
-	0x001f8000, 0x8e7a8b7a,
-	0x8977ff77, 0xfc000000,
-	0x87777a77, 0xba7ff807,
-	0x00000000, 0xb8faf812,
-	0xb8fbf813, 0x8efa887a,
-	0xbf0d8f7b, 0xbf840002,
-	0x877bff7b, 0xffff0000,
-	0xc0031c3d, 0x00000010,
-	0xc0071bbd, 0x00000000,
-	0xc0071ebd, 0x00000008,
-	0xbf8cc07f, 0x8671ff6d,
-	0x01000000, 0xbf840004,
-	0x92f1ff70, 0x00010001,
-	0xbf840016, 0xbf820005,
-	0x86708170, 0x8e709770,
-	0x8977ff77, 0x00800000,
-	0x87777077, 0x86ee6e6e,
-	0xbf840001, 0xbe801d6e,
-	0x866eff6d, 0x01ff0000,
-	0xbf850005, 0x8778ff78,
-	0x00002000, 0x80ec886c,
-	0x82ed806d, 0xbf820005,
-	0x866eff6d, 0x01000000,
-	0xbf850002, 0x806c846c,
-	0x826d806d, 0x866dff6d,
-	0x0000ffff, 0x8f7a8b77,
+	0xbf85001a, 0x866eff6d,
+	0x01ff0000, 0xbf06ff6e,
+	0x01040000, 0xbf850015,
+	0x866eff7b, 0x000071ff,
+	0xbf840008, 0x866fff7b,
+	0x00007080, 0xbf840001,
+	0xbeee1a87, 0xb8eff801,
+	0x8e6e8c6e, 0x866e6f6e,
+	0xbf85000a, 0x866eff6d,
+	0x00ff0000, 0xbf850007,
+	0xb8eef801, 0x866eff6e,
+	0x00000800, 0xbf850003,
+	0x866eff7b, 0x00000400,
+	0xbf850040, 0xb8faf807,
 	0x867aff7a, 0x001f8000,
-	0xb97af807, 0x86fe7e7e,
-	0x86ea6a6a, 0x8f6e8378,
-	0xb96ee0c2, 0xbf800002,
-	0xb9780002, 0xbe801f6c,
+	0x8e7a8b7a, 0x8977ff77,
+	0xfc000000, 0x87777a77,
+	0xba7ff807, 0x00000000,
+	0xb8faf812, 0xb8fbf813,
+	0x8efa887a, 0xbf0d8f7b,
+	0xbf840002, 0x877bff7b,
+	0xffff0000, 0xc0031c3d,
+	0x00000010, 0xc0071bbd,
+	0x00000000, 0xc0071ebd,
+	0x00000008, 0xbf8cc07f,
+	0x8671ff6d, 0x01000000,
+	0xbf840004, 0x92f1ff70,
+	0x00010001, 0xbf840016,
+	0xbf820005, 0x86708170,
+	0x8e709770, 0x8977ff77,
+	0x00800000, 0x87777077,
+	0x86ee6e6e, 0xbf840001,
+	0xbe801d6e, 0x866eff6d,
+	0x01ff0000, 0xbf850005,
+	0x8778ff78, 0x00002000,
+	0x80ec886c, 0x82ed806d,
+	0xbf820005, 0x866eff6d,
+	0x01000000, 0xbf850002,
+	0x806c846c, 0x826d806d,
 	0x866dff6d, 0x0000ffff,
-	0xbefa0080, 0xb97a0283,
-	0xb8faf807, 0x867aff7a,
-	0x001f8000, 0x8e7a8b7a,
-	0x8977ff77, 0xfc000000,
-	0x87777a77, 0xba7ff807,
-	0x00000000, 0xbeee007e,
-	0xbeef007f, 0xbefe0180,
-	0xbf900004, 0x877a8478,
-	0xb97af802, 0xbf8e0002,
-	0xbf88fffe, 0xb8fa2a05,
-	0x807a817a, 0x8e7a8a7a,
-	0x8e7a817a, 0xb8fb1605,
-	0x807b817b, 0x8e7b867b,
-	0x807a7b7a, 0x807a7e7a,
-	0x827b807f, 0x867bff7b,
-	0x0000ffff, 0xc04b1c3d,
-	0x00000050, 0xbf8cc07f,
-	0xc04b1d3d, 0x00000060,
-	0xbf8cc07f, 0xc0431e7d,
-	0x00000074, 0xbf8cc07f,
-	0xbef4007e, 0x8675ff7f,
-	0x0000ffff, 0x8775ff75,
-	0x00040000, 0xbef60080,
-	0xbef700ff, 0x00807fac,
-	0xbef1007c, 0xbef00080,
-	0xb8f02a05, 0x80708170,
-	0x8e708a70, 0x8e708170,
-	0xb8fa1605, 0x807a817a,
-	0x8e7a867a, 0x80707a70,
-	0xbef60084, 0xbef600ff,
-	0x01000000, 0xbefe007c,
-	0xbefc0070, 0xc0611c7a,
-	0x0000007c, 0xbf8cc07f,
-	0x80708470, 0xbefc007e,
+	0x8f7a8b77, 0x867aff7a,
+	0x001f8000, 0xb97af807,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e8378, 0xb96ee0c2,
+	0xbf800002, 0xb9780002,
+	0xbe801f6c, 0x866dff6d,
+	0x0000ffff, 0xbefa0080,
+	0xb97a0283, 0xb8faf807,
+	0x867aff7a, 0x001f8000,
+	0x8e7a8b7a, 0x8977ff77,
+	0xfc000000, 0x87777a77,
+	0xba7ff807, 0x00000000,
+	0xbeee007e, 0xbeef007f,
+	0xbefe0180, 0xbf900004,
+	0x877a8478, 0xb97af802,
+	0xbf8e0002, 0xbf88fffe,
+	0xb8fa2a05, 0x807a817a,
+	0x8e7a8a7a, 0x8e7a817a,
+	0xb8fb1605, 0x807b817b,
+	0x8e7b867b, 0x807a7b7a,
+	0x807a7e7a, 0x827b807f,
+	0x867bff7b, 0x0000ffff,
+	0xc04b1c3d, 0x00000050,
+	0xbf8cc07f, 0xc04b1d3d,
+	0x00000060, 0xbf8cc07f,
+	0xc0431e7d, 0x00000074,
+	0xbf8cc07f, 0xbef4007e,
+	0x8675ff7f, 0x0000ffff,
+	0x8775ff75, 0x00040000,
+	0xbef60080, 0xbef700ff,
+	0x00807fac, 0xbef1007c,
+	0xbef00080, 0xb8f02a05,
+	0x80708170, 0x8e708a70,
+	0x8e708170, 0xb8fa1605,
+	0x807a817a, 0x8e7a867a,
+	0x80707a70, 0xbef60084,
+	0xbef600ff, 0x01000000,
 	0xbefe007c, 0xbefc0070,
-	0xc0611b3a, 0x0000007c,
+	0xc0611c7a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611b7a,
+	0xbefc0070, 0xc0611b3a,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611bba, 0x0000007c,
+	0xc0611b7a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611bfa,
+	0xbefc0070, 0xc0611bba,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611e3a, 0x0000007c,
-	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0xb8fbf803,
-	0xbefe007c, 0xbefc0070,
-	0xc0611efa, 0x0000007c,
+	0xc0611bfa, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611a3a,
+	0xbefc0070, 0xc0611e3a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8fbf803, 0xbefe007c,
+	0xbefc0070, 0xc0611efa,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611a7a, 0x0000007c,
-	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0xb8f1f801,
-	0xbefe007c, 0xbefc0070,
-	0xc0611c7a, 0x0000007c,
+	0xc0611a3a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0x867aff7f,
-	0x04000000, 0xbeef0080,
-	0x876f6f7a, 0xb8f02a05,
-	0x80708170, 0x8e708a70,
-	0x8e708170, 0xb8fb1605,
-	0x807b817b, 0x8e7b847b,
-	0x8e76827b, 0xbef600ff,
-	0x01000000, 0xbef20174,
-	0x80747074, 0x82758075,
-	0xbefc0080, 0xbf800000,
-	0xbe802b00, 0xbe822b02,
-	0xbe842b04, 0xbe862b06,
-	0xbe882b08, 0xbe8a2b0a,
-	0xbe8c2b0c, 0xbe8e2b0e,
-	0xc06b003a, 0x00000000,
-	0xbf8cc07f, 0xc06b013a,
-	0x00000010, 0xbf8cc07f,
-	0xc06b023a, 0x00000020,
-	0xbf8cc07f, 0xc06b033a,
-	0x00000030, 0xbf8cc07f,
-	0x8074c074, 0x82758075,
-	0x807c907c, 0xbf0a7b7c,
-	0xbf85ffe7, 0xbef40172,
-	0xbef00080, 0xbefe00c1,
-	0xbeff00c1, 0xbee80080,
-	0xbee90080, 0xbef600ff,
-	0x01000000, 0x867aff78,
-	0x00400000, 0xbf850003,
-	0xb8faf803, 0x897a7aff,
-	0x10000000, 0xbf85004d,
-	0xbe840080, 0xd2890000,
-	0x00000900, 0x80048104,
-	0xd2890001, 0x00000900,
-	0x80048104, 0xd2890002,
-	0x00000900, 0x80048104,
-	0xd2890003, 0x00000900,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000901,
-	0x80048104, 0xd2890001,
-	0x00000901, 0x80048104,
-	0xd2890002, 0x00000901,
-	0x80048104, 0xd2890003,
-	0x00000901, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbe840080, 0xd2890000,
-	0x00000902, 0x80048104,
-	0xd2890001, 0x00000902,
-	0x80048104, 0xd2890002,
-	0x00000902, 0x80048104,
-	0xd2890003, 0x00000902,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000903,
-	0x80048104, 0xd2890001,
-	0x00000903, 0x80048104,
-	0xd2890002, 0x00000903,
-	0x80048104, 0xd2890003,
-	0x00000903, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbf820008, 0xe0724000,
-	0x701d0000, 0xe0724100,
-	0x701d0100, 0xe0724200,
-	0x701d0200, 0xe0724300,
-	0x701d0300, 0xbefe00c1,
-	0xbeff00c1, 0xb8fb4306,
-	0x867bc17b, 0xbf840064,
-	0xbf8a0000, 0x867aff6f,
-	0x04000000, 0xbf840060,
-	0x8e7b867b, 0x8e7b827b,
-	0xbef6007b, 0xb8f02a05,
-	0x80708170, 0x8e708a70,
-	0x8e708170, 0xb8fa1605,
-	0x807a817a, 0x8e7a867a,
-	0x80707a70, 0x8070ff70,
-	0x00000080, 0xbef600ff,
-	0x01000000, 0xbefc0080,
-	0xd28c0002, 0x000100c1,
-	0xd28d0003, 0x000204c1,
+	0xbefc007e, 0xbefe007c,
+	0xbefc0070, 0xc0611a7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8f1f801, 0xbefe007c,
+	0xbefc0070, 0xc0611c7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0x867aff7f, 0x04000000,
+	0xbeef0080, 0x876f6f7a,
+	0xb8f02a05, 0x80708170,
+	0x8e708a70, 0x8e708170,
+	0xb8fb1605, 0x807b817b,
+	0x8e7b847b, 0x8e76827b,
+	0xbef600ff, 0x01000000,
+	0xbef20174, 0x80747074,
+	0x82758075, 0xbefc0080,
+	0xbf800000, 0xbe802b00,
+	0xbe822b02, 0xbe842b04,
+	0xbe862b06, 0xbe882b08,
+	0xbe8a2b0a, 0xbe8c2b0c,
+	0xbe8e2b0e, 0xc06b003a,
+	0x00000000, 0xbf8cc07f,
+	0xc06b013a, 0x00000010,
+	0xbf8cc07f, 0xc06b023a,
+	0x00000020, 0xbf8cc07f,
+	0xc06b033a, 0x00000030,
+	0xbf8cc07f, 0x8074c074,
+	0x82758075, 0x807c907c,
+	0xbf0a7b7c, 0xbf85ffe7,
+	0xbef40172, 0xbef00080,
+	0xbefe00c1, 0xbeff00c1,
+	0xbee80080, 0xbee90080,
+	0xbef600ff, 0x01000000,
 	0x867aff78, 0x00400000,
 	0xbf850003, 0xb8faf803,
 	0x897a7aff, 0x10000000,
-	0xbf850030, 0x24040682,
-	0xd86e4000, 0x00000002,
-	0xbf8cc07f, 0xbe840080,
+	0xbf85004d, 0xbe840080,
 	0xd2890000, 0x00000900,
 	0x80048104, 0xd2890001,
 	0x00000900, 0x80048104,
@@ -1332,31 +1275,50 @@ static const uint32_t cwsr_trap_arcturus_hex[] = {
 	0x80048104, 0xc069003a,
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0x680404ff,
-	0x00000200, 0xd0c9006a,
-	0x0000f702, 0xbf87ffd2,
-	0xbf820015, 0xd1060002,
-	0x00011103, 0x7e0602ff,
-	0x00000200, 0xbefc00ff,
-	0x00010000, 0xbe800077,
-	0x8677ff77, 0xff7fffff,
-	0x8777ff77, 0x00058000,
-	0xd8ec0000, 0x00000002,
-	0xbf8cc07f, 0xe0765000,
-	0x701d0002, 0x68040702,
-	0xd0c9006a, 0x0000f702,
-	0xbf87fff7, 0xbef70000,
-	0xbef000ff, 0x00000400,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000902,
+	0x80048104, 0xd2890001,
+	0x00000902, 0x80048104,
+	0xd2890002, 0x00000902,
+	0x80048104, 0xd2890003,
+	0x00000902, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000903, 0x80048104,
+	0xd2890001, 0x00000903,
+	0x80048104, 0xd2890002,
+	0x00000903, 0x80048104,
+	0xd2890003, 0x00000903,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbf820008,
+	0xe0724000, 0x701d0000,
+	0xe0724100, 0x701d0100,
+	0xe0724200, 0x701d0200,
+	0xe0724300, 0x701d0300,
 	0xbefe00c1, 0xbeff00c1,
-	0xb8fb2a05, 0x807b817b,
-	0x8e7b827b, 0xbef600ff,
-	0x01000000, 0xbefc0084,
-	0xbf0a7b7c, 0xbf84006d,
-	0xbf11017c, 0x807bff7b,
-	0x00001000, 0x867aff78,
+	0xb8fb4306, 0x867bc17b,
+	0xbf840064, 0xbf8a0000,
+	0x867aff6f, 0x04000000,
+	0xbf840060, 0x8e7b867b,
+	0x8e7b827b, 0xbef6007b,
+	0xb8f02a05, 0x80708170,
+	0x8e708a70, 0x8e708170,
+	0xb8fa1605, 0x807a817a,
+	0x8e7a867a, 0x80707a70,
+	0x8070ff70, 0x00000080,
+	0xbef600ff, 0x01000000,
+	0xbefc0080, 0xd28c0002,
+	0x000100c1, 0xd28d0003,
+	0x000204c1, 0x867aff78,
 	0x00400000, 0xbf850003,
 	0xb8faf803, 0x897a7aff,
-	0x10000000, 0xbf850051,
+	0x10000000, 0xbf850030,
+	0x24040682, 0xd86e4000,
+	0x00000002, 0xbf8cc07f,
 	0xbe840080, 0xd2890000,
 	0x00000900, 0x80048104,
 	0xd2890001, 0x00000900,
@@ -1376,427 +1338,411 @@ static const uint32_t cwsr_trap_arcturus_hex[] = {
 	0xc069003a, 0x00000070,
 	0xbf8cc07f, 0x80709070,
 	0xbf06c004, 0xbf84ffee,
-	0xbe840080, 0xd2890000,
-	0x00000902, 0x80048104,
-	0xd2890001, 0x00000902,
-	0x80048104, 0xd2890002,
-	0x00000902, 0x80048104,
-	0xd2890003, 0x00000902,
-	0x80048104, 0xc069003a,
+	0x680404ff, 0x00000200,
+	0xd0c9006a, 0x0000f702,
+	0xbf87ffd2, 0xbf820015,
+	0xd1060002, 0x00011103,
+	0x7e0602ff, 0x00000200,
+	0xbefc00ff, 0x00010000,
+	0xbe800077, 0x8677ff77,
+	0xff7fffff, 0x8777ff77,
+	0x00058000, 0xd8ec0000,
+	0x00000002, 0xbf8cc07f,
+	0xe0765000, 0x701d0002,
+	0x68040702, 0xd0c9006a,
+	0x0000f702, 0xbf87fff7,
+	0xbef70000, 0xbef000ff,
+	0x00000400, 0xbefe00c1,
+	0xbeff00c1, 0xb8fb2a05,
+	0x807b817b, 0x8e7b827b,
+	0xbef600ff, 0x01000000,
+	0xbefc0084, 0xbf0a7b7c,
+	0xbf84006d, 0xbf11017c,
+	0x807bff7b, 0x00001000,
+	0x867aff78, 0x00400000,
+	0xbf850003, 0xb8faf803,
+	0x897a7aff, 0x10000000,
+	0xbf850051, 0xbe840080,
+	0xd2890000, 0x00000900,
+	0x80048104, 0xd2890001,
+	0x00000900, 0x80048104,
+	0xd2890002, 0x00000900,
+	0x80048104, 0xd2890003,
+	0x00000900, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000901, 0x80048104,
+	0xd2890001, 0x00000901,
+	0x80048104, 0xd2890002,
+	0x00000901, 0x80048104,
+	0xd2890003, 0x00000901,
+	0x80048104, 0xc069003a,
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
 	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000903,
+	0xd2890000, 0x00000902,
 	0x80048104, 0xd2890001,
-	0x00000903, 0x80048104,
-	0xd2890002, 0x00000903,
+	0x00000902, 0x80048104,
+	0xd2890002, 0x00000902,
 	0x80048104, 0xd2890003,
-	0x00000903, 0x80048104,
+	0x00000902, 0x80048104,
 	0xc069003a, 0x00000070,
 	0xbf8cc07f, 0x80709070,
 	0xbf06c004, 0xbf84ffee,
-	0x807c847c, 0xbf0a7b7c,
-	0xbf85ffb1, 0xbf9c0000,
-	0xbf820012, 0x7e000300,
-	0x7e020301, 0x7e040302,
-	0x7e060303, 0xe0724000,
-	0x701d0000, 0xe0724100,
-	0x701d0100, 0xe0724200,
-	0x701d0200, 0xe0724300,
-	0x701d0300, 0x807c847c,
-	0x8070ff70, 0x00000400,
-	0xbf0a7b7c, 0xbf85ffef,
-	0xbf9c0000, 0xbefc0080,
-	0xbf11017c, 0x867aff78,
-	0x00400000, 0xbf850003,
-	0xb8faf803, 0x897a7aff,
-	0x10000000, 0xbf850059,
-	0xd3d84000, 0x18000100,
-	0xd3d84001, 0x18000101,
-	0xd3d84002, 0x18000102,
-	0xd3d84003, 0x18000103,
 	0xbe840080, 0xd2890000,
-	0x00000900, 0x80048104,
-	0xd2890001, 0x00000900,
+	0x00000903, 0x80048104,
+	0xd2890001, 0x00000903,
 	0x80048104, 0xd2890002,
-	0x00000900, 0x80048104,
-	0xd2890003, 0x00000900,
+	0x00000903, 0x80048104,
+	0xd2890003, 0x00000903,
 	0x80048104, 0xc069003a,
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000901,
+	0xbf84ffee, 0x807c847c,
+	0xbf0a7b7c, 0xbf85ffb1,
+	0xbf9c0000, 0xbf820012,
+	0x7e000300, 0x7e020301,
+	0x7e040302, 0x7e060303,
+	0xe0724000, 0x701d0000,
+	0xe0724100, 0x701d0100,
+	0xe0724200, 0x701d0200,
+	0xe0724300, 0x701d0300,
+	0x807c847c, 0x8070ff70,
+	0x00000400, 0xbf0a7b7c,
+	0xbf85ffef, 0xbf9c0000,
+	0xbefc0080, 0xbf11017c,
+	0x867aff78, 0x00400000,
+	0xbf850003, 0xb8faf803,
+	0x897a7aff, 0x10000000,
+	0xbf850059, 0xd3d84000,
+	0x18000100, 0xd3d84001,
+	0x18000101, 0xd3d84002,
+	0x18000102, 0xd3d84003,
+	0x18000103, 0xbe840080,
+	0xd2890000, 0x00000900,
 	0x80048104, 0xd2890001,
-	0x00000901, 0x80048104,
-	0xd2890002, 0x00000901,
+	0x00000900, 0x80048104,
+	0xd2890002, 0x00000900,
 	0x80048104, 0xd2890003,
-	0x00000901, 0x80048104,
+	0x00000900, 0x80048104,
 	0xc069003a, 0x00000070,
 	0xbf8cc07f, 0x80709070,
 	0xbf06c004, 0xbf84ffee,
 	0xbe840080, 0xd2890000,
-	0x00000902, 0x80048104,
-	0xd2890001, 0x00000902,
+	0x00000901, 0x80048104,
+	0xd2890001, 0x00000901,
 	0x80048104, 0xd2890002,
-	0x00000902, 0x80048104,
-	0xd2890003, 0x00000902,
+	0x00000901, 0x80048104,
+	0xd2890003, 0x00000901,
 	0x80048104, 0xc069003a,
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
 	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000903,
+	0xd2890000, 0x00000902,
 	0x80048104, 0xd2890001,
-	0x00000903, 0x80048104,
-	0xd2890002, 0x00000903,
+	0x00000902, 0x80048104,
+	0xd2890002, 0x00000902,
 	0x80048104, 0xd2890003,
-	0x00000903, 0x80048104,
+	0x00000902, 0x80048104,
 	0xc069003a, 0x00000070,
 	0xbf8cc07f, 0x80709070,
 	0xbf06c004, 0xbf84ffee,
-	0x807c847c, 0xbf0a7b7c,
-	0xbf85ffa9, 0xbf9c0000,
-	0xbf820016, 0xd3d84000,
-	0x18000100, 0xd3d84001,
-	0x18000101, 0xd3d84002,
-	0x18000102, 0xd3d84003,
-	0x18000103, 0xe0724000,
-	0x701d0000, 0xe0724100,
-	0x701d0100, 0xe0724200,
-	0x701d0200, 0xe0724300,
-	0x701d0300, 0x807c847c,
-	0x8070ff70, 0x00000400,
-	0xbf0a7b7c, 0xbf85ffeb,
-	0xbf9c0000, 0xbf8200e3,
-	0xbef4007e, 0x8675ff7f,
-	0x0000ffff, 0x8775ff75,
-	0x00040000, 0xbef60080,
-	0xbef700ff, 0x00807fac,
-	0x866eff7f, 0x04000000,
-	0xbf84001f, 0xbefe00c1,
-	0xbeff00c1, 0xb8ef4306,
-	0x866fc16f, 0xbf84001a,
-	0x8e6f866f, 0x8e6f826f,
-	0xbef6006f, 0xb8f82a05,
-	0x80788178, 0x8e788a78,
-	0x8e788178, 0xb8ee1605,
-	0x806e816e, 0x8e6e866e,
-	0x80786e78, 0x8078ff78,
-	0x00000080, 0xbef600ff,
-	0x01000000, 0xbefc0080,
-	0xe0510000, 0x781d0000,
-	0xe0510100, 0x781d0000,
-	0x807cff7c, 0x00000200,
-	0x8078ff78, 0x00000200,
-	0xbf0a6f7c, 0xbf85fff6,
+	0xbe840080, 0xd2890000,
+	0x00000903, 0x80048104,
+	0xd2890001, 0x00000903,
+	0x80048104, 0xd2890002,
+	0x00000903, 0x80048104,
+	0xd2890003, 0x00000903,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0x807c847c,
+	0xbf0a7b7c, 0xbf85ffa9,
+	0xbf9c0000, 0xbf820016,
+	0xd3d84000, 0x18000100,
+	0xd3d84001, 0x18000101,
+	0xd3d84002, 0x18000102,
+	0xd3d84003, 0x18000103,
+	0xe0724000, 0x701d0000,
+	0xe0724100, 0x701d0100,
+	0xe0724200, 0x701d0200,
+	0xe0724300, 0x701d0300,
+	0x807c847c, 0x8070ff70,
+	0x00000400, 0xbf0a7b7c,
+	0xbf85ffeb, 0xbf9c0000,
+	0xbf8200e3, 0xbef4007e,
+	0x8675ff7f, 0x0000ffff,
+	0x8775ff75, 0x00040000,
+	0xbef60080, 0xbef700ff,
+	0x00807fac, 0x866eff7f,
+	0x04000000, 0xbf84001f,
 	0xbefe00c1, 0xbeff00c1,
+	0xb8ef4306, 0x866fc16f,
+	0xbf84001a, 0x8e6f866f,
+	0x8e6f826f, 0xbef6006f,
+	0xb8f82a05, 0x80788178,
+	0x8e788a78, 0x8e788178,
+	0xb8ee1605, 0x806e816e,
+	0x8e6e866e, 0x80786e78,
+	0x8078ff78, 0x00000080,
 	0xbef600ff, 0x01000000,
-	0xb8ef2a05, 0x806f816f,
-	0x8e6f826f, 0x806fff6f,
-	0x00008000, 0xbef80080,
-	0xbeee0078, 0x8078ff78,
-	0x00000400, 0xbefc0084,
-	0xbf11087c, 0xe0524000,
-	0x781d0000, 0xe0524100,
-	0x781d0100, 0xe0524200,
-	0x781d0200, 0xe0524300,
-	0x781d0300, 0xbf8c0f70,
-	0x7e000300, 0x7e020301,
-	0x7e040302, 0x7e060303,
-	0x807c847c, 0x8078ff78,
-	0x00000400, 0xbf0a6f7c,
-	0xbf85ffee, 0xbefc0080,
-	0xbf11087c, 0xe0524000,
-	0x781d0000, 0xe0524100,
-	0x781d0100, 0xe0524200,
-	0x781d0200, 0xe0524300,
-	0x781d0300, 0xbf8c0f70,
-	0xd3d94000, 0x18000100,
-	0xd3d94001, 0x18000101,
-	0xd3d94002, 0x18000102,
-	0xd3d94003, 0x18000103,
-	0x807c847c, 0x8078ff78,
-	0x00000400, 0xbf0a6f7c,
-	0xbf85ffea, 0xbf9c0000,
-	0xe0524000, 0x6e1d0000,
-	0xe0524100, 0x6e1d0100,
-	0xe0524200, 0x6e1d0200,
-	0xe0524300, 0x6e1d0300,
-	0xbf8c0f70, 0xb8f82a05,
-	0x80788178, 0x8e788a78,
-	0x8e788178, 0xb8ee1605,
-	0x806e816e, 0x8e6e866e,
-	0x80786e78, 0x80f8c078,
-	0xb8ef1605, 0x806f816f,
-	0x8e6f846f, 0x8e76826f,
-	0xbef600ff, 0x01000000,
-	0xbefc006f, 0xc031003a,
-	0x00000078, 0x80f8c078,
-	0xbf8cc07f, 0x80fc907c,
-	0xbf800000, 0xbe802d00,
-	0xbe822d02, 0xbe842d04,
-	0xbe862d06, 0xbe882d08,
-	0xbe8a2d0a, 0xbe8c2d0c,
-	0xbe8e2d0e, 0xbf06807c,
-	0xbf84fff0, 0xb8f82a05,
-	0x80788178, 0x8e788a78,
-	0x8e788178, 0xb8ee1605,
-	0x806e816e, 0x8e6e866e,
-	0x80786e78, 0xbef60084,
-	0xbef600ff, 0x01000000,
-	0xc0211bfa, 0x00000078,
-	0x80788478, 0xc0211b3a,
+	0xbefc0080, 0xe0510000,
+	0x781d0000, 0xe0510100,
+	0x781d0000, 0x807cff7c,
+	0x00000200, 0x8078ff78,
+	0x00000200, 0xbf0a6f7c,
+	0xbf85fff6, 0xbefe00c1,
+	0xbeff00c1, 0xbef600ff,
+	0x01000000, 0xb8ef2a05,
+	0x806f816f, 0x8e6f826f,
+	0x806fff6f, 0x00008000,
+	0xbef80080, 0xbeee0078,
+	0x8078ff78, 0x00000400,
+	0xbefc0084, 0xbf11087c,
+	0xe0524000, 0x781d0000,
+	0xe0524100, 0x781d0100,
+	0xe0524200, 0x781d0200,
+	0xe0524300, 0x781d0300,
+	0xbf8c0f70, 0x7e000300,
+	0x7e020301, 0x7e040302,
+	0x7e060303, 0x807c847c,
+	0x8078ff78, 0x00000400,
+	0xbf0a6f7c, 0xbf85ffee,
+	0xbefc0080, 0xbf11087c,
+	0xe0524000, 0x781d0000,
+	0xe0524100, 0x781d0100,
+	0xe0524200, 0x781d0200,
+	0xe0524300, 0x781d0300,
+	0xbf8c0f70, 0xd3d94000,
+	0x18000100, 0xd3d94001,
+	0x18000101, 0xd3d94002,
+	0x18000102, 0xd3d94003,
+	0x18000103, 0x807c847c,
+	0x8078ff78, 0x00000400,
+	0xbf0a6f7c, 0xbf85ffea,
+	0xbf9c0000, 0xe0524000,
+	0x6e1d0000, 0xe0524100,
+	0x6e1d0100, 0xe0524200,
+	0x6e1d0200, 0xe0524300,
+	0x6e1d0300, 0xbf8c0f70,
+	0xb8f82a05, 0x80788178,
+	0x8e788a78, 0x8e788178,
+	0xb8ee1605, 0x806e816e,
+	0x8e6e866e, 0x80786e78,
+	0x80f8c078, 0xb8ef1605,
+	0x806f816f, 0x8e6f846f,
+	0x8e76826f, 0xbef600ff,
+	0x01000000, 0xbefc006f,
+	0xc031003a, 0x00000078,
+	0x80f8c078, 0xbf8cc07f,
+	0x80fc907c, 0xbf800000,
+	0xbe802d00, 0xbe822d02,
+	0xbe842d04, 0xbe862d06,
+	0xbe882d08, 0xbe8a2d0a,
+	0xbe8c2d0c, 0xbe8e2d0e,
+	0xbf06807c, 0xbf84fff0,
+	0xb8f82a05, 0x80788178,
+	0x8e788a78, 0x8e788178,
+	0xb8ee1605, 0x806e816e,
+	0x8e6e866e, 0x80786e78,
+	0xbef60084, 0xbef600ff,
+	0x01000000, 0xc0211bfa,
 	0x00000078, 0x80788478,
-	0xc0211b7a, 0x00000078,
-	0x80788478, 0xc0211c3a,
+	0xc0211b3a, 0x00000078,
+	0x80788478, 0xc0211b7a,
 	0x00000078, 0x80788478,
-	0xc0211c7a, 0x00000078,
-	0x80788478, 0xc0211eba,
+	0xc0211c3a, 0x00000078,
+	0x80788478, 0xc0211c7a,
 	0x00000078, 0x80788478,
-	0xc0211efa, 0x00000078,
-	0x80788478, 0xc0211a3a,
+	0xc0211eba, 0x00000078,
+	0x80788478, 0xc0211efa,
 	0x00000078, 0x80788478,
-	0xc0211a7a, 0x00000078,
-	0x80788478, 0xc0211cfa,
+	0xc0211a3a, 0x00000078,
+	0x80788478, 0xc0211a7a,
 	0x00000078, 0x80788478,
-	0xbf8cc07f, 0xbefc006f,
-	0xbefe0070, 0xbeff0071,
-	0x866f7bff, 0x000003ff,
-	0xb96f4803, 0x866f7bff,
-	0xfffff800, 0x8f6f8b6f,
-	0xb96fa2c3, 0xb973f801,
-	0xb8ee2a05, 0x806e816e,
-	0x8e6e8a6e, 0x8e6e816e,
-	0xb8ef1605, 0x806f816f,
-	0x8e6f866f, 0x806e6f6e,
-	0x806e746e, 0x826f8075,
-	0x866fff6f, 0x0000ffff,
-	0xc00b1c37, 0x00000050,
-	0xc00b1d37, 0x00000060,
-	0xc0031e77, 0x00000074,
-	0xbf8cc07f, 0x8f6e8b77,
-	0x866eff6e, 0x001f8000,
-	0xb96ef807, 0x866dff6d,
-	0x0000ffff, 0x86fe7e7e,
-	0x86ea6a6a, 0x8f6e837a,
-	0xb96ee0c2, 0xbf800002,
-	0xb97a0002, 0xbf8a0000,
-	0xbe801f6c, 0xbf810000,
+	0xc0211cfa, 0x00000078,
+	0x80788478, 0xbf8cc07f,
+	0xbefc006f, 0xbefe0070,
+	0xbeff0071, 0x866f7bff,
+	0x000003ff, 0xb96f4803,
+	0x866f7bff, 0xfffff800,
+	0x8f6f8b6f, 0xb96fa2c3,
+	0xb973f801, 0xb8ee2a05,
+	0x806e816e, 0x8e6e8a6e,
+	0x8e6e816e, 0xb8ef1605,
+	0x806f816f, 0x8e6f866f,
+	0x806e6f6e, 0x806e746e,
+	0x826f8075, 0x866fff6f,
+	0x0000ffff, 0xc00b1c37,
+	0x00000050, 0xc00b1d37,
+	0x00000060, 0xc0031e77,
+	0x00000074, 0xbf8cc07f,
+	0x8f6e8b77, 0x866eff6e,
+	0x001f8000, 0xb96ef807,
+	0x866dff6d, 0x0000ffff,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e837a, 0xb96ee0c2,
+	0xbf800002, 0xb97a0002,
+	0xbf8a0000, 0xbe801f6c,
+	0xbf810000, 0x00000000,
 };
 
 static const uint32_t cwsr_trap_aldebaran_hex[] = {
-	0xbf820001, 0xbf8202e5,
+	0xbf820001, 0xbf8202ea,
 	0xb8f8f802, 0x8978ff78,
 	0x00020006, 0xb8fbf803,
 	0x866eff78, 0x00002000,
 	0xbf840009, 0x866eff6d,
-	0x00ff0000, 0xbf85001e,
+	0x00ff0000, 0xbf850023,
 	0x866eff7b, 0x00000400,
-	0xbf85005b, 0xbf8e0010,
+	0xbf850060, 0xbf8e0010,
 	0xb8fbf803, 0xbf82fffa,
 	0x866eff7b, 0x03c00900,
-	0xbf850015, 0x866eff7b,
-	0x000071ff, 0xbf840008,
-	0x866fff7b, 0x00007080,
-	0xbf840001, 0xbeee1a87,
-	0xb8eff801, 0x8e6e8c6e,
-	0x866e6f6e, 0xbf85000a,
-	0x866eff6d, 0x00ff0000,
-	0xbf850007, 0xb8eef801,
-	0x866eff6e, 0x00000800,
-	0xbf850003, 0x866eff7b,
-	0x00000400, 0xbf850040,
-	0xb8faf807, 0x867aff7a,
-	0x001f8000, 0x8e7a8b7a,
-	0x8977ff77, 0xfc000000,
-	0x87777a77, 0xba7ff807,
-	0x00000000, 0xb8faf812,
-	0xb8fbf813, 0x8efa887a,
-	0xbf0d8f7b, 0xbf840002,
-	0x877bff7b, 0xffff0000,
-	0xc0031c3d, 0x00000010,
-	0xc0071bbd, 0x00000000,
-	0xc0071ebd, 0x00000008,
-	0xbf8cc07f, 0x8671ff6d,
-	0x01000000, 0xbf840004,
-	0x92f1ff70, 0x00010001,
-	0xbf840016, 0xbf820005,
-	0x86708170, 0x8e709770,
-	0x8977ff77, 0x00800000,
-	0x87777077, 0x86ee6e6e,
-	0xbf840001, 0xbe801d6e,
-	0x866eff6d, 0x01ff0000,
-	0xbf850005, 0x8778ff78,
-	0x00002000, 0x80ec886c,
-	0x82ed806d, 0xbf820005,
-	0x866eff6d, 0x01000000,
-	0xbf850002, 0x806c846c,
-	0x826d806d, 0x866dff6d,
-	0x0000ffff, 0x8f7a8b77,
+	0xbf85001a, 0x866eff6d,
+	0x01ff0000, 0xbf06ff6e,
+	0x01040000, 0xbf850015,
+	0x866eff7b, 0x000071ff,
+	0xbf840008, 0x866fff7b,
+	0x00007080, 0xbf840001,
+	0xbeee1a87, 0xb8eff801,
+	0x8e6e8c6e, 0x866e6f6e,
+	0xbf85000a, 0x866eff6d,
+	0x00ff0000, 0xbf850007,
+	0xb8eef801, 0x866eff6e,
+	0x00000800, 0xbf850003,
+	0x866eff7b, 0x00000400,
+	0xbf850040, 0xb8faf807,
 	0x867aff7a, 0x001f8000,
-	0xb97af807, 0x86fe7e7e,
-	0x86ea6a6a, 0x8f6e8378,
-	0xb96ee0c2, 0xbf800002,
-	0xb9780002, 0xbe801f6c,
-	0x866dff6d, 0x0000ffff,
-	0xbefa0080, 0xb97a0283,
-	0xb8faf807, 0x867aff7a,
-	0x001f8000, 0x8e7a8b7a,
-	0x8977ff77, 0xfc000000,
-	0x87777a77, 0xba7ff807,
-	0x00000000, 0xbeee007e,
-	0xbeef007f, 0xbefe0180,
-	0xbf900004, 0x877a8478,
-	0xb97af802, 0xbf8e0002,
-	0xbf88fffe, 0xb8fa2985,
-	0x807a817a, 0x8e7a8a7a,
-	0x8e7a817a, 0xb8fb1605,
-	0x807b817b, 0x8e7b867b,
-	0x807a7b7a, 0x807a7e7a,
-	0x827b807f, 0x867bff7b,
-	0x0000ffff, 0xc04b1c3d,
-	0x00000050, 0xbf8cc07f,
-	0xc04b1d3d, 0x00000060,
-	0xbf8cc07f, 0xc0431e7d,
-	0x00000074, 0xbf8cc07f,
-	0xbef4007e, 0x8675ff7f,
-	0x0000ffff, 0x8775ff75,
-	0x00040000, 0xbef60080,
-	0xbef700ff, 0x00807fac,
-	0xbef1007c, 0xbef00080,
-	0xb8f02985, 0x80708170,
-	0x8e708a70, 0x8e708170,
-	0xb8fa1605, 0x807a817a,
-	0x8e7a867a, 0x80707a70,
-	0xbef60084, 0xbef600ff,
-	0x01000000, 0xbefe007c,
-	0xbefc0070, 0xc0611c7a,
-	0x0000007c, 0xbf8cc07f,
-	0x80708470, 0xbefc007e,
+	0x8e7a8b7a, 0x8977ff77,
+	0xfc000000, 0x87777a77,
+	0xba7ff807, 0x00000000,
+	0xb8faf812, 0xb8fbf813,
+	0x8efa887a, 0xbf0d8f7b,
+	0xbf840002, 0x877bff7b,
+	0xffff0000, 0xc0031c3d,
+	0x00000010, 0xc0071bbd,
+	0x00000000, 0xc0071ebd,
+	0x00000008, 0xbf8cc07f,
+	0x8671ff6d, 0x01000000,
+	0xbf840004, 0x92f1ff70,
+	0x00010001, 0xbf840016,
+	0xbf820005, 0x86708170,
+	0x8e709770, 0x8977ff77,
+	0x00800000, 0x87777077,
+	0x86ee6e6e, 0xbf840001,
+	0xbe801d6e, 0x866eff6d,
+	0x01ff0000, 0xbf850005,
+	0x8778ff78, 0x00002000,
+	0x80ec886c, 0x82ed806d,
+	0xbf820005, 0x866eff6d,
+	0x01000000, 0xbf850002,
+	0x806c846c, 0x826d806d,
+	0x866dff6d, 0x0000ffff,
+	0x8f7a8b77, 0x867aff7a,
+	0x001f8000, 0xb97af807,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e8378, 0xb96ee0c2,
+	0xbf800002, 0xb9780002,
+	0xbe801f6c, 0x866dff6d,
+	0x0000ffff, 0xbefa0080,
+	0xb97a0283, 0xb8faf807,
+	0x867aff7a, 0x001f8000,
+	0x8e7a8b7a, 0x8977ff77,
+	0xfc000000, 0x87777a77,
+	0xba7ff807, 0x00000000,
+	0xbeee007e, 0xbeef007f,
+	0xbefe0180, 0xbf900004,
+	0x877a8478, 0xb97af802,
+	0xbf8e0002, 0xbf88fffe,
+	0xb8fa2985, 0x807a817a,
+	0x8e7a8a7a, 0x8e7a817a,
+	0xb8fb1605, 0x807b817b,
+	0x8e7b867b, 0x807a7b7a,
+	0x807a7e7a, 0x827b807f,
+	0x867bff7b, 0x0000ffff,
+	0xc04b1c3d, 0x00000050,
+	0xbf8cc07f, 0xc04b1d3d,
+	0x00000060, 0xbf8cc07f,
+	0xc0431e7d, 0x00000074,
+	0xbf8cc07f, 0xbef4007e,
+	0x8675ff7f, 0x0000ffff,
+	0x8775ff75, 0x00040000,
+	0xbef60080, 0xbef700ff,
+	0x00807fac, 0xbef1007c,
+	0xbef00080, 0xb8f02985,
+	0x80708170, 0x8e708a70,
+	0x8e708170, 0xb8fa1605,
+	0x807a817a, 0x8e7a867a,
+	0x80707a70, 0xbef60084,
+	0xbef600ff, 0x01000000,
 	0xbefe007c, 0xbefc0070,
-	0xc0611b3a, 0x0000007c,
+	0xc0611c7a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611b7a,
+	0xbefc0070, 0xc0611b3a,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611bba, 0x0000007c,
+	0xc0611b7a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611bfa,
+	0xbefc0070, 0xc0611bba,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611e3a, 0x0000007c,
-	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0xb8fbf803,
-	0xbefe007c, 0xbefc0070,
-	0xc0611efa, 0x0000007c,
+	0xc0611bfa, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611a3a,
+	0xbefc0070, 0xc0611e3a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8fbf803, 0xbefe007c,
+	0xbefc0070, 0xc0611efa,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611a7a, 0x0000007c,
-	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0xb8f1f801,
-	0xbefe007c, 0xbefc0070,
-	0xc0611c7a, 0x0000007c,
+	0xc0611a3a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0x867aff7f,
-	0x04000000, 0xbeef0080,
-	0x876f6f7a, 0xb8f02985,
-	0x80708170, 0x8e708a70,
-	0x8e708170, 0xb8fb1605,
-	0x807b817b, 0x8e7b847b,
-	0x8e76827b, 0xbef600ff,
-	0x01000000, 0xbef20174,
-	0x80747074, 0x82758075,
-	0xbefc0080, 0xbf800000,
-	0xbe802b00, 0xbe822b02,
-	0xbe842b04, 0xbe862b06,
-	0xbe882b08, 0xbe8a2b0a,
-	0xbe8c2b0c, 0xbe8e2b0e,
-	0xc06b003a, 0x00000000,
-	0xbf8cc07f, 0xc06b013a,
-	0x00000010, 0xbf8cc07f,
-	0xc06b023a, 0x00000020,
-	0xbf8cc07f, 0xc06b033a,
-	0x00000030, 0xbf8cc07f,
-	0x8074c074, 0x82758075,
-	0x807c907c, 0xbf0a7b7c,
-	0xbf85ffe7, 0xbef40172,
-	0xbef00080, 0xbefe00c1,
-	0xbeff00c1, 0xbee80080,
-	0xbee90080, 0xbef600ff,
-	0x01000000, 0x867aff78,
-	0x00400000, 0xbf850003,
-	0xb8faf803, 0x897a7aff,
-	0x10000000, 0xbf85004d,
-	0xbe840080, 0xd2890000,
-	0x00000900, 0x80048104,
-	0xd2890001, 0x00000900,
-	0x80048104, 0xd2890002,
-	0x00000900, 0x80048104,
-	0xd2890003, 0x00000900,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000901,
-	0x80048104, 0xd2890001,
-	0x00000901, 0x80048104,
-	0xd2890002, 0x00000901,
-	0x80048104, 0xd2890003,
-	0x00000901, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbe840080, 0xd2890000,
-	0x00000902, 0x80048104,
-	0xd2890001, 0x00000902,
-	0x80048104, 0xd2890002,
-	0x00000902, 0x80048104,
-	0xd2890003, 0x00000902,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000903,
-	0x80048104, 0xd2890001,
-	0x00000903, 0x80048104,
-	0xd2890002, 0x00000903,
-	0x80048104, 0xd2890003,
-	0x00000903, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbf820008, 0xe0724000,
-	0x701d0000, 0xe0724100,
-	0x701d0100, 0xe0724200,
-	0x701d0200, 0xe0724300,
-	0x701d0300, 0xbefe00c1,
-	0xbeff00c1, 0xb8fb4306,
-	0x867bc17b, 0xbf840064,
-	0xbf8a0000, 0x867aff6f,
-	0x04000000, 0xbf840060,
-	0x8e7b867b, 0x8e7b827b,
-	0xbef6007b, 0xb8f02985,
-	0x80708170, 0x8e708a70,
-	0x8e708170, 0xb8fa1605,
-	0x807a817a, 0x8e7a867a,
-	0x80707a70, 0x8070ff70,
-	0x00000080, 0xbef600ff,
-	0x01000000, 0xbefc0080,
-	0xd28c0002, 0x000100c1,
-	0xd28d0003, 0x000204c1,
+	0xbefc007e, 0xbefe007c,
+	0xbefc0070, 0xc0611a7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8f1f801, 0xbefe007c,
+	0xbefc0070, 0xc0611c7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0x867aff7f, 0x04000000,
+	0xbeef0080, 0x876f6f7a,
+	0xb8f02985, 0x80708170,
+	0x8e708a70, 0x8e708170,
+	0xb8fb1605, 0x807b817b,
+	0x8e7b847b, 0x8e76827b,
+	0xbef600ff, 0x01000000,
+	0xbef20174, 0x80747074,
+	0x82758075, 0xbefc0080,
+	0xbf800000, 0xbe802b00,
+	0xbe822b02, 0xbe842b04,
+	0xbe862b06, 0xbe882b08,
+	0xbe8a2b0a, 0xbe8c2b0c,
+	0xbe8e2b0e, 0xc06b003a,
+	0x00000000, 0xbf8cc07f,
+	0xc06b013a, 0x00000010,
+	0xbf8cc07f, 0xc06b023a,
+	0x00000020, 0xbf8cc07f,
+	0xc06b033a, 0x00000030,
+	0xbf8cc07f, 0x8074c074,
+	0x82758075, 0x807c907c,
+	0xbf0a7b7c, 0xbf85ffe7,
+	0xbef40172, 0xbef00080,
+	0xbefe00c1, 0xbeff00c1,
+	0xbee80080, 0xbee90080,
+	0xbef600ff, 0x01000000,
 	0x867aff78, 0x00400000,
 	0xbf850003, 0xb8faf803,
 	0x897a7aff, 0x10000000,
-	0xbf850030, 0x24040682,
-	0xd86e4000, 0x00000002,
-	0xbf8cc07f, 0xbe840080,
+	0xbf85004d, 0xbe840080,
 	0xd2890000, 0x00000900,
 	0x80048104, 0xd2890001,
 	0x00000900, 0x80048104,
@@ -1815,31 +1761,50 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = {
 	0x80048104, 0xc069003a,
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0x680404ff,
-	0x00000200, 0xd0c9006a,
-	0x0000f702, 0xbf87ffd2,
-	0xbf820015, 0xd1060002,
-	0x00011103, 0x7e0602ff,
-	0x00000200, 0xbefc00ff,
-	0x00010000, 0xbe800077,
-	0x8677ff77, 0xff7fffff,
-	0x8777ff77, 0x00058000,
-	0xd8ec0000, 0x00000002,
-	0xbf8cc07f, 0xe0765000,
-	0x701d0002, 0x68040702,
-	0xd0c9006a, 0x0000f702,
-	0xbf87fff7, 0xbef70000,
-	0xbef000ff, 0x00000400,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000902,
+	0x80048104, 0xd2890001,
+	0x00000902, 0x80048104,
+	0xd2890002, 0x00000902,
+	0x80048104, 0xd2890003,
+	0x00000902, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000903, 0x80048104,
+	0xd2890001, 0x00000903,
+	0x80048104, 0xd2890002,
+	0x00000903, 0x80048104,
+	0xd2890003, 0x00000903,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbf820008,
+	0xe0724000, 0x701d0000,
+	0xe0724100, 0x701d0100,
+	0xe0724200, 0x701d0200,
+	0xe0724300, 0x701d0300,
 	0xbefe00c1, 0xbeff00c1,
-	0xb8fb2b05, 0x807b817b,
-	0x8e7b827b, 0xbef600ff,
-	0x01000000, 0xbefc0084,
-	0xbf0a7b7c, 0xbf84006d,
-	0xbf11017c, 0x807bff7b,
-	0x00001000, 0x867aff78,
+	0xb8fb4306, 0x867bc17b,
+	0xbf840064, 0xbf8a0000,
+	0x867aff6f, 0x04000000,
+	0xbf840060, 0x8e7b867b,
+	0x8e7b827b, 0xbef6007b,
+	0xb8f02985, 0x80708170,
+	0x8e708a70, 0x8e708170,
+	0xb8fa1605, 0x807a817a,
+	0x8e7a867a, 0x80707a70,
+	0x8070ff70, 0x00000080,
+	0xbef600ff, 0x01000000,
+	0xbefc0080, 0xd28c0002,
+	0x000100c1, 0xd28d0003,
+	0x000204c1, 0x867aff78,
 	0x00400000, 0xbf850003,
 	0xb8faf803, 0x897a7aff,
-	0x10000000, 0xbf850051,
+	0x10000000, 0xbf850030,
+	0x24040682, 0xd86e4000,
+	0x00000002, 0xbf8cc07f,
 	0xbe840080, 0xd2890000,
 	0x00000900, 0x80048104,
 	0xd2890001, 0x00000900,
@@ -1856,54 +1821,34 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = {
 	0xd2890002, 0x00000901,
 	0x80048104, 0xd2890003,
 	0x00000901, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbe840080, 0xd2890000,
-	0x00000902, 0x80048104,
-	0xd2890001, 0x00000902,
-	0x80048104, 0xd2890002,
-	0x00000902, 0x80048104,
-	0xd2890003, 0x00000902,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000903,
-	0x80048104, 0xd2890001,
-	0x00000903, 0x80048104,
-	0xd2890002, 0x00000903,
-	0x80048104, 0xd2890003,
-	0x00000903, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0x807c847c, 0xbf0a7b7c,
-	0xbf85ffb1, 0xbf9c0000,
-	0xbf820012, 0x7e000300,
-	0x7e020301, 0x7e040302,
-	0x7e060303, 0xe0724000,
-	0x701d0000, 0xe0724100,
-	0x701d0100, 0xe0724200,
-	0x701d0200, 0xe0724300,
-	0x701d0300, 0x807c847c,
-	0x8070ff70, 0x00000400,
-	0xbf0a7b7c, 0xbf85ffef,
-	0xbf9c0000, 0xb8fb2985,
-	0x807b817b, 0x8e7b837b,
-	0xb8fa2b05, 0x807a817a,
-	0x8e7a827a, 0x80fb7a7b,
-	0x867b7b7b, 0xbf84007a,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0x680404ff, 0x00000200,
+	0xd0c9006a, 0x0000f702,
+	0xbf87ffd2, 0xbf820015,
+	0xd1060002, 0x00011103,
+	0x7e0602ff, 0x00000200,
+	0xbefc00ff, 0x00010000,
+	0xbe800077, 0x8677ff77,
+	0xff7fffff, 0x8777ff77,
+	0x00058000, 0xd8ec0000,
+	0x00000002, 0xbf8cc07f,
+	0xe0765000, 0x701d0002,
+	0x68040702, 0xd0c9006a,
+	0x0000f702, 0xbf87fff7,
+	0xbef70000, 0xbef000ff,
+	0x00000400, 0xbefe00c1,
+	0xbeff00c1, 0xb8fb2b05,
+	0x807b817b, 0x8e7b827b,
+	0xbef600ff, 0x01000000,
+	0xbefc0084, 0xbf0a7b7c,
+	0xbf84006d, 0xbf11017c,
 	0x807bff7b, 0x00001000,
-	0xbefc0080, 0xbf11017c,
 	0x867aff78, 0x00400000,
 	0xbf850003, 0xb8faf803,
 	0x897a7aff, 0x10000000,
-	0xbf850059, 0xd3d84000,
-	0x18000100, 0xd3d84001,
-	0x18000101, 0xd3d84002,
-	0x18000102, 0xd3d84003,
-	0x18000103, 0xbe840080,
+	0xbf850051, 0xbe840080,
 	0xd2890000, 0x00000900,
 	0x80048104, 0xd2890001,
 	0x00000900, 0x80048104,
@@ -1942,139 +1887,203 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = {
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
 	0xbf84ffee, 0x807c847c,
-	0xbf0a7b7c, 0xbf85ffa9,
-	0xbf9c0000, 0xbf820016,
-	0xd3d84000, 0x18000100,
-	0xd3d84001, 0x18000101,
-	0xd3d84002, 0x18000102,
-	0xd3d84003, 0x18000103,
+	0xbf0a7b7c, 0xbf85ffb1,
+	0xbf9c0000, 0xbf820012,
+	0x7e000300, 0x7e020301,
+	0x7e040302, 0x7e060303,
 	0xe0724000, 0x701d0000,
 	0xe0724100, 0x701d0100,
 	0xe0724200, 0x701d0200,
 	0xe0724300, 0x701d0300,
 	0x807c847c, 0x8070ff70,
 	0x00000400, 0xbf0a7b7c,
-	0xbf85ffeb, 0xbf9c0000,
-	0xbf8200ee, 0xbef4007e,
-	0x8675ff7f, 0x0000ffff,
-	0x8775ff75, 0x00040000,
-	0xbef60080, 0xbef700ff,
-	0x00807fac, 0x866eff7f,
-	0x04000000, 0xbf84001f,
+	0xbf85ffef, 0xbf9c0000,
+	0xb8fb2985, 0x807b817b,
+	0x8e7b837b, 0xb8fa2b05,
+	0x807a817a, 0x8e7a827a,
+	0x80fb7a7b, 0x867b7b7b,
+	0xbf84007a, 0x807bff7b,
+	0x00001000, 0xbefc0080,
+	0xbf11017c, 0x867aff78,
+	0x00400000, 0xbf850003,
+	0xb8faf803, 0x897a7aff,
+	0x10000000, 0xbf850059,
+	0xd3d84000, 0x18000100,
+	0xd3d84001, 0x18000101,
+	0xd3d84002, 0x18000102,
+	0xd3d84003, 0x18000103,
+	0xbe840080, 0xd2890000,
+	0x00000900, 0x80048104,
+	0xd2890001, 0x00000900,
+	0x80048104, 0xd2890002,
+	0x00000900, 0x80048104,
+	0xd2890003, 0x00000900,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000901,
+	0x80048104, 0xd2890001,
+	0x00000901, 0x80048104,
+	0xd2890002, 0x00000901,
+	0x80048104, 0xd2890003,
+	0x00000901, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000902, 0x80048104,
+	0xd2890001, 0x00000902,
+	0x80048104, 0xd2890002,
+	0x00000902, 0x80048104,
+	0xd2890003, 0x00000902,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000903,
+	0x80048104, 0xd2890001,
+	0x00000903, 0x80048104,
+	0xd2890002, 0x00000903,
+	0x80048104, 0xd2890003,
+	0x00000903, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0x807c847c, 0xbf0a7b7c,
+	0xbf85ffa9, 0xbf9c0000,
+	0xbf820016, 0xd3d84000,
+	0x18000100, 0xd3d84001,
+	0x18000101, 0xd3d84002,
+	0x18000102, 0xd3d84003,
+	0x18000103, 0xe0724000,
+	0x701d0000, 0xe0724100,
+	0x701d0100, 0xe0724200,
+	0x701d0200, 0xe0724300,
+	0x701d0300, 0x807c847c,
+	0x8070ff70, 0x00000400,
+	0xbf0a7b7c, 0xbf85ffeb,
+	0xbf9c0000, 0xbf8200ee,
+	0xbef4007e, 0x8675ff7f,
+	0x0000ffff, 0x8775ff75,
+	0x00040000, 0xbef60080,
+	0xbef700ff, 0x00807fac,
+	0x866eff7f, 0x04000000,
+	0xbf84001f, 0xbefe00c1,
+	0xbeff00c1, 0xb8ef4306,
+	0x866fc16f, 0xbf84001a,
+	0x8e6f866f, 0x8e6f826f,
+	0xbef6006f, 0xb8f82985,
+	0x80788178, 0x8e788a78,
+	0x8e788178, 0xb8ee1605,
+	0x806e816e, 0x8e6e866e,
+	0x80786e78, 0x8078ff78,
+	0x00000080, 0xbef600ff,
+	0x01000000, 0xbefc0080,
+	0xe0510000, 0x781d0000,
+	0xe0510100, 0x781d0000,
+	0x807cff7c, 0x00000200,
+	0x8078ff78, 0x00000200,
+	0xbf0a6f7c, 0xbf85fff6,
 	0xbefe00c1, 0xbeff00c1,
-	0xb8ef4306, 0x866fc16f,
-	0xbf84001a, 0x8e6f866f,
-	0x8e6f826f, 0xbef6006f,
-	0xb8f82985, 0x80788178,
-	0x8e788a78, 0x8e788178,
-	0xb8ee1605, 0x806e816e,
-	0x8e6e866e, 0x80786e78,
-	0x8078ff78, 0x00000080,
 	0xbef600ff, 0x01000000,
-	0xbefc0080, 0xe0510000,
-	0x781d0000, 0xe0510100,
-	0x781d0000, 0x807cff7c,
-	0x00000200, 0x8078ff78,
-	0x00000200, 0xbf0a6f7c,
-	0xbf85fff6, 0xbefe00c1,
-	0xbeff00c1, 0xbef600ff,
-	0x01000000, 0xb8ef2b05,
-	0x806f816f, 0x8e6f826f,
-	0x806fff6f, 0x00008000,
-	0xbef80080, 0xbeee0078,
-	0x8078ff78, 0x00000400,
-	0xbefc0084, 0xbf11087c,
-	0xe0524000, 0x781d0000,
-	0xe0524100, 0x781d0100,
-	0xe0524200, 0x781d0200,
-	0xe0524300, 0x781d0300,
-	0xbf8c0f70, 0x7e000300,
-	0x7e020301, 0x7e040302,
-	0x7e060303, 0x807c847c,
-	0x8078ff78, 0x00000400,
-	0xbf0a6f7c, 0xbf85ffee,
-	0xb8ef2985, 0x806f816f,
-	0x8e6f836f, 0xb8f92b05,
-	0x80798179, 0x8e798279,
-	0x80ef796f, 0x866f6f6f,
-	0xbf84001a, 0x806fff6f,
-	0x00008000, 0xbefc0080,
+	0xb8ef2b05, 0x806f816f,
+	0x8e6f826f, 0x806fff6f,
+	0x00008000, 0xbef80080,
+	0xbeee0078, 0x8078ff78,
+	0x00000400, 0xbefc0084,
 	0xbf11087c, 0xe0524000,
 	0x781d0000, 0xe0524100,
 	0x781d0100, 0xe0524200,
 	0x781d0200, 0xe0524300,
 	0x781d0300, 0xbf8c0f70,
-	0xd3d94000, 0x18000100,
-	0xd3d94001, 0x18000101,
-	0xd3d94002, 0x18000102,
-	0xd3d94003, 0x18000103,
+	0x7e000300, 0x7e020301,
+	0x7e040302, 0x7e060303,
 	0x807c847c, 0x8078ff78,
 	0x00000400, 0xbf0a6f7c,
-	0xbf85ffea, 0xbf9c0000,
-	0xe0524000, 0x6e1d0000,
-	0xe0524100, 0x6e1d0100,
-	0xe0524200, 0x6e1d0200,
-	0xe0524300, 0x6e1d0300,
-	0xbf8c0f70, 0xb8f82985,
-	0x80788178, 0x8e788a78,
-	0x8e788178, 0xb8ee1605,
-	0x806e816e, 0x8e6e866e,
-	0x80786e78, 0x80f8c078,
-	0xb8ef1605, 0x806f816f,
-	0x8e6f846f, 0x8e76826f,
-	0xbef600ff, 0x01000000,
-	0xbefc006f, 0xc031003a,
-	0x00000078, 0x80f8c078,
-	0xbf8cc07f, 0x80fc907c,
-	0xbf800000, 0xbe802d00,
-	0xbe822d02, 0xbe842d04,
-	0xbe862d06, 0xbe882d08,
-	0xbe8a2d0a, 0xbe8c2d0c,
-	0xbe8e2d0e, 0xbf06807c,
-	0xbf84fff0, 0xb8f82985,
-	0x80788178, 0x8e788a78,
-	0x8e788178, 0xb8ee1605,
-	0x806e816e, 0x8e6e866e,
-	0x80786e78, 0xbef60084,
-	0xbef600ff, 0x01000000,
-	0xc0211bfa, 0x00000078,
-	0x80788478, 0xc0211b3a,
+	0xbf85ffee, 0xb8ef2985,
+	0x806f816f, 0x8e6f836f,
+	0xb8f92b05, 0x80798179,
+	0x8e798279, 0x80ef796f,
+	0x866f6f6f, 0xbf84001a,
+	0x806fff6f, 0x00008000,
+	0xbefc0080, 0xbf11087c,
+	0xe0524000, 0x781d0000,
+	0xe0524100, 0x781d0100,
+	0xe0524200, 0x781d0200,
+	0xe0524300, 0x781d0300,
+	0xbf8c0f70, 0xd3d94000,
+	0x18000100, 0xd3d94001,
+	0x18000101, 0xd3d94002,
+	0x18000102, 0xd3d94003,
+	0x18000103, 0x807c847c,
+	0x8078ff78, 0x00000400,
+	0xbf0a6f7c, 0xbf85ffea,
+	0xbf9c0000, 0xe0524000,
+	0x6e1d0000, 0xe0524100,
+	0x6e1d0100, 0xe0524200,
+	0x6e1d0200, 0xe0524300,
+	0x6e1d0300, 0xbf8c0f70,
+	0xb8f82985, 0x80788178,
+	0x8e788a78, 0x8e788178,
+	0xb8ee1605, 0x806e816e,
+	0x8e6e866e, 0x80786e78,
+	0x80f8c078, 0xb8ef1605,
+	0x806f816f, 0x8e6f846f,
+	0x8e76826f, 0xbef600ff,
+	0x01000000, 0xbefc006f,
+	0xc031003a, 0x00000078,
+	0x80f8c078, 0xbf8cc07f,
+	0x80fc907c, 0xbf800000,
+	0xbe802d00, 0xbe822d02,
+	0xbe842d04, 0xbe862d06,
+	0xbe882d08, 0xbe8a2d0a,
+	0xbe8c2d0c, 0xbe8e2d0e,
+	0xbf06807c, 0xbf84fff0,
+	0xb8f82985, 0x80788178,
+	0x8e788a78, 0x8e788178,
+	0xb8ee1605, 0x806e816e,
+	0x8e6e866e, 0x80786e78,
+	0xbef60084, 0xbef600ff,
+	0x01000000, 0xc0211bfa,
 	0x00000078, 0x80788478,
-	0xc0211b7a, 0x00000078,
-	0x80788478, 0xc0211c3a,
+	0xc0211b3a, 0x00000078,
+	0x80788478, 0xc0211b7a,
 	0x00000078, 0x80788478,
-	0xc0211c7a, 0x00000078,
-	0x80788478, 0xc0211eba,
+	0xc0211c3a, 0x00000078,
+	0x80788478, 0xc0211c7a,
 	0x00000078, 0x80788478,
-	0xc0211efa, 0x00000078,
-	0x80788478, 0xc0211a3a,
+	0xc0211eba, 0x00000078,
+	0x80788478, 0xc0211efa,
 	0x00000078, 0x80788478,
-	0xc0211a7a, 0x00000078,
-	0x80788478, 0xc0211cfa,
+	0xc0211a3a, 0x00000078,
+	0x80788478, 0xc0211a7a,
 	0x00000078, 0x80788478,
-	0xbf8cc07f, 0xbefc006f,
-	0xbefe0070, 0xbeff0071,
-	0x866f7bff, 0x000003ff,
-	0xb96f4803, 0x866f7bff,
-	0xfffff800, 0x8f6f8b6f,
-	0xb96fa2c3, 0xb973f801,
-	0xb8ee2985, 0x806e816e,
-	0x8e6e8a6e, 0x8e6e816e,
-	0xb8ef1605, 0x806f816f,
-	0x8e6f866f, 0x806e6f6e,
-	0x806e746e, 0x826f8075,
-	0x866fff6f, 0x0000ffff,
-	0xc00b1c37, 0x00000050,
-	0xc00b1d37, 0x00000060,
-	0xc0031e77, 0x00000074,
-	0xbf8cc07f, 0x8f6e8b77,
-	0x866eff6e, 0x001f8000,
-	0xb96ef807, 0x866dff6d,
-	0x0000ffff, 0x86fe7e7e,
-	0x86ea6a6a, 0x8f6e837a,
-	0xb96ee0c2, 0xbf800002,
-	0xb97a0002, 0xbf8a0000,
-	0xbe801f6c, 0xbf810000,
+	0xc0211cfa, 0x00000078,
+	0x80788478, 0xbf8cc07f,
+	0xbefc006f, 0xbefe0070,
+	0xbeff0071, 0x866f7bff,
+	0x000003ff, 0xb96f4803,
+	0x866f7bff, 0xfffff800,
+	0x8f6f8b6f, 0xb96fa2c3,
+	0xb973f801, 0xb8ee2985,
+	0x806e816e, 0x8e6e8a6e,
+	0x8e6e816e, 0xb8ef1605,
+	0x806f816f, 0x8e6f866f,
+	0x806e6f6e, 0x806e746e,
+	0x826f8075, 0x866fff6f,
+	0x0000ffff, 0xc00b1c37,
+	0x00000050, 0xc00b1d37,
+	0x00000060, 0xc0031e77,
+	0x00000074, 0xbf8cc07f,
+	0x8f6e8b77, 0x866eff6e,
+	0x001f8000, 0xb96ef807,
+	0x866dff6d, 0x0000ffff,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e837a, 0xb96ee0c2,
+	0xbf800002, 0xb97a0002,
+	0xbf8a0000, 0xbe801f6c,
+	0xbf810000, 0x00000000,
 };
 
 static const uint32_t cwsr_trap_gfx10_hex[] = {
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
index 6880340c25af..f1d12e42f89a 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -244,6 +244,11 @@ L_NOT_HALTED:
         SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK
     s_cbranch_scc1  L_FETCH_2ND_TRAP
 
+    // Check TTMP1 bits 24 (HT) and 23:16(trapID): HT == 1 & trapID == 4
+    s_and_b32       ttmp2, s_save_pc_hi, (S_SAVE_PC_HI_TRAP_ID_MASK|S_SAVE_PC_HI_HT_MASK)
+    s_cmp_eq_u32    ttmp2, 0x1040000
+    s_cbranch_scc1  L_FETCH_2ND_TRAP
+
     // Check for maskable exceptions in trapsts.excp and trapsts.excp_hi.
     // Maskable exceptions only cause the wave to enter the trap handler if
     // their respective bit in mode.excp_en is set.
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 13/24] drm/amdgpu: add sq host trap status check
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (11 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 12/24] drm/amdgpu: use trapID 4 for " James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-10 19:07   ` Yat Sin, David
  2023-11-20 16:16   ` [PATCH v2 " James Zhu
  2023-11-03 13:11 ` [PATCH 14/24] drm/amdkfd: trigger pc sampling trap for arcturus James Zhu
                   ` (11 subsequent siblings)
  24 siblings, 2 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Before firing a new host trap, check the host trap status.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 35 +++++++++++++++++++
 .../amd/include/asic_reg/gc/gc_9_0_offset.h   |  2 ++
 .../amd/include/asic_reg/gc/gc_9_0_sh_mask.h  |  5 +++
 3 files changed, 42 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 740d8a0c9252..2c5bbbb7e34e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -1146,6 +1146,35 @@ void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
 	kgd_gfx_v9_unlock_srbm(adev, inst);
 }
 
+/* Return the first non-zero SQ_HOSTTRAP_STATUS read across all SE/SH, or 0 when none is found. */
+static uint32_t kgd_aldebaran_get_hosttrap_status(struct amdgpu_device *adev)
+{
+	uint32_t sq_hosttrap_status = 0;	/* defined result even if the loops below never execute */
+	int i, j;
+
+	mutex_lock(&adev->grbm_idx_mutex);	/* held for as long as the SE/SH selection is changed */
+	for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
+		for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
+			amdgpu_gfx_select_se_sh(adev, i, j, 0xffffffff, 0);
+			sq_hosttrap_status = RREG32_SOC15(GC, 0, mmSQ_HOSTTRAP_STATUS);
+			if (sq_hosttrap_status & SQ_HOSTTRAP_STATUS__HTPENDING_OVERRIDE_MASK) {
+				WREG32_SOC15(GC, 0, mmSQ_HOSTTRAP_STATUS,
+					SQ_HOSTTRAP_STATUS__HTPENDING_OVERRIDE_MASK);	/* presumably acknowledges the override - confirm against the register spec */
+				sq_hosttrap_status = 0x0;	/* an override hit is not reported as pending */
+				continue;
+			}
+			if (sq_hosttrap_status)
+				goto out;	/* stop at the first SE/SH that reports a non-zero status */
+		}
+	}
+
+out:
+	amdgpu_gfx_select_se_sh(adev, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0);	/* all-ones selection (presumably broadcast) before unlocking */
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return sq_hosttrap_status;
+}
+
 uint32_t kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,
 					    uint32_t vmid,
 					    uint32_t max_wave_slot,
@@ -1156,6 +1185,12 @@ uint32_t kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,
 {
 	if (method == KFD_IOCTL_PCS_METHOD_HOSTTRAP) {
 		uint32_t value = 0;
+		uint32_t sq_hosttrap_status;
+
+		sq_hosttrap_status = kgd_aldebaran_get_hosttrap_status(adev);
+		/* skip when last host trap request is still pending to complete */
+		if (sq_hosttrap_status)
+			return 0;
 
 		value = REG_SET_FIELD(value, SQ_CMD, CMD, SQ_IND_CMD_CMD_TRAP);
 		value = REG_SET_FIELD(value, SQ_CMD, MODE, SQ_IND_CMD_MODE_SINGLE);
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h
index 12d451e5475b..5b17d9066452 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h
@@ -462,6 +462,8 @@
 #define mmSQ_IND_DATA_BASE_IDX                                                                         0
 #define mmSQ_CMD                                                                                       0x037b
 #define mmSQ_CMD_BASE_IDX                                                                              0
+#define mmSQ_HOSTTRAP_STATUS                                                                           0x0376
+#define mmSQ_HOSTTRAP_STATUS_BASE_IDX                                                                  0
 #define mmSQ_TIME_HI                                                                                   0x037c
 #define mmSQ_TIME_HI_BASE_IDX                                                                          0
 #define mmSQ_TIME_LO                                                                                   0x037d
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
index efc16ddf274a..3dfe4ab31421 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
@@ -2616,6 +2616,11 @@
 //SQ_CMD_TIMESTAMP
 #define SQ_CMD_TIMESTAMP__TIMESTAMP__SHIFT                                                                    0x0
 #define SQ_CMD_TIMESTAMP__TIMESTAMP_MASK                                                                      0x000000FFL
+//SQ_HOSTTRAP_STATUS
+#define SQ_HOSTTRAP_STATUS__HTPENDINGCOUNT__SHIFT                                                             0x0
+#define SQ_HOSTTRAP_STATUS__HTPENDING_OVERRIDE__SHIFT                                                         0x8
+#define SQ_HOSTTRAP_STATUS__HTPENDINGCOUNT_MASK                                                               0x000000FFL
+#define SQ_HOSTTRAP_STATUS__HTPENDING_OVERRIDE_MASK                                                           0x00000100L
 //SQ_IND_INDEX
 #define SQ_IND_INDEX__WAVE_ID__SHIFT                                                                          0x0
 #define SQ_IND_INDEX__SIMD_ID__SHIFT                                                                          0x4
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 14/24] drm/amdkfd: trigger pc sampling trap for arcturus
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (12 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 13/24] drm/amdgpu: add sq host trap status check James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-03 13:11 ` [PATCH 15/24] drm/amdkfd: trigger pc sampling trap for aldebaran James Zhu
                   ` (10 subsequent siblings)
  24 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Implement trigger pc sampling trap for arcturus.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c    | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 625db444df1c..f3d89a469b51 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -390,6 +390,17 @@ static uint32_t kgd_arcturus_disable_debug_trap(struct amdgpu_device *adev,
 
 	return 0;
 }
+
+static uint32_t kgd_arcturus_trigger_pc_sample_trap(struct amdgpu_device *adev,
+					    uint32_t vmid,
+					    uint32_t *target_simd,
+					    uint32_t *target_wave_slot,
+					    enum kfd_ioctl_pc_sample_method method)
+{	/* Arcturus hook: forwards to the shared gfx v9 trigger with fixed limits */
+	return kgd_gfx_v9_trigger_pc_sample_trap(adev, vmid, 10, 4,	/* 10 = max_wave_slot; 4 presumably the SIMD limit - TODO confirm */
+					target_simd, target_wave_slot, method);
+}
+
 const struct kfd2kgd_calls arcturus_kfd2kgd = {
 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
 	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -418,5 +429,6 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
-	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
+	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
+	.trigger_pc_sample_trap = kgd_arcturus_trigger_pc_sample_trap
 };
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 15/24] drm/amdkfd: trigger pc sampling trap for aldebaran
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (13 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 14/24] drm/amdkfd: trigger pc sampling trap for arcturus James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-10 19:08   ` Yat Sin, David
  2023-11-03 13:11 ` [PATCH 16/24] drm/amdkfd: use bit operation set debug trap James Zhu
                   ` (9 subsequent siblings)
  24 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Implement trigger pc sampling trap for aldebaran.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index aff08321e976..27eda75ceecb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -163,6 +163,16 @@ static uint32_t kgd_gfx_aldebaran_set_address_watch(
 	return watch_address_cntl;
 }
 
+static uint32_t kgd_aldebaran_trigger_pc_sample_trap(struct amdgpu_device *adev,
+					    uint32_t vmid,
+					    uint32_t *target_simd,
+					    uint32_t *target_wave_slot,
+					    enum kfd_ioctl_pc_sample_method method)
+{	/* Aldebaran hook: forwards to the shared gfx v9 trigger with fixed limits */
+	return kgd_gfx_v9_trigger_pc_sample_trap(adev, vmid, 8, 4,	/* 8 = max_wave_slot; 4 presumably the SIMD limit - TODO confirm */
+					target_simd, target_wave_slot, method);
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
 	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -191,4 +201,5 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
+	.trigger_pc_sample_trap = kgd_aldebaran_trigger_pc_sample_trap,
 };
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 16/24] drm/amdkfd: use bit operation set debug trap
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (14 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 15/24] drm/amdkfd: trigger pc sampling trap for aldebaran James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-10 19:08   ` Yat Sin, David
  2023-11-03 13:11 ` [PATCH 17/24] drm/amdkfd: add setting trap pc sampling flag James Zhu
                   ` (8 subsequent siblings)
  24 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

The 1st-level TMA's 2nd byte is used for the trap type setting;
use bit operations so that only the selected bit is changed.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index fbf053001af9..a0b729c65a7c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1434,13 +1434,23 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
 	return true;
 }
 
+/* Bit offsets of the trap-type flags kept in word tma[2] of the 1st-level TMA */
+enum KFD_TRAP_TYPE_BIT {
+	KFD_TRAP_TYPE_DEBUG = 0,		/* bit 0 for debug trap */
+	KFD_TRAP_TYPE_HOST,			/* bit 1 for host trap */
+	KFD_TRAP_TYPE_STOCHASTIC,		/* bit 2 for stochastic trap */
+};
+
 void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
 				     bool enabled)
 {
 	if (qpd->cwsr_kaddr) {
-		uint64_t *tma =
-			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
-		tma[2] = enabled;
+		volatile unsigned long *tma =	/* NOTE(review): tma[2] matches the old uint64_t index only where unsigned long is 64-bit */
+			(volatile unsigned long *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
+		if (enabled)
+			set_bit(KFD_TRAP_TYPE_DEBUG, &tma[2]);	/* changes the debug bit only; other bits of tma[2] are left as they are */
+		else
+			clear_bit(KFD_TRAP_TYPE_DEBUG, &tma[2]);
 	}
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 17/24] drm/amdkfd: add setting trap pc sampling flag
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (15 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 16/24] drm/amdkfd: use bit operation set debug trap James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-10 19:07   ` Yat Sin, David
  2023-11-03 13:11 ` [PATCH 18/24] drm/amdkfd: enable pc sampling start James Zhu
                   ` (7 subsequent siblings)
  24 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Add setting trap pc sampling flag.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 13 +++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 642558026d16..6670534f47b8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1186,6 +1186,8 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
 				  uint64_t tma_addr);
 void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
 				     bool enabled);
+void kfd_process_set_trap_pc_sampling_flag(struct qcm_process_device *qpd,
+				     enum kfd_ioctl_pc_sample_method method, bool enabled);
 
 /* CWSR initialization */
 int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a0b729c65a7c..d22d804f180d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1454,6 +1454,19 @@ void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
 	}
 }
 
+void kfd_process_set_trap_pc_sampling_flag(struct qcm_process_device *qpd,
+				     enum kfd_ioctl_pc_sample_method method, bool enabled)
+{	/* Set or clear bit number 'method' in tma[2]; no-op when qpd->cwsr_kaddr is not set */
+	if (qpd->cwsr_kaddr) {
+		volatile unsigned long *tma =
+			(volatile unsigned long *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
+		if (enabled)
+			set_bit(method, &tma[2]);	/* NOTE(review): enum value is used directly as the bit index - confirm it matches the trap-type bit layout */
+		else
+			clear_bit(method, &tma[2]);
+	}
+}
+
 /*
  * On return the kfd_process is fully operational and will be freed when the
  * mm is released
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 18/24] drm/amdkfd: enable pc sampling start
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (16 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 17/24] drm/amdkfd: add setting trap pc sampling flag James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-22 22:27   ` Felix Kuehling
  2023-11-03 13:11 ` [PATCH 19/24] drm/amdkfd: enable pc sampling stop James Zhu
                   ` (6 subsequent siblings)
  24 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Enable pc sampling start.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 26 +++++++++++++++++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  2 ++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index 60b29b245db5..33d003ca0093 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -83,9 +83,29 @@ static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
 	return 0;
 }
 
-static int kfd_pc_sample_start(struct kfd_process_device *pdd)
+static int kfd_pc_sample_start(struct kfd_process_device *pdd,
+					struct pc_sampling_entry *pcs_entry)
 {
-	return -EINVAL;
+	bool pc_sampling_start = false;	/* becomes true when this call raises active_count from 0 */
+
+	pcs_entry->enabled = true;	/* NOTE(review): written before pcs_data.mutex is taken */
+	mutex_lock(&pdd->dev->pcs_data.mutex);
+	if (!pdd->dev->pcs_data.hosttrap_entry.base.active_count)
+		pc_sampling_start = true;
+	pdd->dev->pcs_data.hosttrap_entry.base.active_count++;
+	mutex_unlock(&pdd->dev->pcs_data.mutex);
+
+	while (pc_sampling_start) {	/* first active session only: wait while stop_enable is set, then set the TMA flag */
+		if (READ_ONCE(pdd->dev->pcs_data.hosttrap_entry.base.stop_enable)) {
+			usleep_range(1000, 2000);	/* sleep 1-2 ms, then re-check stop_enable */
+		} else {
+			kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,	/* NOTE(review): flag is per-qpd but active_count is per-device - confirm later sessions get flagged too */
+				pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, true);
+			break;
+		}
+	}
+
+	return 0;
 }
 
 static int kfd_pc_sample_stop(struct kfd_process_device *pdd)
@@ -225,7 +245,7 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
 		if (pcs_entry->enabled)
 			return -EALREADY;
 		else
-			return kfd_pc_sample_start(pdd);
+			return kfd_pc_sample_start(pdd, pcs_entry);
 
 	case KFD_IOCTL_PCS_OP_STOP:
 		if (!pcs_entry->enabled)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 6670534f47b8..613910e0d440 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -258,6 +258,8 @@ struct kfd_dev;
 
 struct kfd_dev_pc_sampling_data {
 	uint32_t use_count;         /* Num of PC sampling sessions */
+	uint32_t active_count;      /* Num of active sessions */
+	bool stop_enable;           /* pc sampling stop in process */
 	struct idr pc_sampling_idr;
 	struct kfd_pc_sample_info pc_sample_info;
 };
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 19/24] drm/amdkfd: enable pc sampling stop
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (17 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 18/24] drm/amdkfd: enable pc sampling start James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-10 19:07   ` Yat Sin, David
  2023-11-03 13:11 ` [PATCH 20/24] drm/amdkfd: enable pc sampling work to trigger trap James Zhu
                   ` (5 subsequent siblings)
  24 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Enable pc sampling stop.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 28 +++++++++++++++++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  2 ++
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index 33d003ca0093..2c4ac5b4cc4b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -108,10 +108,32 @@ static int kfd_pc_sample_start(struct kfd_process_device *pdd,
 	return 0;
 }
 
-static int kfd_pc_sample_stop(struct kfd_process_device *pdd)
+static int kfd_pc_sample_stop(struct kfd_process_device *pdd,
+					struct pc_sampling_entry *pcs_entry)
 {
-	return -EINVAL;
+	bool pc_sampling_stop = false;	/* becomes true when this call drops active_count to 0 */
+
+	pcs_entry->enabled = false;
+	mutex_lock(&pdd->dev->pcs_data.mutex);
+	pdd->dev->pcs_data.hosttrap_entry.base.active_count--;
+	if (!pdd->dev->pcs_data.hosttrap_entry.base.active_count) {
+		WRITE_ONCE(pdd->dev->pcs_data.hosttrap_entry.base.stop_enable, true);	/* kfd_pc_sample_start() polls this flag and waits while it is set */
+		pc_sampling_stop = true;
+	}
+	mutex_unlock(&pdd->dev->pcs_data.mutex);
 
+	if (pc_sampling_stop) {
+		kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,	/* clear this qpd's sampling bit outside the mutex */
+			pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
+
+		mutex_lock(&pdd->dev->pcs_data.mutex);
+		pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;	/* reset the trap target fields */
+		pdd->dev->pcs_data.hosttrap_entry.base.target_wave_slot = 0;
+		WRITE_ONCE(pdd->dev->pcs_data.hosttrap_entry.base.stop_enable, false);	/* stop sequence done */
+		mutex_unlock(&pdd->dev->pcs_data.mutex);
+	}
+
+	return 0;
 }
 
 static int kfd_pc_sample_create(struct kfd_process_device *pdd,
@@ -251,7 +273,7 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
 		if (!pcs_entry->enabled)
 			return -EALREADY;
 		else
-			return kfd_pc_sample_stop(pdd);
+			return kfd_pc_sample_stop(pdd, pcs_entry);
 	}
 
 	return -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 613910e0d440..badaa4d68cc4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -259,6 +259,8 @@ struct kfd_dev;
 struct kfd_dev_pc_sampling_data {
 	uint32_t use_count;         /* Num of PC sampling sessions */
 	uint32_t active_count;      /* Num of active sessions */
+	uint32_t target_simd;       /* target simd for trap */
+	uint32_t target_wave_slot;  /* target wave slot for trap */
 	bool stop_enable;           /* pc sampling stop in process */
 	struct idr pc_sampling_idr;
 	struct kfd_pc_sample_info pc_sample_info;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 20/24] drm/amdkfd: enable pc sampling work to trigger trap
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (18 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 19/24] drm/amdkfd: enable pc sampling stop James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-22 22:31   ` Felix Kuehling
  2023-11-03 13:11 ` [PATCH 21/24] drm/amdkfd: add queue remapping James Zhu
                   ` (4 subsequent siblings)
  24 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Enable a work item to trigger the pc sampling trap.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c      |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 39 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  1 +
 4 files changed, 44 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index bcaeedac8fe0..fb21902e433a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -35,6 +35,7 @@
 #include "kfd_migrate.h"
 #include "amdgpu.h"
 #include "amdgpu_xcp.h"
+#include "kfd_pc_sampling.h"
 
 #define MQD_SIZE_ALIGNED 768
 
@@ -537,6 +538,8 @@ static void kfd_pc_sampling_init(struct kfd_node *dev)
 {
 	mutex_init(&dev->pcs_data.mutex);
 	idr_init_base(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr, 1);
+	INIT_WORK(&dev->pcs_data.hosttrap_entry.base.pc_sampling_work,
+		kfd_pc_sample_handler);
 }
 
 static void kfd_pc_sampling_exit(struct kfd_node *dev)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index 2c4ac5b4cc4b..e8f0559b618e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -38,6 +38,43 @@ struct supported_pc_sample_info supported_formats[] = {
 	{ IP_VERSION(9, 4, 2), &sample_info_hosttrap_9_0_0 },
 };
 
+void kfd_pc_sample_handler(struct work_struct *work)
+{
+	struct amdgpu_device *adev;
+	struct kfd_node *node;
+	uint32_t timeout = 0;
+
+	node = container_of(work, struct kfd_node,
+					pcs_data.hosttrap_entry.base.pc_sampling_work);
+
+	mutex_lock(&node->pcs_data.mutex);
+	if (node->pcs_data.hosttrap_entry.base.active_count &&
+		node->pcs_data.hosttrap_entry.base.pc_sample_info.value &&
+		node->kfd2kgd->trigger_pc_sample_trap) {
+		switch (node->pcs_data.hosttrap_entry.base.pc_sample_info.type) {
+		case KFD_IOCTL_PCS_TYPE_TIME_US:
+			timeout = (uint32_t)node->pcs_data.hosttrap_entry.base.pc_sample_info.value;
+			break;
+		default:
+			pr_debug("PC Sampling type %d not supported.",
+					node->pcs_data.hosttrap_entry.base.pc_sample_info.type);
+		}
+	}
+	mutex_unlock(&node->pcs_data.mutex);
+	if (!timeout)
+		return;
+
+	adev = node->adev;
+	while (!READ_ONCE(node->pcs_data.hosttrap_entry.base.stop_enable)) {
+		node->kfd2kgd->trigger_pc_sample_trap(adev, node->vm_info.last_vmid_kfd,
+				&node->pcs_data.hosttrap_entry.base.target_simd,
+				&node->pcs_data.hosttrap_entry.base.target_wave_slot,
+				node->pcs_data.hosttrap_entry.base.pc_sample_info.method);
+		pr_debug_ratelimited("triggered a host trap.");
+		usleep_range(timeout, timeout + 10);
+	}
+}
+
 static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
 					struct kfd_ioctl_pc_sample_args __user *user_args)
 {
@@ -101,6 +138,7 @@ static int kfd_pc_sample_start(struct kfd_process_device *pdd,
 		} else {
 			kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
 				pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, true);
+			schedule_work(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
 			break;
 		}
 	}
@@ -123,6 +161,7 @@ static int kfd_pc_sample_stop(struct kfd_process_device *pdd,
 	mutex_unlock(&pdd->dev->pcs_data.mutex);
 
 	if (pc_sampling_stop) {
+		cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
 		kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
 			pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
index 4eeded4ea5b6..cb93909e6bd3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
@@ -30,5 +30,6 @@
 
 int kfd_pc_sample(struct kfd_process_device *pdd,
 					struct kfd_ioctl_pc_sample_args __user *args);
+void kfd_pc_sample_handler(struct work_struct *work);
 
 #endif /* KFD_PC_SAMPLING_H_ */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index badaa4d68cc4..b7062033fda4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -263,6 +263,7 @@ struct kfd_dev_pc_sampling_data {
 	uint32_t target_wave_slot;  /* target wave slot for trap */
 	bool stop_enable;           /* pc sampling stop in process */
 	struct idr pc_sampling_idr;
+	struct work_struct pc_sampling_work;
 	struct kfd_pc_sample_info pc_sample_info;
 };
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 21/24] drm/amdkfd: add queue remapping
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (19 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 20/24] drm/amdkfd: enable pc sampling work to trigger trap James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-22 22:35   ` Felix Kuehling
  2023-11-03 13:11 ` [PATCH 22/24] drm/amdkfd: add pc sampling release when process release James Zhu
                   ` (3 subsequent siblings)
  24 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Add queue remapping to force the waves in any running
processes to complete a CWSR trap.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 11 +++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  5 +++++
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c          |  3 +++
 3 files changed, 19 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c0e71543389a..a3f57be63f4f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -3155,6 +3155,17 @@ int debug_refresh_runlist(struct device_queue_manager *dqm)
 	return debug_map_and_unlock(dqm);
 }
 
+void remap_queue(struct device_queue_manager *dqm,
+				enum kfd_unmap_queues_filter filter,
+				uint32_t filter_param,
+				uint32_t grace_period)
+{
+	dqm_lock(dqm);
+	if (!dqm->dev->kfd->shared_resources.enable_mes)
+		execute_queues_cpsch(dqm, filter, filter_param, grace_period);
+	dqm_unlock(dqm);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index cf7e182588f8..f8aae3747a36 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -303,6 +303,11 @@ int debug_lock_and_unmap(struct device_queue_manager *dqm);
 int debug_map_and_unlock(struct device_queue_manager *dqm);
 int debug_refresh_runlist(struct device_queue_manager *dqm);
 
+void remap_queue(struct device_queue_manager *dqm,
+				enum kfd_unmap_queues_filter filter,
+				uint32_t filter_param,
+				uint32_t grace_period);
+
 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
 	return (pdd->lds_base >> 16) & 0xFF;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index e8f0559b618e..66670cdb813a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -24,6 +24,7 @@
 #include "kfd_priv.h"
 #include "amdgpu_amdkfd.h"
 #include "kfd_pc_sampling.h"
+#include "kfd_device_queue_manager.h"
 
 struct supported_pc_sample_info {
 	uint32_t ip_version;
@@ -164,6 +165,8 @@ static int kfd_pc_sample_stop(struct kfd_process_device *pdd,
 		cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
 		kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
 			pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
+		remap_queue(pdd->dev->dqm,
+			KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
 
 		mutex_lock(&pdd->dev->pcs_data.mutex);
 		pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 22/24] drm/amdkfd: add pc sampling release when process release
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (20 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 21/24] drm/amdkfd: add queue remapping James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-10 19:08   ` Yat Sin, David
  2023-11-20 16:23   ` [PATCH v2 " James Zhu
  2023-11-03 13:11 ` [PATCH 23/24] drm/amdkfd: add pc sampling capability check James Zhu
                   ` (2 subsequent siblings)
  24 siblings, 2 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Add pc sampling release on process release; it forces all active
sessions of this process to stop.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 26 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c     |  3 +++
 3 files changed, 30 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index 66670cdb813a..00d8d3f400a9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -274,6 +274,32 @@ static int kfd_pc_sample_destroy(struct kfd_process_device *pdd, uint32_t trace_
 	return 0;
 }
 
+void kfd_pc_sample_release(struct kfd_process_device *pdd)
+{
+	struct pc_sampling_entry *pcs_entry;
+	struct idr *idp;
+	uint32_t id;
+
+	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+		pr_err("PC Sampling does not support sched_policy %i", sched_policy);
+		return;
+	}
+
+	/* force to release all PC sampling task for this process */
+	idp = &pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr;
+	mutex_lock(&pdd->dev->pcs_data.mutex);
+	idr_for_each_entry(idp, pcs_entry, id) {
+		if (pcs_entry->pdd != pdd)
+			continue;
+		mutex_unlock(&pdd->dev->pcs_data.mutex);
+		if (pcs_entry->enabled)
+			kfd_pc_sample_stop(pdd, pcs_entry);
+		kfd_pc_sample_destroy(pdd, id, pcs_entry);
+		mutex_lock(&pdd->dev->pcs_data.mutex);
+	}
+	mutex_unlock(&pdd->dev->pcs_data.mutex);
+}
+
 int kfd_pc_sample(struct kfd_process_device *pdd,
 					struct kfd_ioctl_pc_sample_args __user *args)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
index cb93909e6bd3..4ea064fdaa98 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
@@ -30,6 +30,7 @@
 
 int kfd_pc_sample(struct kfd_process_device *pdd,
 					struct kfd_ioctl_pc_sample_args __user *args);
+void kfd_pc_sample_release(struct kfd_process_device *pdd);
 void kfd_pc_sample_handler(struct work_struct *work);
 
 #endif /* KFD_PC_SAMPLING_H_ */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index d22d804f180d..54f3db7eaae2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -43,6 +43,7 @@ struct mm_struct;
 #include "kfd_svm.h"
 #include "kfd_smi_events.h"
 #include "kfd_debug.h"
+#include "kfd_pc_sampling.h"
 
 /*
  * List of struct kfd_process (field kfd_process).
@@ -1020,6 +1021,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
 		pr_debug("Releasing pdd (topology id %d) for process (pasid 0x%x)\n",
 				pdd->dev->id, p->pasid);
 
+		kfd_pc_sample_release(pdd);
+
 		kfd_process_device_destroy_cwsr_dgpu(pdd);
 		kfd_process_device_destroy_ib_mem(pdd);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 23/24] drm/amdkfd: add pc sampling capability check
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (21 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 22/24] drm/amdkfd: add pc sampling release when process release James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-22 22:40   ` Felix Kuehling
  2023-11-03 13:11 ` [PATCH 24/24] drm/amdkfd: bump kfd ioctl minor version for pc sampling availability James Zhu
  2023-11-16 14:51 ` [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
  24 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

From: David Yat Sin <david.yatsin@amd.com>

Add pc sampling capability check.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 10 +++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    | 13 +++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index b00390e451bf..5e47e374d824 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -3259,7 +3259,7 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 			kfd_ioctl_set_debug_trap, 0),
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_PC_SAMPLE,
-			kfd_ioctl_pc_sample, 0),
+			kfd_ioctl_pc_sample, KFD_IOC_FLAG_PERFMON),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
@@ -3336,6 +3336,14 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 		}
 	}
 
+	/* PC Sampling Monitor */
+	if (unlikely(ioctl->flags & KFD_IOC_FLAG_PERFMON)) {
+		if (!capable(CAP_PERFMON) && !capable(CAP_SYS_ADMIN)) {
+			retcode = -EACCES;
+			goto err_i1;
+		}
+	}
+
 	if (cmd & (IOC_IN | IOC_OUT)) {
 		if (asize <= sizeof(stack_kdata)) {
 			kdata = stack_kdata;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index b7062033fda4..236d3de85153 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -144,6 +144,19 @@ enum kfd_ioctl_flags {
 	 * we also allow ioctls with SYS_ADMIN capability.
 	 */
 	KFD_IOC_FLAG_CHECKPOINT_RESTORE = BIT(0),
+
+	/*
+	 * @KFD_IOC_FLAG_PERFMON:
+	 * Performance monitoring feature, GPU performance monitoring can allow users
+	 * to gather some information about other processes. PC sampling can allow
+	 * users to infer information about wavefronts from other processes that are
+	 * running on the same CUs, such as which execution units they are using. As
+	 * such, this type of performance monitoring should be protected and only
+	 * available to users with sufficient capabilities: either CAP_PERFMON, or,
+	 * for backwards compatibility, CAP_SYS_ADMIN.
+	 */
+
+	KFD_IOC_FLAG_PERFMON = BIT(1),
 };
 /*
  * Kernel module parameter to specify maximum number of supported queues per
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH 24/24] drm/amdkfd: bump kfd ioctl minor version for pc sampling availability
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (22 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 23/24] drm/amdkfd: add pc sampling capability check James Zhu
@ 2023-11-03 13:11 ` James Zhu
  2023-11-16 14:51 ` [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
  24 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-03 13:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

    Bump the minor version to declare that the pc sampling feature is
    now available.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 include/uapi/linux/kfd_ioctl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 5202e29c9560..682ed27134af 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -40,9 +40,10 @@
  * - 1.12 - Add DMA buf export ioctl
  * - 1.13 - Add debugger API
  * - 1.14 - Update kfd_event_data
+ * - 1.15 - Add PC Sampling ioctl
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 14
+#define KFD_IOCTL_MINOR_VERSION 15
 
 struct kfd_ioctl_get_version_args {
 	__u32 major_version;	/* from KFD */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* RE: [PATCH 03/24] drm/amdkfd: enable pc sampling query
  2023-11-03 13:11 ` [PATCH 03/24] drm/amdkfd: enable pc sampling query James Zhu
@ 2023-11-10 19:04   ` Yat Sin, David
  2023-11-20 15:34   ` [PATCH v2 " James Zhu
  1 sibling, 0 replies; 80+ messages in thread
From: Yat Sin, David @ 2023-11-10 19:04 UTC (permalink / raw)
  To: Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[AMD Official Use Only - General]

> -----Original Message-----
> From: Zhu, James <James.Zhu@amd.com>
> Sent: Friday, November 3, 2023 9:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph
> <Joseph.Greathouse@amd.com>; Yat Sin, David <David.YatSin@amd.com>; Zhu,
> James <James.Zhu@amd.com>
> Subject: [PATCH 03/24] drm/amdkfd: enable pc sampling query
>
> From: David Yat Sin <david.yatsin@amd.com>
>
> Enable pc sampling to query system capability.
>
> Co-developed-by: James Zhu <James.Zhu@amd.com>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 54
> +++++++++++++++++++-
>  1 file changed, 53 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> index a7e78ff42d07..49fecbc7013e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> @@ -25,10 +25,62 @@
>  #include "amdgpu_amdkfd.h"
>  #include "kfd_pc_sampling.h"
>
> +struct supported_pc_sample_info {
> +     uint32_t ip_version;
> +     const struct kfd_pc_sample_info *sample_info; };
> +
> +const struct kfd_pc_sample_info sample_info_hosttrap_9_0_0 = {
> +     0, 1, ~0ULL, 0, KFD_IOCTL_PCS_METHOD_HOSTTRAP,
> +KFD_IOCTL_PCS_TYPE_TIME_US };
> +
> +struct supported_pc_sample_info supported_formats[] = {
> +     { IP_VERSION(9, 4, 1), &sample_info_hosttrap_9_0_0 },
> +     { IP_VERSION(9, 4, 2), &sample_info_hosttrap_9_0_0 }, };
Nitpick: the "};" needs to be on the next line.

Regards,
David

> +
>  static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
>                                       struct kfd_ioctl_pc_sample_args __user
> *user_args)  {
> -     return -EINVAL;
> +     uint64_t sample_offset;
> +     int num_method = 0;
> +     int i;
> +
> +     for (i = 0; i < ARRAY_SIZE(supported_formats); i++)
> +             if (KFD_GC_VERSION(pdd->dev) ==
> supported_formats[i].ip_version)
> +                     num_method++;
> +
> +     if (!num_method) {
> +             pr_debug("PC Sampling not supported on GC_HWIP:0x%x.",
> +                     pdd->dev->adev->ip_versions[GC_HWIP][0]);
> +             return -EOPNOTSUPP;
> +     }
> +
> +     if (!user_args->sample_info_ptr) {
> +             user_args->num_sample_info = num_method;
> +             return 0;
> +     }
> +
> +     if (user_args->num_sample_info < num_method) {
> +             user_args->num_sample_info = num_method;
> +             pr_debug("Sample info buffer is not large enough, "
> +                      "ASIC requires space for %d kfd_pc_sample_info
> entries.", num_method);
> +             return -ENOSPC;
> +     }
> +
> +     sample_offset = user_args->sample_info_ptr;
> +     for (i = 0; i < ARRAY_SIZE(supported_formats); i++) {
> +             if (KFD_GC_VERSION(pdd->dev) ==
> supported_formats[i].ip_version) {
> +                     int ret = copy_to_user((void __user *) sample_offset,
> +                             supported_formats[i].sample_info,
> sizeof(struct kfd_pc_sample_info));
> +                     if (ret) {
> +                             pr_debug("Failed to copy PC sampling info to
> user.");
> +                             return -EFAULT;
> +                     }
> +                     sample_offset += sizeof(struct kfd_pc_sample_info);
> +             }
> +     }
> +
> +     return 0;
>  }
>
>  static int kfd_pc_sample_start(struct kfd_process_device *pdd)
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH 19/24] drm/amdkfd: enable pc sampling stop
  2023-11-03 13:11 ` [PATCH 19/24] drm/amdkfd: enable pc sampling stop James Zhu
@ 2023-11-10 19:07   ` Yat Sin, David
  2023-11-13 15:19     ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Yat Sin, David @ 2023-11-10 19:07 UTC (permalink / raw)
  To: Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[AMD Official Use Only - General]

> -----Original Message-----
> From: Zhu, James <James.Zhu@amd.com>
> Sent: Friday, November 3, 2023 9:12 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph
> <Joseph.Greathouse@amd.com>; Yat Sin, David <David.YatSin@amd.com>; Zhu,
> James <James.Zhu@amd.com>
> Subject: [PATCH 19/24] drm/amdkfd: enable pc sampling stop
>
> Enable pc sampling stop.
>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 28 +++++++++++++++++--
> -
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  2 ++
>  2 files changed, 27 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> index 33d003ca0093..2c4ac5b4cc4b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> @@ -108,10 +108,32 @@ static int kfd_pc_sample_start(struct
> kfd_process_device *pdd,
>       return 0;
>  }
>
> -static int kfd_pc_sample_stop(struct kfd_process_device *pdd)
> +static int kfd_pc_sample_stop(struct kfd_process_device *pdd,
> +                                     struct pc_sampling_entry *pcs_entry)
>  {
> -     return -EINVAL;
> +     bool pc_sampling_stop = false;
> +
> +     pcs_entry->enabled = false;
> +     mutex_lock(&pdd->dev->pcs_data.mutex);
For the START/STOP/DESTROY ioctls, I think we can hold pdd->dev->pcs_data.mutex through the whole IOCTL. Then we would not have to deal with the intermediate states where the START/STOP/DESTROY are happening at the same time.

> +     pdd->dev->pcs_data.hosttrap_entry.base.active_count--;
> +     if (!pdd->dev->pcs_data.hosttrap_entry.base.active_count) {
> +             WRITE_ONCE(pdd->dev-
> >pcs_data.hosttrap_entry.base.stop_enable, true);
> +             pc_sampling_stop = true;
> +     }
> +     mutex_unlock(&pdd->dev->pcs_data.mutex);
>
> +     if (pc_sampling_stop) {
> +             kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
> +                     pdd->dev-
> >pcs_data.hosttrap_entry.base.pc_sample_info.method,
> +false);
> +
> +             mutex_lock(&pdd->dev->pcs_data.mutex);
> +             pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;
> +             pdd->dev->pcs_data.hosttrap_entry.base.target_wave_slot = 0;
> +             WRITE_ONCE(pdd->dev-
> >pcs_data.hosttrap_entry.base.stop_enable, false);
> +             mutex_unlock(&pdd->dev->pcs_data.mutex);
> +     }
> +
> +     return 0;
>  }
>
>  static int kfd_pc_sample_create(struct kfd_process_device *pdd, @@ -251,7
> +273,7 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
>               if (!pcs_entry->enabled)
>                       return -EALREADY;
>               else
> -                     return kfd_pc_sample_stop(pdd);
> +                     return kfd_pc_sample_stop(pdd, pcs_entry);
>       }
>
>       return -EINVAL;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 613910e0d440..badaa4d68cc4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -259,6 +259,8 @@ struct kfd_dev;
>  struct kfd_dev_pc_sampling_data {
>       uint32_t use_count;         /* Num of PC sampling sessions */
>       uint32_t active_count;      /* Num of active sessions */
> +     uint32_t target_simd;       /* target simd for trap */
> +     uint32_t target_wave_slot;  /* target wave slot for trap */
>       bool stop_enable;           /* pc sampling stop in process */
>       struct idr pc_sampling_idr;
>       struct kfd_pc_sample_info pc_sample_info;
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH 17/24] drm/amdkfd: add setting trap pc sampling flag
  2023-11-03 13:11 ` [PATCH 17/24] drm/amdkfd: add setting trap pc sampling flag James Zhu
@ 2023-11-10 19:07   ` Yat Sin, David
  0 siblings, 0 replies; 80+ messages in thread
From: Yat Sin, David @ 2023-11-10 19:07 UTC (permalink / raw)
  To: Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[AMD Official Use Only - General]

I would recommend merging this with patch 16, but up to you.

> -----Original Message-----
> From: Zhu, James <James.Zhu@amd.com>
> Sent: Friday, November 3, 2023 9:12 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph
> <Joseph.Greathouse@amd.com>; Yat Sin, David <David.YatSin@amd.com>; Zhu,
> James <James.Zhu@amd.com>
> Subject: [PATCH 17/24] drm/amdkfd: add setting trap pc sampling flag
>
> Add setting trap pc sampling flag.
>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  2 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 13 +++++++++++++
>  2 files changed, 15 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 642558026d16..6670534f47b8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1186,6 +1186,8 @@ void kfd_process_set_trap_handler(struct
> qcm_process_device *qpd,
>                                 uint64_t tma_addr);
>  void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
>                                    bool enabled);
> +void kfd_process_set_trap_pc_sampling_flag(struct qcm_process_device *qpd,
> +                                  enum kfd_ioctl_pc_sample_method method,
> bool enabled);
>
>  /* CWSR initialization */
>  int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index a0b729c65a7c..d22d804f180d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1454,6 +1454,19 @@ void kfd_process_set_trap_debug_flag(struct
> qcm_process_device *qpd,
>       }
>  }
>
> +void kfd_process_set_trap_pc_sampling_flag(struct qcm_process_device *qpd,
> +                                  enum kfd_ioctl_pc_sample_method method,
> bool enabled) {
> +     if (qpd->cwsr_kaddr) {
> +             volatile unsigned long *tma =
> +                     (volatile unsigned long *)(qpd->cwsr_kaddr +
> KFD_CWSR_TMA_OFFSET);
> +             if (enabled)
> +                     set_bit(method, &tma[2]);
> +             else
> +                     clear_bit(method, &tma[2]);
> +     }
> +}
> +
>  /*
>   * On return the kfd_process is fully operational and will be freed when the
>   * mm is released
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH 13/24] drm/amdgpu: add sq host trap status check
  2023-11-03 13:11 ` [PATCH 13/24] drm/amdgpu: add sq host trap status check James Zhu
@ 2023-11-10 19:07   ` Yat Sin, David
  2023-11-20 16:16   ` [PATCH v2 " James Zhu
  1 sibling, 0 replies; 80+ messages in thread
From: Yat Sin, David @ 2023-11-10 19:07 UTC (permalink / raw)
  To: Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[AMD Official Use Only - General]

> -----Original Message-----
> From: Zhu, James <James.Zhu@amd.com>
> Sent: Friday, November 3, 2023 9:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph
> <Joseph.Greathouse@amd.com>; Yat Sin, David <David.YatSin@amd.com>; Zhu,
> James <James.Zhu@amd.com>
> Subject: [PATCH 13/24] drm/amdgpu: add sq host trap status check
>
> Before fire a new host trap, check the host trap status.
>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 35
> +++++++++++++++++++
>  .../amd/include/asic_reg/gc/gc_9_0_offset.h   |  2 ++
>  .../amd/include/asic_reg/gc/gc_9_0_sh_mask.h  |  5 +++
>  3 files changed, 42 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index 740d8a0c9252..2c5bbbb7e34e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -1146,6 +1146,35 @@ void
> kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
>       kgd_gfx_v9_unlock_srbm(adev, inst);
>  }
>
> +static uint32_t kgd_aldebaran_get_hosttrap_status(struct amdgpu_device
> +*adev) {
> +     uint32_t sq_hosttrap_status;
Initialize to 0. Some static code analyzers may raise a warning because it is possible to return an uninitialized value.

Regards,
David

> +     int i, j;
> +
> +     mutex_lock(&adev->grbm_idx_mutex);
> +     for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
> +             for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
> +                     amdgpu_gfx_select_se_sh(adev, i, j, 0xffffffff, 0);
> +                     sq_hosttrap_status = RREG32_SOC15(GC, 0,
> mmSQ_HOSTTRAP_STATUS);
> +
> +                     if (sq_hosttrap_status &
> SQ_HOSTTRAP_STATUS__HTPENDING_OVERRIDE_MASK) {
> +                             WREG32_SOC15(GC, 0,
> mmSQ_HOSTTRAP_STATUS,
> +
>       SQ_HOSTTRAP_STATUS__HTPENDING_OVERRIDE_MASK);
> +                             sq_hosttrap_status = 0x0;
> +                             continue;
> +                     }
> +                     if (sq_hosttrap_status)
> +                             goto out;
> +             }
> +     }
> +
> +out:
> +     amdgpu_gfx_select_se_sh(adev, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
> 0);
> +     mutex_unlock(&adev->grbm_idx_mutex);
> +
> +     return sq_hosttrap_status;
> +}
> +
>  uint32_t kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,
>                                           uint32_t vmid,
>                                           uint32_t max_wave_slot,
> @@ -1156,6 +1185,12 @@ uint32_t
> kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,  {
>       if (method == KFD_IOCTL_PCS_METHOD_HOSTTRAP) {
>               uint32_t value = 0;
> +             uint32_t sq_hosttrap_status;
> +
> +             sq_hosttrap_status = kgd_aldebaran_get_hosttrap_status(adev);
> +             /* skip when last host trap request is still pending to complete
> */
> +             if (sq_hosttrap_status)
> +                     return 0;
>
>               value = REG_SET_FIELD(value, SQ_CMD, CMD,
> SQ_IND_CMD_CMD_TRAP);
>               value = REG_SET_FIELD(value, SQ_CMD, MODE,
> SQ_IND_CMD_MODE_SINGLE); diff --git
> a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h
> b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h
> index 12d451e5475b..5b17d9066452 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h
> @@ -462,6 +462,8 @@
>  #define mmSQ_IND_DATA_BASE_IDX
> 0
>  #define mmSQ_CMD                                                                                       0x037b
>  #define mmSQ_CMD_BASE_IDX                                                                              0
> +#define mmSQ_HOSTTRAP_STATUS
> 0x0376
> +#define mmSQ_HOSTTRAP_STATUS_BASE_IDX
> 0
>  #define mmSQ_TIME_HI                                                                                   0x037c
>  #define mmSQ_TIME_HI_BASE_IDX                                                                          0
>  #define mmSQ_TIME_LO
> 0x037d
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
> b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
> index efc16ddf274a..3dfe4ab31421 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
> @@ -2616,6 +2616,11 @@
>  //SQ_CMD_TIMESTAMP
>  #define SQ_CMD_TIMESTAMP__TIMESTAMP__SHIFT
> 0x0
>  #define SQ_CMD_TIMESTAMP__TIMESTAMP_MASK
> 0x000000FFL
> +//SQ_HOSTTRAP_STATUS
> +#define SQ_HOSTTRAP_STATUS__HTPENDINGCOUNT__SHIFT
> 0x0
> +#define SQ_HOSTTRAP_STATUS__HTPENDING_OVERRIDE__SHIFT
> 0x8
> +#define SQ_HOSTTRAP_STATUS__HTPENDINGCOUNT_MASK
> 0x000000FFL
> +#define SQ_HOSTTRAP_STATUS__HTPENDING_OVERRIDE_MASK
> 0x00000100L
>  //SQ_IND_INDEX
>  #define SQ_IND_INDEX__WAVE_ID__SHIFT
> 0x0
>  #define SQ_IND_INDEX__SIMD_ID__SHIFT
> 0x4
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH 22/24] drm/amdkfd: add pc sampling release when process release
  2023-11-03 13:11 ` [PATCH 22/24] drm/amdkfd: add pc sampling release when process release James Zhu
@ 2023-11-10 19:08   ` Yat Sin, David
  2023-11-13 15:12     ` James Zhu
  2023-11-20 16:23   ` [PATCH v2 " James Zhu
  1 sibling, 1 reply; 80+ messages in thread
From: Yat Sin, David @ 2023-11-10 19:08 UTC (permalink / raw)
  To: Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[AMD Official Use Only - General]

> -----Original Message-----
> From: Zhu, James <James.Zhu@amd.com>
> Sent: Friday, November 3, 2023 9:12 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph
> <Joseph.Greathouse@amd.com>; Yat Sin, David <David.YatSin@amd.com>; Zhu,
> James <James.Zhu@amd.com>
> Subject: [PATCH 22/24] drm/amdkfd: add pc sampling release when process
> release
>
> Add pc sampling release when process release, it will force to stop all activate
> sessions with this process.
>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 26
> ++++++++++++++++++++  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |
> 1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c     |  3 +++
>  3 files changed, 30 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> index 66670cdb813a..00d8d3f400a9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> @@ -274,6 +274,32 @@ static int kfd_pc_sample_destroy(struct
> kfd_process_device *pdd, uint32_t trace_
>       return 0;
>  }
>
> +void kfd_pc_sample_release(struct kfd_process_device *pdd) {
> +     struct pc_sampling_entry *pcs_entry;
> +     struct idr *idp;
> +     uint32_t id;
> +
> +     if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> +             pr_err("PC Sampling does not support sched_policy %i",
> sched_policy);
> +             return;
> +     }
You do not need to check sched_policy here; it is already checked in kfd_ioctl_pc_sample(), so there cannot be a hosttrap session if the policy is NO_HWS.
> +
> +     /* force to release all PC sampling task for this process */
> +     idp = &pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr;
> +     mutex_lock(&pdd->dev->pcs_data.mutex);
> +     idr_for_each_entry(idp, pcs_entry, id) {
> +             if (pcs_entry->pdd != pdd)
> +                     continue;
> +             mutex_unlock(&pdd->dev->pcs_data.mutex);
Can we avoid releasing the mutex here and just tell the worker thread to exit by setting the stop_enable bit?
I find we have a lot of places where we acquire/release the mutex within loops, and this results in a
lot of extra states that we have to account for during the start/stop/destroy calls.
> +             if (pcs_entry->enabled)
> +                     kfd_pc_sample_stop(pdd, pcs_entry);
> +             kfd_pc_sample_destroy(pdd, id, pcs_entry);
> +             mutex_lock(&pdd->dev->pcs_data.mutex);
> +     }
> +     mutex_unlock(&pdd->dev->pcs_data.mutex);
> +}
> +
>  int kfd_pc_sample(struct kfd_process_device *pdd,
>                                       struct kfd_ioctl_pc_sample_args __user
> *args)  { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
> index cb93909e6bd3..4ea064fdaa98 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
> @@ -30,6 +30,7 @@
>
>  int kfd_pc_sample(struct kfd_process_device *pdd,
>                                       struct kfd_ioctl_pc_sample_args __user
> *args);
> +void kfd_pc_sample_release(struct kfd_process_device *pdd);
>  void kfd_pc_sample_handler(struct work_struct *work);
>
>  #endif /* KFD_PC_SAMPLING_H_ */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index d22d804f180d..54f3db7eaae2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -43,6 +43,7 @@ struct mm_struct;
>  #include "kfd_svm.h"
>  #include "kfd_smi_events.h"
>  #include "kfd_debug.h"
> +#include "kfd_pc_sampling.h"
>
>  /*
>   * List of struct kfd_process (field kfd_process).
> @@ -1020,6 +1021,8 @@ static void kfd_process_destroy_pdds(struct
> kfd_process *p)
>               pr_debug("Releasing pdd (topology id %d) for process (pasid
> 0x%x)\n",
>                               pdd->dev->id, p->pasid);
>
> +             kfd_pc_sample_release(pdd);
> +
>               kfd_process_device_destroy_cwsr_dgpu(pdd);
>               kfd_process_device_destroy_ib_mem(pdd);
>
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH 16/24] drm/amdkfd: use bit operation set debug trap
  2023-11-03 13:11 ` [PATCH 16/24] drm/amdkfd: use bit operation set debug trap James Zhu
@ 2023-11-10 19:08   ` Yat Sin, David
  2023-11-20 16:21     ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Yat Sin, David @ 2023-11-10 19:08 UTC (permalink / raw)
  To: Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[AMD Official Use Only - General]

> -----Original Message-----
> From: Zhu, James <James.Zhu@amd.com>
> Sent: Friday, November 3, 2023 9:12 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph
> <Joseph.Greathouse@amd.com>; Yat Sin, David <David.YatSin@amd.com>; Zhu,
> James <James.Zhu@amd.com>
> Subject: [PATCH 16/24] drm/amdkfd: use bit operation set debug trap
>
> 1st level TMA's 2nd byte which used for trap type setting, to use bit operation to
> change selected bit only.
>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 16 +++++++++++++---
>  1 file changed, 13 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index fbf053001af9..a0b729c65a7c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1434,13 +1434,23 @@ bool kfd_process_xnack_mode(struct kfd_process
> *p, bool supported)
>       return true;
>  }
>
> +/* bit offset in 1st-level TMA's 2nd byte which used for
> +KFD_TRAP_TYPE_BIT */ enum KFD_TRAP_TYPE_BIT {
Nitpick: add a new line after the comment, so the enum declaration starts on its own line.
> +     KFD_TRAP_TYPE_DEBUG = 0,                /* bit 0 for debug trap */
> +     KFD_TRAP_TYPE_HOST,
> +     KFD_TRAP_TYPE_STOCHASTIC,
> +};
> +
>  void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
>                                    bool enabled)
>  {
>       if (qpd->cwsr_kaddr) {
> -             uint64_t *tma =
> -                     (uint64_t *)(qpd->cwsr_kaddr +
> KFD_CWSR_TMA_OFFSET);
> -             tma[2] = enabled;
> +             volatile unsigned long *tma =
> +                     (volatile unsigned long *)(qpd->cwsr_kaddr +
> KFD_CWSR_TMA_OFFSET);
> +             if (enabled)
> +                     set_bit(KFD_TRAP_TYPE_DEBUG, &tma[2]);
> +             else
> +                     clear_bit(KFD_TRAP_TYPE_DEBUG, &tma[2]);
>       }
>  }
>
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH 15/24] drm/amdkfd: trigger pc sampling trap for aldebaran
  2023-11-03 13:11 ` [PATCH 15/24] drm/amdkfd: trigger pc sampling trap for aldebaran James Zhu
@ 2023-11-10 19:08   ` Yat Sin, David
  2023-11-20 16:19     ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Yat Sin, David @ 2023-11-10 19:08 UTC (permalink / raw)
  To: Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[AMD Official Use Only - General]

I would merge this with patch 14 of the series

> -----Original Message-----
> From: Zhu, James <James.Zhu@amd.com>
> Sent: Friday, November 3, 2023 9:12 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph
> <Joseph.Greathouse@amd.com>; Yat Sin, David <David.YatSin@amd.com>; Zhu,
> James <James.Zhu@amd.com>
> Subject: [PATCH 15/24] drm/amdkfd: trigger pc sampling trap for aldebaran
>
> Implement trigger pc sampling trap for aldebaran.
>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 11
> +++++++++++
>  1 file changed, 11 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index aff08321e976..27eda75ceecb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -163,6 +163,16 @@ static uint32_t
> kgd_gfx_aldebaran_set_address_watch(
>       return watch_address_cntl;
>  }
>
> +static uint32_t kgd_aldebaran_trigger_pc_sample_trap(struct amdgpu_device
> *adev,
> +                                         uint32_t vmid,
> +                                         uint32_t *target_simd,
> +                                         uint32_t *target_wave_slot,
> +                                         enum kfd_ioctl_pc_sample_method
> method) {
> +     return kgd_gfx_v9_trigger_pc_sample_trap(adev, vmid, 8, 4,
> +                                     target_simd, target_wave_slot,
> method); }
> +
>  const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>       .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>       .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> @@ -191,4 +201,5 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>       .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>       .build_grace_period_packet_info =
> kgd_gfx_v9_build_grace_period_packet_info,
>       .program_trap_handler_settings =
> kgd_gfx_v9_program_trap_handler_settings,
> +     .trigger_pc_sample_trap = kgd_aldebaran_trigger_pc_sample_trap,
>  };
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH 10/24] drm/amdkfd: trigger pc sampling trap for gfx v9
  2023-11-03 13:11 ` [PATCH 10/24] drm/amdkfd: trigger pc sampling trap for gfx v9 James Zhu
@ 2023-11-10 19:08   ` Yat Sin, David
  2023-11-20 16:05   ` [PATCH v2 " James Zhu
  1 sibling, 0 replies; 80+ messages in thread
From: Yat Sin, David @ 2023-11-10 19:08 UTC (permalink / raw)
  To: Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[AMD Official Use Only - General]

> -----Original Message-----
> From: Zhu, James <James.Zhu@amd.com>
> Sent: Friday, November 3, 2023 9:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph
> <Joseph.Greathouse@amd.com>; Yat Sin, David <David.YatSin@amd.com>; Zhu,
> James <James.Zhu@amd.com>
> Subject: [PATCH 10/24] drm/amdkfd: trigger pc sampling trap for gfx v9
>
> Implement trigger pc sampling trap for gfx v9.
>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 35
> +++++++++++++++++++  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> |  7 ++++
>  2 files changed, 42 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index 51011e8ee90d..723fef2d45d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -1146,6 +1146,41 @@ void
> kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
>       kgd_gfx_v9_unlock_srbm(adev, inst);
>  }
>
> +uint32_t kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,
> +                                         uint32_t vmid,
> +                                         uint32_t max_wave_slot,
> +                                         uint32_t max_simd,
> +                                         uint32_t *target_simd,
> +                                         uint32_t *target_wave_slot,
> +                                         enum kfd_ioctl_pc_sample_method
> method) {
Function should return void if it will always return 0.

> +     if (method == KFD_IOCTL_PCS_METHOD_HOSTTRAP) {
> +             uint32_t value = 0;
> +
> +             value = REG_SET_FIELD(value, SQ_CMD, CMD,
> SQ_IND_CMD_CMD_TRAP);
> +             value = REG_SET_FIELD(value, SQ_CMD, MODE,
> SQ_IND_CMD_MODE_SINGLE);
> +
> +             /* select *target_simd */
> +             value = REG_SET_FIELD(value, SQ_CMD, SIMD_ID, *target_simd);
> +             /* select *target_wave_slot */
> +             value = REG_SET_FIELD(value, SQ_CMD, WAVE_ID,
> (*target_wave_slot)++);
> +
> +             mutex_lock(&adev->grbm_idx_mutex);
> +             amdgpu_gfx_select_se_sh(adev, 0xFFFFFFFF, 0xFFFFFFFF,
> 0xFFFFFFFF, 0);
> +             WREG32_SOC15(GC, 0, mmSQ_CMD, value);
> +             mutex_unlock(&adev->grbm_idx_mutex);
> +
> +             *target_wave_slot %= max_wave_slot;
> +             if (!(*target_wave_slot)) {
> +                     (*target_simd)++;
> +                     *target_simd %= max_simd;
> +             }
> +     } else {
> +             pr_debug("PC Sampling method %d not supported.", method);
> +     }
> +     return 0;
> +}
> +
>  const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
>       .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>       .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index ce424615f59b..b47b926891a8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -101,3 +101,10 @@ void
> kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
>                                              uint32_t grace_period,
>                                              uint32_t *reg_offset,
>                                              uint32_t *reg_data);
> +uint32_t kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,
> +                                         uint32_t vmid,
> +                                         uint32_t max_wave_slot,
> +                                         uint32_t max_simd,
> +                                         uint32_t *target_simd,
> +                                         uint32_t *target_wave_slot,
> +                                         enum kfd_ioctl_pc_sample_method
> method);
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH 07/24] drm/amdkfd: check pcs_enrty valid
  2023-11-03 13:11 ` [PATCH 07/24] drm/amdkfd: check pcs_enrty valid James Zhu
@ 2023-11-10 19:09   ` Yat Sin, David
  2023-11-20 15:55   ` [PATCH v2 " James Zhu
  2023-11-22 22:15   ` [PATCH " Felix Kuehling
  2 siblings, 0 replies; 80+ messages in thread
From: Yat Sin, David @ 2023-11-10 19:09 UTC (permalink / raw)
  To: Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[AMD Official Use Only - General]

> -----Original Message-----
> From: Zhu, James <James.Zhu@amd.com>
> Sent: Friday, November 3, 2023 9:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph
> <Joseph.Greathouse@amd.com>; Yat Sin, David <David.YatSin@amd.com>; Zhu,
> James <James.Zhu@amd.com>
> Subject: [PATCH 07/24] drm/amdkfd: check pcs_enrty valid
>
> Check pcs_enrty valid for pc sampling ioctl.
Typo: pcs_entry
>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 30
> ++++++++++++++++++--
>  1 file changed, 27 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> index 4c9fc48e1a6a..36366c8847de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> @@ -179,6 +179,21 @@ static int kfd_pc_sample_destroy(struct
> kfd_process_device *pdd, uint32_t trace_  int kfd_pc_sample(struct
> kfd_process_device *pdd,
>                                       struct kfd_ioctl_pc_sample_args __user
> *args)  {
> +     struct pc_sampling_entry *pcs_entry;
> +
> +     if (args->op != KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES &&
> +             args->op != KFD_IOCTL_PCS_OP_CREATE) {
> +
> +             mutex_lock(&pdd->dev->pcs_data.mutex);
> +             pcs_entry = idr_find(&pdd->dev-
> >pcs_data.hosttrap_entry.base.pc_sampling_idr,
> +                             args->trace_id);
> +             mutex_unlock(&pdd->dev->pcs_data.mutex);
> +
> +             if (!pcs_entry ||
> +                     pcs_entry->pdd != pdd)
> +                     return -EINVAL;
> +     }
> +
>       switch (args->op) {
>       case KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES:
>               return kfd_pc_sample_query_cap(pdd, args); @@ -187,13
> +202,22 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
>               return kfd_pc_sample_create(pdd, args);
>
>       case KFD_IOCTL_PCS_OP_DESTROY:
> -             return kfd_pc_sample_destroy(pdd, args->trace_id);
> +             if (pcs_entry->enabled)
> +                     return -EBUSY;
> +             else
> +                     return kfd_pc_sample_destroy(pdd, args->trace_id);
>
>       case KFD_IOCTL_PCS_OP_START:
> -             return kfd_pc_sample_start(pdd);
> +             if (pcs_entry->enabled)
> +                     return -EALREADY;
> +             else
> +                     return kfd_pc_sample_start(pdd);
>
>       case KFD_IOCTL_PCS_OP_STOP:
> -             return kfd_pc_sample_stop(pdd);
> +             if (!pcs_entry->enabled)
> +                     return -EALREADY;
> +             else
> +                     return kfd_pc_sample_stop(pdd);
>       }
>
>       return -EINVAL;
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 22/24] drm/amdkfd: add pc sampling release when process release
  2023-11-10 19:08   ` Yat Sin, David
@ 2023-11-13 15:12     ` James Zhu
  2023-11-13 15:19       ` Yat Sin, David
  0 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-13 15:12 UTC (permalink / raw)
  To: Yat Sin, David, Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[-- Attachment #1: Type: text/plain, Size: 4509 bytes --]


On 2023-11-10 14:08, Yat Sin, David wrote:
> [AMD Official Use Only - General]
>
>> -----Original Message-----
>> From: Zhu, James<James.Zhu@amd.com>
>> Sent: Friday, November 3, 2023 9:12 AM
>> To:amd-gfx@lists.freedesktop.org
>> Cc: Kuehling, Felix<Felix.Kuehling@amd.com>; Greathouse, Joseph
>> <Joseph.Greathouse@amd.com>; Yat Sin, David<David.YatSin@amd.com>; Zhu,
>> James<James.Zhu@amd.com>
>> Subject: [PATCH 22/24] drm/amdkfd: add pc sampling release when process
>> release
>>
>> Add pc sampling release when process release, it will force to stop all activate
>> sessions with this process.
>>
>> Signed-off-by: James Zhu<James.Zhu@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 26
>> ++++++++++++++++++++  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |
>> 1 +
>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c     |  3 +++
>>   3 files changed, 30 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> index 66670cdb813a..00d8d3f400a9 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> @@ -274,6 +274,32 @@ static int kfd_pc_sample_destroy(struct
>> kfd_process_device *pdd, uint32_t trace_
>>        return 0;
>>   }
>>
>> +void kfd_pc_sample_release(struct kfd_process_device *pdd) {
>> +     struct pc_sampling_entry *pcs_entry;
>> +     struct idr *idp;
>> +     uint32_t id;
>> +
>> +     if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
>> +             pr_err("PC Sampling does not support sched_policy %i",
>> sched_policy);
>> +             return;
>> +     }
> You do not need to check the sched_policy here, already checked in kfd_ioctl_pc_sample(..) , so cannot have a hosttrap session if policy is NO_HWS.
[JZ] kfd_pc_sample_release is not called from kfd_ioctl_pc_sample. It is
called on the process teardown path (from kfd_process_destroy_pdds).
>> +
>> +     /* force to release all PC sampling task for this process */
>> +     idp = &pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr;
>> +     mutex_lock(&pdd->dev->pcs_data.mutex);
>> +     idr_for_each_entry(idp, pcs_entry, id) {
>> +             if (pcs_entry->pdd != pdd)
>> +                     continue;
>> +             mutex_unlock(&pdd->dev->pcs_data.mutex);
> Can we not release the mutex here and just tell the worker thread to exit by setting the stop_enable bit.
> I find we have a lot of places where we are acquiring/releasing the mutex within loops and this results in a
> lot of extra states that we have to account for during the start/stop/destroy calls.
>> +             if (pcs_entry->enabled)
>> +                     kfd_pc_sample_stop(pdd, pcs_entry);
>> +             kfd_pc_sample_destroy(pdd, id, pcs_entry);
>> +             mutex_lock(&pdd->dev->pcs_data.mutex);
>> +     }
>> +     mutex_unlock(&pdd->dev->pcs_data.mutex);
>> +}
>> +
>>   int kfd_pc_sample(struct kfd_process_device *pdd,
>>                                        struct kfd_ioctl_pc_sample_args __user
>> *args)  { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>> index cb93909e6bd3..4ea064fdaa98 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>> @@ -30,6 +30,7 @@
>>
>>   int kfd_pc_sample(struct kfd_process_device *pdd,
>>                                        struct kfd_ioctl_pc_sample_args __user
>> *args);
>> +void kfd_pc_sample_release(struct kfd_process_device *pdd);
>>   void kfd_pc_sample_handler(struct work_struct *work);
>>
>>   #endif /* KFD_PC_SAMPLING_H_ */
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index d22d804f180d..54f3db7eaae2 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -43,6 +43,7 @@ struct mm_struct;
>>   #include "kfd_svm.h"
>>   #include "kfd_smi_events.h"
>>   #include "kfd_debug.h"
>> +#include "kfd_pc_sampling.h"
>>
>>   /*
>>    * List of struct kfd_process (field kfd_process).
>> @@ -1020,6 +1021,8 @@ static void kfd_process_destroy_pdds(struct
>> kfd_process *p)
>>                pr_debug("Releasing pdd (topology id %d) for process (pasid
>> 0x%x)\n",
>>                                pdd->dev->id, p->pasid);
>>
>> +             kfd_pc_sample_release(pdd);
>> +
>>                kfd_process_device_destroy_cwsr_dgpu(pdd);
>>                kfd_process_device_destroy_ib_mem(pdd);
>>
>> --
>> 2.25.1

[-- Attachment #2: Type: text/html, Size: 5986 bytes --]

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH 22/24] drm/amdkfd: add pc sampling release when process release
  2023-11-13 15:12     ` James Zhu
@ 2023-11-13 15:19       ` Yat Sin, David
  2023-11-13 15:30         ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Yat Sin, David @ 2023-11-13 15:19 UTC (permalink / raw)
  To: Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[-- Attachment #1: Type: text/plain, Size: 5314 bytes --]

[AMD Official Use Only - General]



From: Zhu, James <James.Zhu@amd.com>
Sent: Monday, November 13, 2023 10:13 AM
To: Yat Sin, David <David.YatSin@amd.com>; Zhu, James <James.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph <Joseph.Greathouse@amd.com>
Subject: Re: [PATCH 22/24] drm/amdkfd: add pc sampling release when process release



On 2023-11-10 14:08, Yat Sin, David wrote:

[AMD Official Use Only - General]



-----Original Message-----

From: Zhu, James <James.Zhu@amd.com><mailto:James.Zhu@amd.com>

Sent: Friday, November 3, 2023 9:12 AM

To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>

Cc: Kuehling, Felix <Felix.Kuehling@amd.com><mailto:Felix.Kuehling@amd.com>; Greathouse, Joseph

<Joseph.Greathouse@amd.com><mailto:Joseph.Greathouse@amd.com>; Yat Sin, David <David.YatSin@amd.com><mailto:David.YatSin@amd.com>; Zhu,

James <James.Zhu@amd.com><mailto:James.Zhu@amd.com>

Subject: [PATCH 22/24] drm/amdkfd: add pc sampling release when process

release



Add pc sampling release when process release, it will force to stop all activate

sessions with this process.



Signed-off-by: James Zhu <James.Zhu@amd.com><mailto:James.Zhu@amd.com>

---

 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 26

++++++++++++++++++++  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |

1 +

 drivers/gpu/drm/amd/amdkfd/kfd_process.c     |  3 +++

 3 files changed, 30 insertions(+)



diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c

b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c

index 66670cdb813a..00d8d3f400a9 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c

@@ -274,6 +274,32 @@ static int kfd_pc_sample_destroy(struct

kfd_process_device *pdd, uint32_t trace_

      return 0;

 }



+void kfd_pc_sample_release(struct kfd_process_device *pdd) {

+     struct pc_sampling_entry *pcs_entry;

+     struct idr *idp;

+     uint32_t id;

+

+     if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {

+             pr_err("PC Sampling does not support sched_policy %i",

sched_policy);

+             return;

+     }

You do not need to check the sched_policy here, already checked in kfd_ioctl_pc_sample(..) , so cannot have a hosttrap session if policy is NO_HWS.
  [JZ]kfd_pc_sample_release is not called from kfd_ioctl_pc_sample. It is in process quit process.
[David] I know. But you cannot have a PC sampling session during process clean-up when policy=NO_HWS, because session creation would already have been rejected at create time.

+

+     /* force to release all PC sampling task for this process */

+     idp = &pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr;

+     mutex_lock(&pdd->dev->pcs_data.mutex);

+     idr_for_each_entry(idp, pcs_entry, id) {

+             if (pcs_entry->pdd != pdd)

+                     continue;

+             mutex_unlock(&pdd->dev->pcs_data.mutex);

Can we not release the mutex here and just tell the worker thread to exit by setting the stop_enable bit.

I find we have a lot of places where we are acquiring/releasing the mutex within loops and this results in a

lot of extra states that we have to account for during the start/stop/destroy calls.

+             if (pcs_entry->enabled)

+                     kfd_pc_sample_stop(pdd, pcs_entry);

+             kfd_pc_sample_destroy(pdd, id, pcs_entry);

+             mutex_lock(&pdd->dev->pcs_data.mutex);

+     }

+     mutex_unlock(&pdd->dev->pcs_data.mutex);

+}

+

 int kfd_pc_sample(struct kfd_process_device *pdd,

                                      struct kfd_ioctl_pc_sample_args __user

*args)  { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h

b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h

index cb93909e6bd3..4ea064fdaa98 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h

@@ -30,6 +30,7 @@



 int kfd_pc_sample(struct kfd_process_device *pdd,

                                      struct kfd_ioctl_pc_sample_args __user

*args);

+void kfd_pc_sample_release(struct kfd_process_device *pdd);

 void kfd_pc_sample_handler(struct work_struct *work);



 #endif /* KFD_PC_SAMPLING_H_ */

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c

b/drivers/gpu/drm/amd/amdkfd/kfd_process.c

index d22d804f180d..54f3db7eaae2 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c

@@ -43,6 +43,7 @@ struct mm_struct;

 #include "kfd_svm.h"

 #include "kfd_smi_events.h"

 #include "kfd_debug.h"

+#include "kfd_pc_sampling.h"



 /*

  * List of struct kfd_process (field kfd_process).

@@ -1020,6 +1021,8 @@ static void kfd_process_destroy_pdds(struct

kfd_process *p)

              pr_debug("Releasing pdd (topology id %d) for process (pasid

0x%x)\n",

                              pdd->dev->id, p->pasid);



+             kfd_pc_sample_release(pdd);

+

              kfd_process_device_destroy_cwsr_dgpu(pdd);

              kfd_process_device_destroy_ib_mem(pdd);



--

2.25.1



[-- Attachment #2: Type: text/html, Size: 12466 bytes --]

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 19/24] drm/amdkfd: enable pc sampling stop
  2023-11-10 19:07   ` Yat Sin, David
@ 2023-11-13 15:19     ` James Zhu
  2023-11-13 17:04       ` Yat Sin, David
  0 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-13 15:19 UTC (permalink / raw)
  To: Yat Sin, David, Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[-- Attachment #1: Type: text/plain, Size: 4066 bytes --]


On 2023-11-10 14:07, Yat Sin, David wrote:
> [AMD Official Use Only - General]
>
>> -----Original Message-----
>> From: Zhu, James<James.Zhu@amd.com>
>> Sent: Friday, November 3, 2023 9:12 AM
>> To:amd-gfx@lists.freedesktop.org
>> Cc: Kuehling, Felix<Felix.Kuehling@amd.com>; Greathouse, Joseph
>> <Joseph.Greathouse@amd.com>; Yat Sin, David<David.YatSin@amd.com>; Zhu,
>> James<James.Zhu@amd.com>
>> Subject: [PATCH 19/24] drm/amdkfd: enable pc sampling stop
>>
>> Enable pc sampling stop.
>>
>> Signed-off-by: James Zhu<James.Zhu@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 28 +++++++++++++++++--
>> -
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  2 ++
>>   2 files changed, 27 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> index 33d003ca0093..2c4ac5b4cc4b 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> @@ -108,10 +108,32 @@ static int kfd_pc_sample_start(struct
>> kfd_process_device *pdd,
>>        return 0;
>>   }
>>
>> -static int kfd_pc_sample_stop(struct kfd_process_device *pdd)
>> +static int kfd_pc_sample_stop(struct kfd_process_device *pdd,
>> +                                     struct pc_sampling_entry *pcs_entry)
>>   {
>> -     return -EINVAL;
>> +     bool pc_sampling_stop = false;
>> +
>> +     pcs_entry->enabled = false;
>> +     mutex_lock(&pdd->dev->pcs_data.mutex);
> For the START/STOP/DESTROY ioctls, I think we can hold pdd->dev->pcs_data.mutex through the whole IOCTL. Then we would not have to deal with the intermediate states where the START/STOP/DESTROY are happening at the same time.
[JZ] pdd->dev->pcs_data.mutex is per device, not per process. In the
future, it will also be shared to protect the different pc sampling
methods on the same device. So I don't think holding a bigger lock here
is a good idea.
>
>> +     pdd->dev->pcs_data.hosttrap_entry.base.active_count--;
>> +     if (!pdd->dev->pcs_data.hosttrap_entry.base.active_count) {
>> +             WRITE_ONCE(pdd->dev-
>>> pcs_data.hosttrap_entry.base.stop_enable, true);
>> +             pc_sampling_stop = true;
>> +     }
>> +     mutex_unlock(&pdd->dev->pcs_data.mutex);
>>
>> +     if (pc_sampling_stop) {
>> +             kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>> +                     pdd->dev-
>>> pcs_data.hosttrap_entry.base.pc_sample_info.method,
>> +false);
>> +
>> +             mutex_lock(&pdd->dev->pcs_data.mutex);
>> +             pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;
>> +             pdd->dev->pcs_data.hosttrap_entry.base.target_wave_slot = 0;
>> +             WRITE_ONCE(pdd->dev-
>>> pcs_data.hosttrap_entry.base.stop_enable, false);
>> +             mutex_unlock(&pdd->dev->pcs_data.mutex);
>> +     }
>> +
>> +     return 0;
>>   }
>>
>>   static int kfd_pc_sample_create(struct kfd_process_device *pdd, @@ -251,7
>> +273,7 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
>>                if (!pcs_entry->enabled)
>>                        return -EALREADY;
>>                else
>> -                     return kfd_pc_sample_stop(pdd);
>> +                     return kfd_pc_sample_stop(pdd, pcs_entry);
>>        }
>>
>>        return -EINVAL;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 613910e0d440..badaa4d68cc4 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -259,6 +259,8 @@ struct kfd_dev;
>>   struct kfd_dev_pc_sampling_data {
>>        uint32_t use_count;         /* Num of PC sampling sessions */
>>        uint32_t active_count;      /* Num of active sessions */
>> +     uint32_t target_simd;       /* target simd for trap */
>> +     uint32_t target_wave_slot;  /* target wave slot for trap */
>>        bool stop_enable;           /* pc sampling stop in process */
>>        struct idr pc_sampling_idr;
>>        struct kfd_pc_sample_info pc_sample_info;
>> --
>> 2.25.1

[-- Attachment #2: Type: text/html, Size: 5843 bytes --]

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 22/24] drm/amdkfd: add pc sampling release when process release
  2023-11-13 15:19       ` Yat Sin, David
@ 2023-11-13 15:30         ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-13 15:30 UTC (permalink / raw)
  To: Yat Sin, David, Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[-- Attachment #1: Type: text/plain, Size: 6530 bytes --]


On 2023-11-13 10:19, Yat Sin, David wrote:
>
> [AMD Official Use Only - General]
>
>
> *From:* Zhu, James <James.Zhu@amd.com>
> *Sent:* Monday, November 13, 2023 10:13 AM
> *To:* Yat Sin, David <David.YatSin@amd.com>; Zhu, James 
> <James.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
> *Cc:* Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph 
> <Joseph.Greathouse@amd.com>
> *Subject:* Re: [PATCH 22/24] drm/amdkfd: add pc sampling release when 
> process release
>
> On 2023-11-10 14:08, Yat Sin, David wrote:
>
>     [AMD Official Use Only - General]
>
>         -----Original Message-----
>
>         From: Zhu, James<James.Zhu@amd.com>  <mailto:James.Zhu@amd.com>
>
>         Sent: Friday, November 3, 2023 9:12 AM
>
>         To:amd-gfx@lists.freedesktop.org
>
>         Cc: Kuehling, Felix<Felix.Kuehling@amd.com>  <mailto:Felix.Kuehling@amd.com>; Greathouse, Joseph
>
>         <Joseph.Greathouse@amd.com>  <mailto:Joseph.Greathouse@amd.com>; Yat Sin, David<David.YatSin@amd.com>  <mailto:David.YatSin@amd.com>; Zhu,
>
>         James<James.Zhu@amd.com>  <mailto:James.Zhu@amd.com>
>
>         Subject: [PATCH 22/24] drm/amdkfd: add pc sampling release when process
>
>         release
>
>         Add pc sampling release when process release, it will force to stop all activate
>
>         sessions with this process.
>
>         Signed-off-by: James Zhu<James.Zhu@amd.com>  <mailto:James.Zhu@amd.com>
>
>         ---
>
>           drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 26
>
>         ++++++++++++++++++++  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |
>
>         1 +
>
>           drivers/gpu/drm/amd/amdkfd/kfd_process.c     |  3 +++
>
>           3 files changed, 30 insertions(+)
>
>         diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>
>         b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>
>         index 66670cdb813a..00d8d3f400a9 100644
>
>         --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>
>         +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>
>         @@ -274,6 +274,32 @@ static int kfd_pc_sample_destroy(struct
>
>         kfd_process_device *pdd, uint32_t trace_
>
>                return 0;
>
>           }
>
>         +void kfd_pc_sample_release(struct kfd_process_device *pdd) {
>
>         +     struct pc_sampling_entry *pcs_entry;
>
>         +     struct idr *idp;
>
>         +     uint32_t id;
>
>         +
>
>         +     if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
>
>         +             pr_err("PC Sampling does not support sched_policy %i",
>
>         sched_policy);
>
>         +             return;
>
>         +     }
>
>     You do not need to check the sched_policy here, already checked in kfd_ioctl_pc_sample(..) , so cannot have a hosttrap session if policy is NO_HWS.
>
>   [JZ] kfd_pc_sample_release is not called from kfd_ioctl_pc_sample. It 
> is called on the process teardown path.
>
> [David] I know. But you cannot have a PC sampling session during 
> process clean-up when policy=NO_HWS because the session creation would 
> have been blocked on session-create.
>
[JZ] good point.
>
>         +
>
>         +     /* force to release all PC sampling task for this process */
>
>         +     idp = &pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr;
>
>         +     mutex_lock(&pdd->dev->pcs_data.mutex);
>
>         +     idr_for_each_entry(idp, pcs_entry, id) {
>
>         +             if (pcs_entry->pdd != pdd)
>
>         +                     continue;
>
>         +             mutex_unlock(&pdd->dev->pcs_data.mutex);
>
>     Can we avoid releasing the mutex here and just tell the worker thread to exit by setting the stop_enable bit?
>
>     I find we have a lot of places where we are acquiring/releasing the mutex within loops and this results in a
>
>     lot of extra states that we have to account for during the start/stop/destroy calls.
>
>         +             if (pcs_entry->enabled)
>
>         +                     kfd_pc_sample_stop(pdd, pcs_entry);
>
>         +             kfd_pc_sample_destroy(pdd, id, pcs_entry);
>
>         +             mutex_lock(&pdd->dev->pcs_data.mutex);
>
>         +     }
>
>         +     mutex_unlock(&pdd->dev->pcs_data.mutex);
>
>         +}
>
>         +
>
>           int kfd_pc_sample(struct kfd_process_device *pdd,
>
>                                                struct kfd_ioctl_pc_sample_args __user
>
>         *args)  { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>
>         b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>
>         index cb93909e6bd3..4ea064fdaa98 100644
>
>         --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>
>         +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>
>         @@ -30,6 +30,7 @@
>
>           int kfd_pc_sample(struct kfd_process_device *pdd,
>
>                                                struct kfd_ioctl_pc_sample_args __user
>
>         *args);
>
>         +void kfd_pc_sample_release(struct kfd_process_device *pdd);
>
>           void kfd_pc_sample_handler(struct work_struct *work);
>
>           #endif /* KFD_PC_SAMPLING_H_ */
>
>         diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>
>         b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>
>         index d22d804f180d..54f3db7eaae2 100644
>
>         --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>
>         +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>
>         @@ -43,6 +43,7 @@ struct mm_struct;
>
>           #include "kfd_svm.h"
>
>           #include "kfd_smi_events.h"
>
>           #include "kfd_debug.h"
>
>         +#include "kfd_pc_sampling.h"
>
>           /*
>
>            * List of struct kfd_process (field kfd_process).
>
>         @@ -1020,6 +1021,8 @@ static void kfd_process_destroy_pdds(struct
>
>         kfd_process *p)
>
>                        pr_debug("Releasing pdd (topology id %d) for process (pasid
>
>         0x%x)\n",
>
>                                        pdd->dev->id, p->pasid);
>
>         +             kfd_pc_sample_release(pdd);
>
>         +
>
>                        kfd_process_device_destroy_cwsr_dgpu(pdd);
>
>                        kfd_process_device_destroy_ib_mem(pdd);
>
>         --
>
>         2.25.1
>

[-- Attachment #2: Type: text/html, Size: 15518 bytes --]

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH 19/24] drm/amdkfd: enable pc sampling stop
  2023-11-13 15:19     ` James Zhu
@ 2023-11-13 17:04       ` Yat Sin, David
  2023-11-13 17:15         ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Yat Sin, David @ 2023-11-13 17:04 UTC (permalink / raw)
  To: Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[-- Attachment #1: Type: text/plain, Size: 4983 bytes --]

[AMD Official Use Only - General]



From: Zhu, James <James.Zhu@amd.com>
Sent: Monday, November 13, 2023 10:20 AM
To: Yat Sin, David <David.YatSin@amd.com>; Zhu, James <James.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph <Joseph.Greathouse@amd.com>
Subject: Re: [PATCH 19/24] drm/amdkfd: enable pc sampling stop



On 2023-11-10 14:07, Yat Sin, David wrote:

[AMD Official Use Only - General]



-----Original Message-----

From: Zhu, James <James.Zhu@amd.com><mailto:James.Zhu@amd.com>

Sent: Friday, November 3, 2023 9:12 AM

To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>

Cc: Kuehling, Felix <Felix.Kuehling@amd.com><mailto:Felix.Kuehling@amd.com>; Greathouse, Joseph

<Joseph.Greathouse@amd.com><mailto:Joseph.Greathouse@amd.com>; Yat Sin, David <David.YatSin@amd.com><mailto:David.YatSin@amd.com>; Zhu,

James <James.Zhu@amd.com><mailto:James.Zhu@amd.com>

Subject: [PATCH 19/24] drm/amdkfd: enable pc sampling stop



Enable pc sampling stop.



Signed-off-by: James Zhu <James.Zhu@amd.com><mailto:James.Zhu@amd.com>

---

 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 28 +++++++++++++++++--

-

 drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  2 ++

 2 files changed, 27 insertions(+), 3 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c

b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c

index 33d003ca0093..2c4ac5b4cc4b 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c

@@ -108,10 +108,32 @@ static int kfd_pc_sample_start(struct

kfd_process_device *pdd,

      return 0;

 }



-static int kfd_pc_sample_stop(struct kfd_process_device *pdd)

+static int kfd_pc_sample_stop(struct kfd_process_device *pdd,

+                                     struct pc_sampling_entry *pcs_entry)

 {

-     return -EINVAL;

+     bool pc_sampling_stop = false;

+

+     pcs_entry->enabled = false;

+     mutex_lock(&pdd->dev->pcs_data.mutex);

For the START/STOP/DESTROY ioctls, I think we can hold pdd->dev->pcs_data.mutex through the whole IOCTL. Then we would not have to deal with the intermediate states where the START/STOP/DESTROY are happening at the same time.
[JZ] pdd->dev->pcs_data.mutex is per device, not per process. In the future, it will also be shared by the different pc sampling methods on the same device. So I don't think a bigger lock here is a good idea.
[David] I think the CREATE/START/STOP/DESTROY actions are not time critical. So if two processes are using the same GPU, it is ok for amdgpu to block the 2nd process until amdgpu has fully completed the request from the 1st process. I think we are making the code more complex for a use-case that would be very rare.


+     pdd->dev->pcs_data.hosttrap_entry.base.active_count--;

+     if (!pdd->dev->pcs_data.hosttrap_entry.base.active_count) {

+             WRITE_ONCE(pdd->dev-

pcs_data.hosttrap_entry.base.stop_enable, true);

+             pc_sampling_stop = true;

+     }

+     mutex_unlock(&pdd->dev->pcs_data.mutex);



+     if (pc_sampling_stop) {

+             kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,

+                     pdd->dev-

pcs_data.hosttrap_entry.base.pc_sample_info.method,

+false);

+

+             mutex_lock(&pdd->dev->pcs_data.mutex);

+             pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;

+             pdd->dev->pcs_data.hosttrap_entry.base.target_wave_slot = 0;

+             WRITE_ONCE(pdd->dev-

pcs_data.hosttrap_entry.base.stop_enable, false);

+             mutex_unlock(&pdd->dev->pcs_data.mutex);

+     }

+

+     return 0;

 }



 static int kfd_pc_sample_create(struct kfd_process_device *pdd, @@ -251,7

+273,7 @@ int kfd_pc_sample(struct kfd_process_device *pdd,

              if (!pcs_entry->enabled)

                      return -EALREADY;

              else

-                     return kfd_pc_sample_stop(pdd);

+                     return kfd_pc_sample_stop(pdd, pcs_entry);

      }



      return -EINVAL;

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 613910e0d440..badaa4d68cc4 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

@@ -259,6 +259,8 @@ struct kfd_dev;

 struct kfd_dev_pc_sampling_data {

      uint32_t use_count;         /* Num of PC sampling sessions */

      uint32_t active_count;      /* Num of active sessions */

+     uint32_t target_simd;       /* target simd for trap */

+     uint32_t target_wave_slot;  /* target wave slot for trap */

      bool stop_enable;           /* pc sampling stop in process */

      struct idr pc_sampling_idr;

      struct kfd_pc_sample_info pc_sample_info;

--

2.25.1



[-- Attachment #2: Type: text/html, Size: 12123 bytes --]

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 19/24] drm/amdkfd: enable pc sampling stop
  2023-11-13 17:04       ` Yat Sin, David
@ 2023-11-13 17:15         ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-13 17:15 UTC (permalink / raw)
  To: Yat Sin, David, Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph

[-- Attachment #1: Type: text/plain, Size: 6239 bytes --]


On 2023-11-13 12:04, Yat Sin, David wrote:
>
> [AMD Official Use Only - General]
>
>
> *From:* Zhu, James <James.Zhu@amd.com>
> *Sent:* Monday, November 13, 2023 10:20 AM
> *To:* Yat Sin, David <David.YatSin@amd.com>; Zhu, James 
> <James.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
> *Cc:* Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph 
> <Joseph.Greathouse@amd.com>
> *Subject:* Re: [PATCH 19/24] drm/amdkfd: enable pc sampling stop
>
> On 2023-11-10 14:07, Yat Sin, David wrote:
>
>     [AMD Official Use Only - General]
>
>         -----Original Message-----
>
>         From: Zhu, James<James.Zhu@amd.com>  <mailto:James.Zhu@amd.com>
>
>         Sent: Friday, November 3, 2023 9:12 AM
>
>         To:amd-gfx@lists.freedesktop.org
>
>         Cc: Kuehling, Felix<Felix.Kuehling@amd.com>  <mailto:Felix.Kuehling@amd.com>; Greathouse, Joseph
>
>         <Joseph.Greathouse@amd.com>  <mailto:Joseph.Greathouse@amd.com>; Yat Sin, David<David.YatSin@amd.com>  <mailto:David.YatSin@amd.com>; Zhu,
>
>         James<James.Zhu@amd.com>  <mailto:James.Zhu@amd.com>
>
>         Subject: [PATCH 19/24] drm/amdkfd: enable pc sampling stop
>
>         Enable pc sampling stop.
>
>         Signed-off-by: James Zhu<James.Zhu@amd.com>  <mailto:James.Zhu@amd.com>
>
>         ---
>
>           drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 28 +++++++++++++++++--
>
>         -
>
>           drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  2 ++
>
>           2 files changed, 27 insertions(+), 3 deletions(-)
>
>         diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>
>         b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>
>         index 33d003ca0093..2c4ac5b4cc4b 100644
>
>         --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>
>         +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>
>         @@ -108,10 +108,32 @@ static int kfd_pc_sample_start(struct
>
>         kfd_process_device *pdd,
>
>                return 0;
>
>           }
>
>         -static int kfd_pc_sample_stop(struct kfd_process_device *pdd)
>
>         +static int kfd_pc_sample_stop(struct kfd_process_device *pdd,
>
>         +                                     struct pc_sampling_entry *pcs_entry)
>
>           {
>
>         -     return -EINVAL;
>
>         +     bool pc_sampling_stop = false;
>
>         +
>
>         +     pcs_entry->enabled = false;
>
>         +     mutex_lock(&pdd->dev->pcs_data.mutex);
>
>     For the START/STOP/DESTROY ioctls, I think we can hold pdd->dev->pcs_data.mutex through the whole IOCTL. Then we would not have to deal with the intermediate states where the START/STOP/DESTROY are happening at the same time.
>
> [JZ] pdd->dev->pcs_data.mutex is per device, not per process. In the 
> future, also it will share protection within different pc sampling 
> methods on the same devices. So I don't think a bigger lock here is 
> good idea.
> [David] I think the CREATE/START/STOP/DESTROY actions are not time 
> critical. So if two processes are using the same GPU, it is ok for 
> amdgpu to block the 2nd process until amdgpu has fully completed the 
> request from the 1st process. I think we are making the code more 
> complex for a use-case that would be very rare.
>
[JZ] IIRC, the initial RFC version used a bigger lock, and that was 
questioned as being inefficient.
>
>
>         +     pdd->dev->pcs_data.hosttrap_entry.base.active_count--;
>
>         +     if (!pdd->dev->pcs_data.hosttrap_entry.base.active_count) {
>
>         +             WRITE_ONCE(pdd->dev-
>
>             pcs_data.hosttrap_entry.base.stop_enable, true);
>
>         +             pc_sampling_stop = true;
>
>         +     }
>
>         +     mutex_unlock(&pdd->dev->pcs_data.mutex);
>
>         +     if (pc_sampling_stop) {
>
>         +             kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>
>         +                     pdd->dev-
>
>             pcs_data.hosttrap_entry.base.pc_sample_info.method,
>
>         +false);
>
>         +
>
>         +             mutex_lock(&pdd->dev->pcs_data.mutex);
>
>         +             pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;
>
>         +             pdd->dev->pcs_data.hosttrap_entry.base.target_wave_slot = 0;
>
>         +             WRITE_ONCE(pdd->dev-
>
>             pcs_data.hosttrap_entry.base.stop_enable, false);
>
>         +             mutex_unlock(&pdd->dev->pcs_data.mutex);
>
>         +     }
>
>         +
>
>         +     return 0;
>
>           }
>
>           static int kfd_pc_sample_create(struct kfd_process_device *pdd, @@ -251,7
>
>         +273,7 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
>
>                        if (!pcs_entry->enabled)
>
>                                return -EALREADY;
>
>                        else
>
>         -                     return kfd_pc_sample_stop(pdd);
>
>         +                     return kfd_pc_sample_stop(pdd, pcs_entry);
>
>                }
>
>                return -EINVAL;
>
>         diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>
>         b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>
>         index 613910e0d440..badaa4d68cc4 100644
>
>         --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>
>         +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>
>         @@ -259,6 +259,8 @@ struct kfd_dev;
>
>           struct kfd_dev_pc_sampling_data {
>
>                uint32_t use_count;         /* Num of PC sampling sessions */
>
>                uint32_t active_count;      /* Num of active sessions */
>
>         +     uint32_t target_simd;       /* target simd for trap */
>
>         +     uint32_t target_wave_slot;  /* target wave slot for trap */
>
>                bool stop_enable;           /* pc sampling stop in process */
>
>                struct idr pc_sampling_idr;
>
>                struct kfd_pc_sample_info pc_sample_info;
>
>         --
>
>         2.25.1
>

[-- Attachment #2: Type: text/html, Size: 15199 bytes --]

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 00/24] Support Host Trap Sampling for MI200
  2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
                   ` (23 preceding siblings ...)
  2023-11-03 13:11 ` [PATCH 24/24] drm/amdkfd: bump kfd ioctl minor version for pc sampling availability James Zhu
@ 2023-11-16 14:51 ` James Zhu
  24 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-16 14:51 UTC (permalink / raw)
  To: James Zhu, amd-gfx; +Cc: Felix.kuehling, joseph.greathouse

Ping ...

On 2023-11-03 09:11, James Zhu wrote:
> PC sampling is a form of software profiling, where the threads of an application
> are periodically interrupted and the program counter that the thread is currently
> attempting to execute is saved out for profiling.
>
> David Yat Sin (5):
>    drm/amdkfd/kfd_ioctl: add pc sampling support
>    drm/amdkfd: add pc sampling support
>    drm/amdkfd: enable pc sampling query
>    drm/amdkfd: enable pc sampling create
>    drm/amdkfd: add pc sampling capability check
>
> James Zhu (19):
>    drm/amdkfd: add pc sampling mutex
>    drm/amdkfd: add trace_id return
>    drm/amdkfd: check pcs_enrty valid
>    drm/amdkfd: enable pc sampling destroy
>    drm/amdkfd: add interface to trigger pc sampling trap
>    drm/amdkfd: trigger pc sampling trap for gfx v9
>    drm/amdkfd/gfx9: enable host trap
>    drm/amdgpu: use trapID 4 for host trap
>    drm/amdgpu: add sq host trap status check
>    drm/amdkfd: trigger pc sampling trap for arcturus
>    drm/amdkfd: trigger pc sampling trap for aldebaran
>    drm/amdkfd: use bit operation set debug trap
>    drm/amdkfd: add setting trap pc sampling flag
>    drm/amdkfd: enable pc sampling start
>    drm/amdkfd: enable pc sampling stop
>    drm/amdkfd: enable pc sampling work to trigger trap
>    drm/amdkfd: add queue remapping
>    drm/amdkfd: add pc sampling release when process release
>    drm/amdkfd: bump kfd ioctl minor version for pc sampling availability
>
>   .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |   11 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |   14 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |   72 +
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |    7 +
>   drivers/gpu/drm/amd/amdkfd/Makefile           |    3 +-
>   .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 2106 +++++++++--------
>   .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm |   29 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |   44 +
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       |   17 +
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c |   11 +
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |    5 +
>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c  |  348 +++
>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h  |   36 +
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   43 +
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   32 +-
>   .../amd/include/asic_reg/gc/gc_9_0_offset.h   |    2 +
>   .../amd/include/asic_reg/gc/gc_9_0_sh_mask.h  |    5 +
>   .../gpu/drm/amd/include/kgd_kfd_interface.h   |    6 +
>   include/uapi/linux/kfd_ioctl.h                |   60 +-
>   19 files changed, 1792 insertions(+), 1059 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>

^ permalink raw reply	[flat|nested] 80+ messages in thread

* [PATCH v2 03/24] drm/amdkfd: enable pc sampling query
  2023-11-03 13:11 ` [PATCH 03/24] drm/amdkfd: enable pc sampling query James Zhu
  2023-11-10 19:04   ` Yat Sin, David
@ 2023-11-20 15:34   ` James Zhu
  1 sibling, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-20 15:34 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

From: David Yat Sin <david.yatsin@amd.com>

Enable pc sampling to query system capability.

Co-developed-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: David Yat Sin <david.yatsin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 54 +++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index a7e78ff42d07..49fecbc7013e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -25,10 +25,62 @@
 #include "amdgpu_amdkfd.h"
 #include "kfd_pc_sampling.h"
 
+struct supported_pc_sample_info {
+	uint32_t ip_version;
+	const struct kfd_pc_sample_info *sample_info;
+};
+
+const struct kfd_pc_sample_info sample_info_hosttrap_9_0_0 = {
+	0, 1, ~0ULL, 0, KFD_IOCTL_PCS_METHOD_HOSTTRAP, KFD_IOCTL_PCS_TYPE_TIME_US };
+
+struct supported_pc_sample_info supported_formats[] = {
+	{ IP_VERSION(9, 4, 1), &sample_info_hosttrap_9_0_0 },
+	{ IP_VERSION(9, 4, 2), &sample_info_hosttrap_9_0_0 },
+};
+
 static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
 					struct kfd_ioctl_pc_sample_args __user *user_args)
 {
-	return -EINVAL;
+	uint64_t sample_offset;
+	int num_method = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(supported_formats); i++)
+		if (KFD_GC_VERSION(pdd->dev) == supported_formats[i].ip_version)
+			num_method++;
+
+	if (!num_method) {
+		pr_debug("PC Sampling not supported on GC_HWIP:0x%x.",
+			pdd->dev->adev->ip_versions[GC_HWIP][0]);
+		return -EOPNOTSUPP;
+	}
+
+	if (!user_args->sample_info_ptr) {
+		user_args->num_sample_info = num_method;
+		return 0;
+	}
+
+	if (user_args->num_sample_info < num_method) {
+		user_args->num_sample_info = num_method;
+		pr_debug("Sample info buffer is not large enough, "
+			 "ASIC requires space for %d kfd_pc_sample_info entries.", num_method);
+		return -ENOSPC;
+	}
+
+	sample_offset = user_args->sample_info_ptr;
+	for (i = 0; i < ARRAY_SIZE(supported_formats); i++) {
+		if (KFD_GC_VERSION(pdd->dev) == supported_formats[i].ip_version) {
+			int ret = copy_to_user((void __user *) sample_offset,
+				supported_formats[i].sample_info, sizeof(struct kfd_pc_sample_info));
+			if (ret) {
+				pr_debug("Failed to copy PC sampling info to user.");
+				return -EFAULT;
+			}
+			sample_offset += sizeof(struct kfd_pc_sample_info);
+		}
+	}
+
+	return 0;
 }
 
 static int kfd_pc_sample_start(struct kfd_process_device *pdd)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH v2 07/24] drm/amdkfd: check pcs_enrty valid
  2023-11-03 13:11 ` [PATCH 07/24] drm/amdkfd: check pcs_enrty valid James Zhu
  2023-11-10 19:09   ` Yat Sin, David
@ 2023-11-20 15:55   ` James Zhu
  2023-11-22 22:15   ` [PATCH " Felix Kuehling
  2 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-20 15:55 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Check that pcs_entry is valid for the pc sampling ioctl.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 30 ++++++++++++++++++--
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index 4c9fc48e1a6a..36366c8847de 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -179,6 +179,21 @@ static int kfd_pc_sample_destroy(struct kfd_process_device *pdd, uint32_t trace_
 int kfd_pc_sample(struct kfd_process_device *pdd,
 					struct kfd_ioctl_pc_sample_args __user *args)
 {
+	struct pc_sampling_entry *pcs_entry;
+
+	if (args->op != KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES &&
+		args->op != KFD_IOCTL_PCS_OP_CREATE) {
+
+		mutex_lock(&pdd->dev->pcs_data.mutex);
+		pcs_entry = idr_find(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr,
+				args->trace_id);
+		mutex_unlock(&pdd->dev->pcs_data.mutex);
+
+		if (!pcs_entry ||
+			pcs_entry->pdd != pdd)
+			return -EINVAL;
+	}
+
 	switch (args->op) {
 	case KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES:
 		return kfd_pc_sample_query_cap(pdd, args);
@@ -187,13 +202,22 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
 		return kfd_pc_sample_create(pdd, args);
 
 	case KFD_IOCTL_PCS_OP_DESTROY:
-		return kfd_pc_sample_destroy(pdd, args->trace_id);
+		if (pcs_entry->enabled)
+			return -EBUSY;
+		else
+			return kfd_pc_sample_destroy(pdd, args->trace_id);
 
 	case KFD_IOCTL_PCS_OP_START:
-		return kfd_pc_sample_start(pdd);
+		if (pcs_entry->enabled)
+			return -EALREADY;
+		else
+			return kfd_pc_sample_start(pdd);
 
 	case KFD_IOCTL_PCS_OP_STOP:
-		return kfd_pc_sample_stop(pdd);
+		if (!pcs_entry->enabled)
+			return -EALREADY;
+		else
+			return kfd_pc_sample_stop(pdd);
 	}
 
 	return -EINVAL;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH v2 10/24] drm/amdkfd: trigger pc sampling trap for gfx v9
  2023-11-03 13:11 ` [PATCH 10/24] drm/amdkfd: trigger pc sampling trap for gfx v9 James Zhu
  2023-11-10 19:08   ` Yat Sin, David
@ 2023-11-20 16:05   ` James Zhu
  1 sibling, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-20 16:05 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Implement triggering of the pc sampling trap for gfx v9.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 36 +++++++++++++++++++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  7 ++++
 2 files changed, 43 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 51011e8ee90d..5e1330888860 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -1146,6 +1146,42 @@ void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
 	kgd_gfx_v9_unlock_srbm(adev, inst);
 }
 
+uint32_t kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,
+					    uint32_t vmid,
+					    uint32_t max_wave_slot,
+					    uint32_t max_simd,
+					    uint32_t *target_simd,
+					    uint32_t *target_wave_slot,
+					    enum kfd_ioctl_pc_sample_method method)
+{
+	if (method == KFD_IOCTL_PCS_METHOD_HOSTTRAP) {
+		uint32_t value = 0;
+
+		value = REG_SET_FIELD(value, SQ_CMD, CMD, SQ_IND_CMD_CMD_TRAP);
+		value = REG_SET_FIELD(value, SQ_CMD, MODE, SQ_IND_CMD_MODE_SINGLE);
+
+		/* select *target_simd */
+		value = REG_SET_FIELD(value, SQ_CMD, SIMD_ID, *target_simd);
+		/* select *target_wave_slot */
+		value = REG_SET_FIELD(value, SQ_CMD, WAVE_ID, (*target_wave_slot)++);
+
+		mutex_lock(&adev->grbm_idx_mutex);
+		amdgpu_gfx_select_se_sh(adev, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0);
+		WREG32_SOC15(GC, 0, mmSQ_CMD, value);
+		mutex_unlock(&adev->grbm_idx_mutex);
+
+		*target_wave_slot %= max_wave_slot;
+		if (!(*target_wave_slot)) {
+			(*target_simd)++;
+			*target_simd %= max_simd;
+		}
+	} else {
+		pr_debug("PC Sampling method %d not supported.", method);
+		return -EOPNOTSUPP;
+	}
+	return 0;
+}
+
 const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
 	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index ce424615f59b..b47b926891a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -101,3 +101,10 @@ void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
 					       uint32_t grace_period,
 					       uint32_t *reg_offset,
 					       uint32_t *reg_data);
+uint32_t kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,
+					    uint32_t vmid,
+					    uint32_t max_wave_slot,
+					    uint32_t max_simd,
+					    uint32_t *target_simd,
+					    uint32_t *target_wave_slot,
+					    enum kfd_ioctl_pc_sample_method method);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH v2 12/24] drm/amdgpu: use trapID 4 for host trap
  2023-11-03 13:11 ` [PATCH 12/24] drm/amdgpu: use trapID 4 for " James Zhu
@ 2023-11-20 16:08   ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-20 16:08 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Since TRAPSTS.HOST_TRAP won't work pre-gfx943, use
TTMP1 (bit 24: HT) and (bit 16-23: trapID) to identify
the host trap.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |    2 +
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 2117 +++++++++--------
 .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm |    5 +
 3 files changed, 1070 insertions(+), 1054 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 5e1330888860..423611904eaf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -1164,6 +1164,8 @@ uint32_t kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,
 		value = REG_SET_FIELD(value, SQ_CMD, SIMD_ID, *target_simd);
 		/* select *target_wave_slot */
 		value = REG_SET_FIELD(value, SQ_CMD, WAVE_ID, (*target_wave_slot)++);
+		/* set TrapID 4 for HOSTTRAP */
+		value = REG_SET_FIELD(value, SQ_CMD, DATA, 0x4);
 
 		mutex_lock(&adev->grbm_idx_mutex);
 		amdgpu_gfx_select_se_sh(adev, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0);
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index c16595680faa..8e55bb0bb0b8 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -274,155 +274,263 @@ static const uint32_t cwsr_trap_gfx8_hex[] = {
 
 
 static const uint32_t cwsr_trap_gfx9_hex[] = {
-	0xbf820001, 0xbf82025e,
+	0xbf820001, 0xbf820263,
 	0xb8f8f802, 0x8978ff78,
 	0x00020006, 0xb8fbf803,
 	0x866eff78, 0x00002000,
 	0xbf840009, 0x866eff6d,
-	0x00ff0000, 0xbf85001e,
+	0x00ff0000, 0xbf850023,
 	0x866eff7b, 0x00000400,
-	0xbf85005b, 0xbf8e0010,
+	0xbf850060, 0xbf8e0010,
 	0xb8fbf803, 0xbf82fffa,
 	0x866eff7b, 0x03c00900,
-	0xbf850015, 0x866eff7b,
-	0x000071ff, 0xbf840008,
-	0x866fff7b, 0x00007080,
-	0xbf840001, 0xbeee1a87,
-	0xb8eff801, 0x8e6e8c6e,
-	0x866e6f6e, 0xbf85000a,
-	0x866eff6d, 0x00ff0000,
-	0xbf850007, 0xb8eef801,
-	0x866eff6e, 0x00000800,
-	0xbf850003, 0x866eff7b,
-	0x00000400, 0xbf850040,
-	0xb8faf807, 0x867aff7a,
-	0x001f8000, 0x8e7a8b7a,
-	0x8977ff77, 0xfc000000,
-	0x87777a77, 0xba7ff807,
-	0x00000000, 0xb8faf812,
-	0xb8fbf813, 0x8efa887a,
-	0xbf0d8f7b, 0xbf840002,
-	0x877bff7b, 0xffff0000,
-	0xc0031c3d, 0x00000010,
-	0xc0071bbd, 0x00000000,
-	0xc0071ebd, 0x00000008,
-	0xbf8cc07f, 0x8671ff6d,
-	0x01000000, 0xbf840004,
-	0x92f1ff70, 0x00010001,
-	0xbf840016, 0xbf820005,
-	0x86708170, 0x8e709770,
-	0x8977ff77, 0x00800000,
-	0x87777077, 0x86ee6e6e,
-	0xbf840001, 0xbe801d6e,
-	0x866eff6d, 0x01ff0000,
-	0xbf850005, 0x8778ff78,
-	0x00002000, 0x80ec886c,
-	0x82ed806d, 0xbf820005,
-	0x866eff6d, 0x01000000,
-	0xbf850002, 0x806c846c,
-	0x826d806d, 0x866dff6d,
-	0x0000ffff, 0x8f7a8b77,
+	0xbf85001a, 0x866eff6d,
+	0x01ff0000, 0xbf06ff6e,
+	0x01040000, 0xbf850015,
+	0x866eff7b, 0x000071ff,
+	0xbf840008, 0x866fff7b,
+	0x00007080, 0xbf840001,
+	0xbeee1a87, 0xb8eff801,
+	0x8e6e8c6e, 0x866e6f6e,
+	0xbf85000a, 0x866eff6d,
+	0x00ff0000, 0xbf850007,
+	0xb8eef801, 0x866eff6e,
+	0x00000800, 0xbf850003,
+	0x866eff7b, 0x00000400,
+	0xbf850040, 0xb8faf807,
 	0x867aff7a, 0x001f8000,
-	0xb97af807, 0x86fe7e7e,
-	0x86ea6a6a, 0x8f6e8378,
-	0xb96ee0c2, 0xbf800002,
-	0xb9780002, 0xbe801f6c,
+	0x8e7a8b7a, 0x8977ff77,
+	0xfc000000, 0x87777a77,
+	0xba7ff807, 0x00000000,
+	0xb8faf812, 0xb8fbf813,
+	0x8efa887a, 0xbf0d8f7b,
+	0xbf840002, 0x877bff7b,
+	0xffff0000, 0xc0031c3d,
+	0x00000010, 0xc0071bbd,
+	0x00000000, 0xc0071ebd,
+	0x00000008, 0xbf8cc07f,
+	0x8671ff6d, 0x01000000,
+	0xbf840004, 0x92f1ff70,
+	0x00010001, 0xbf840016,
+	0xbf820005, 0x86708170,
+	0x8e709770, 0x8977ff77,
+	0x00800000, 0x87777077,
+	0x86ee6e6e, 0xbf840001,
+	0xbe801d6e, 0x866eff6d,
+	0x01ff0000, 0xbf850005,
+	0x8778ff78, 0x00002000,
+	0x80ec886c, 0x82ed806d,
+	0xbf820005, 0x866eff6d,
+	0x01000000, 0xbf850002,
+	0x806c846c, 0x826d806d,
 	0x866dff6d, 0x0000ffff,
-	0xbefa0080, 0xb97a0283,
-	0xb8faf807, 0x867aff7a,
-	0x001f8000, 0x8e7a8b7a,
-	0x8977ff77, 0xfc000000,
-	0x87777a77, 0xba7ff807,
-	0x00000000, 0xbeee007e,
-	0xbeef007f, 0xbefe0180,
-	0xbf900004, 0x877a8478,
-	0xb97af802, 0xbf8e0002,
-	0xbf88fffe, 0xb8fa2a05,
-	0x807a817a, 0x8e7a8a7a,
-	0xb8fb1605, 0x807b817b,
-	0x8e7b867b, 0x807a7b7a,
-	0x807a7e7a, 0x827b807f,
-	0x867bff7b, 0x0000ffff,
-	0xc04b1c3d, 0x00000050,
-	0xbf8cc07f, 0xc04b1d3d,
-	0x00000060, 0xbf8cc07f,
-	0xc0431e7d, 0x00000074,
-	0xbf8cc07f, 0xbef4007e,
-	0x8675ff7f, 0x0000ffff,
-	0x8775ff75, 0x00040000,
-	0xbef60080, 0xbef700ff,
-	0x00807fac, 0xbef1007c,
-	0xbef00080, 0xb8f02a05,
-	0x80708170, 0x8e708a70,
-	0xb8fa1605, 0x807a817a,
-	0x8e7a867a, 0x80707a70,
-	0xbef60084, 0xbef600ff,
-	0x01000000, 0xbefe007c,
-	0xbefc0070, 0xc0611c7a,
-	0x0000007c, 0xbf8cc07f,
-	0x80708470, 0xbefc007e,
+	0x8f7a8b77, 0x867aff7a,
+	0x001f8000, 0xb97af807,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e8378, 0xb96ee0c2,
+	0xbf800002, 0xb9780002,
+	0xbe801f6c, 0x866dff6d,
+	0x0000ffff, 0xbefa0080,
+	0xb97a0283, 0xb8faf807,
+	0x867aff7a, 0x001f8000,
+	0x8e7a8b7a, 0x8977ff77,
+	0xfc000000, 0x87777a77,
+	0xba7ff807, 0x00000000,
+	0xbeee007e, 0xbeef007f,
+	0xbefe0180, 0xbf900004,
+	0x877a8478, 0xb97af802,
+	0xbf8e0002, 0xbf88fffe,
+	0xb8fa2a05, 0x807a817a,
+	0x8e7a8a7a, 0xb8fb1605,
+	0x807b817b, 0x8e7b867b,
+	0x807a7b7a, 0x807a7e7a,
+	0x827b807f, 0x867bff7b,
+	0x0000ffff, 0xc04b1c3d,
+	0x00000050, 0xbf8cc07f,
+	0xc04b1d3d, 0x00000060,
+	0xbf8cc07f, 0xc0431e7d,
+	0x00000074, 0xbf8cc07f,
+	0xbef4007e, 0x8675ff7f,
+	0x0000ffff, 0x8775ff75,
+	0x00040000, 0xbef60080,
+	0xbef700ff, 0x00807fac,
+	0xbef1007c, 0xbef00080,
+	0xb8f02a05, 0x80708170,
+	0x8e708a70, 0xb8fa1605,
+	0x807a817a, 0x8e7a867a,
+	0x80707a70, 0xbef60084,
+	0xbef600ff, 0x01000000,
 	0xbefe007c, 0xbefc0070,
-	0xc0611b3a, 0x0000007c,
+	0xc0611c7a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611b7a,
+	0xbefc0070, 0xc0611b3a,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611bba, 0x0000007c,
+	0xc0611b7a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611bfa,
+	0xbefc0070, 0xc0611bba,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611e3a, 0x0000007c,
-	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0xb8fbf803,
-	0xbefe007c, 0xbefc0070,
-	0xc0611efa, 0x0000007c,
+	0xc0611bfa, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611a3a,
+	0xbefc0070, 0xc0611e3a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8fbf803, 0xbefe007c,
+	0xbefc0070, 0xc0611efa,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611a7a, 0x0000007c,
-	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0xb8f1f801,
-	0xbefe007c, 0xbefc0070,
-	0xc0611c7a, 0x0000007c,
+	0xc0611a3a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0x867aff7f,
-	0x04000000, 0xbeef0080,
-	0x876f6f7a, 0xb8f02a05,
+	0xbefc007e, 0xbefe007c,
+	0xbefc0070, 0xc0611a7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8f1f801, 0xbefe007c,
+	0xbefc0070, 0xc0611c7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0x867aff7f, 0x04000000,
+	0xbeef0080, 0x876f6f7a,
+	0xb8f02a05, 0x80708170,
+	0x8e708a70, 0xb8fb1605,
+	0x807b817b, 0x8e7b847b,
+	0x8e76827b, 0xbef600ff,
+	0x01000000, 0xbef20174,
+	0x80747074, 0x82758075,
+	0xbefc0080, 0xbf800000,
+	0xbe802b00, 0xbe822b02,
+	0xbe842b04, 0xbe862b06,
+	0xbe882b08, 0xbe8a2b0a,
+	0xbe8c2b0c, 0xbe8e2b0e,
+	0xc06b003a, 0x00000000,
+	0xbf8cc07f, 0xc06b013a,
+	0x00000010, 0xbf8cc07f,
+	0xc06b023a, 0x00000020,
+	0xbf8cc07f, 0xc06b033a,
+	0x00000030, 0xbf8cc07f,
+	0x8074c074, 0x82758075,
+	0x807c907c, 0xbf0a7b7c,
+	0xbf85ffe7, 0xbef40172,
+	0xbef00080, 0xbefe00c1,
+	0xbeff00c1, 0xbee80080,
+	0xbee90080, 0xbef600ff,
+	0x01000000, 0x867aff78,
+	0x00400000, 0xbf850003,
+	0xb8faf803, 0x897a7aff,
+	0x10000000, 0xbf85004d,
+	0xbe840080, 0xd2890000,
+	0x00000900, 0x80048104,
+	0xd2890001, 0x00000900,
+	0x80048104, 0xd2890002,
+	0x00000900, 0x80048104,
+	0xd2890003, 0x00000900,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000901,
+	0x80048104, 0xd2890001,
+	0x00000901, 0x80048104,
+	0xd2890002, 0x00000901,
+	0x80048104, 0xd2890003,
+	0x00000901, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000902, 0x80048104,
+	0xd2890001, 0x00000902,
+	0x80048104, 0xd2890002,
+	0x00000902, 0x80048104,
+	0xd2890003, 0x00000902,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000903,
+	0x80048104, 0xd2890001,
+	0x00000903, 0x80048104,
+	0xd2890002, 0x00000903,
+	0x80048104, 0xd2890003,
+	0x00000903, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbf820008, 0xe0724000,
+	0x701d0000, 0xe0724100,
+	0x701d0100, 0xe0724200,
+	0x701d0200, 0xe0724300,
+	0x701d0300, 0xbefe00c1,
+	0xbeff00c1, 0xb8fb4306,
+	0x867bc17b, 0xbf840063,
+	0xbf8a0000, 0x867aff6f,
+	0x04000000, 0xbf84005f,
+	0x8e7b867b, 0x8e7b827b,
+	0xbef6007b, 0xb8f02a05,
 	0x80708170, 0x8e708a70,
-	0xb8fb1605, 0x807b817b,
-	0x8e7b847b, 0x8e76827b,
+	0xb8fa1605, 0x807a817a,
+	0x8e7a867a, 0x80707a70,
+	0x8070ff70, 0x00000080,
 	0xbef600ff, 0x01000000,
-	0xbef20174, 0x80747074,
-	0x82758075, 0xbefc0080,
-	0xbf800000, 0xbe802b00,
-	0xbe822b02, 0xbe842b04,
-	0xbe862b06, 0xbe882b08,
-	0xbe8a2b0a, 0xbe8c2b0c,
-	0xbe8e2b0e, 0xc06b003a,
-	0x00000000, 0xbf8cc07f,
-	0xc06b013a, 0x00000010,
-	0xbf8cc07f, 0xc06b023a,
-	0x00000020, 0xbf8cc07f,
-	0xc06b033a, 0x00000030,
-	0xbf8cc07f, 0x8074c074,
-	0x82758075, 0x807c907c,
-	0xbf0a7b7c, 0xbf85ffe7,
-	0xbef40172, 0xbef00080,
-	0xbefe00c1, 0xbeff00c1,
-	0xbee80080, 0xbee90080,
+	0xbefc0080, 0xd28c0002,
+	0x000100c1, 0xd28d0003,
+	0x000204c1, 0x867aff78,
+	0x00400000, 0xbf850003,
+	0xb8faf803, 0x897a7aff,
+	0x10000000, 0xbf850030,
+	0x24040682, 0xd86e4000,
+	0x00000002, 0xbf8cc07f,
+	0xbe840080, 0xd2890000,
+	0x00000900, 0x80048104,
+	0xd2890001, 0x00000900,
+	0x80048104, 0xd2890002,
+	0x00000900, 0x80048104,
+	0xd2890003, 0x00000900,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000901,
+	0x80048104, 0xd2890001,
+	0x00000901, 0x80048104,
+	0xd2890002, 0x00000901,
+	0x80048104, 0xd2890003,
+	0x00000901, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0x680404ff, 0x00000200,
+	0xd0c9006a, 0x0000f702,
+	0xbf87ffd2, 0xbf820015,
+	0xd1060002, 0x00011103,
+	0x7e0602ff, 0x00000200,
+	0xbefc00ff, 0x00010000,
+	0xbe800077, 0x8677ff77,
+	0xff7fffff, 0x8777ff77,
+	0x00058000, 0xd8ec0000,
+	0x00000002, 0xbf8cc07f,
+	0xe0765000, 0x701d0002,
+	0x68040702, 0xd0c9006a,
+	0x0000f702, 0xbf87fff7,
+	0xbef70000, 0xbef000ff,
+	0x00000400, 0xbefe00c1,
+	0xbeff00c1, 0xb8fb2a05,
+	0x807b817b, 0x8e7b827b,
 	0xbef600ff, 0x01000000,
+	0xbefc0084, 0xbf0a7b7c,
+	0xbf84006d, 0xbf11017c,
+	0x807bff7b, 0x00001000,
 	0x867aff78, 0x00400000,
 	0xbf850003, 0xb8faf803,
 	0x897a7aff, 0x10000000,
-	0xbf85004d, 0xbe840080,
+	0xbf850051, 0xbe840080,
 	0xd2890000, 0x00000900,
 	0x80048104, 0xd2890001,
 	0x00000900, 0x80048104,
@@ -460,224 +568,119 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
 	0x80048104, 0xc069003a,
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbf820008,
+	0xbf84ffee, 0x807c847c,
+	0xbf0a7b7c, 0xbf85ffb1,
+	0xbf9c0000, 0xbf820012,
+	0x7e000300, 0x7e020301,
+	0x7e040302, 0x7e060303,
 	0xe0724000, 0x701d0000,
 	0xe0724100, 0x701d0100,
 	0xe0724200, 0x701d0200,
 	0xe0724300, 0x701d0300,
+	0x807c847c, 0x8070ff70,
+	0x00000400, 0xbf0a7b7c,
+	0xbf85ffef, 0xbf9c0000,
+	0xbf8200c7, 0xbef4007e,
+	0x8675ff7f, 0x0000ffff,
+	0x8775ff75, 0x00040000,
+	0xbef60080, 0xbef700ff,
+	0x00807fac, 0x866eff7f,
+	0x04000000, 0xbf84001e,
 	0xbefe00c1, 0xbeff00c1,
-	0xb8fb4306, 0x867bc17b,
-	0xbf840063, 0xbf8a0000,
-	0x867aff6f, 0x04000000,
-	0xbf84005f, 0x8e7b867b,
-	0x8e7b827b, 0xbef6007b,
-	0xb8f02a05, 0x80708170,
-	0x8e708a70, 0xb8fa1605,
-	0x807a817a, 0x8e7a867a,
-	0x80707a70, 0x8070ff70,
+	0xb8ef4306, 0x866fc16f,
+	0xbf840019, 0x8e6f866f,
+	0x8e6f826f, 0xbef6006f,
+	0xb8f82a05, 0x80788178,
+	0x8e788a78, 0xb8ee1605,
+	0x806e816e, 0x8e6e866e,
+	0x80786e78, 0x8078ff78,
 	0x00000080, 0xbef600ff,
 	0x01000000, 0xbefc0080,
-	0xd28c0002, 0x000100c1,
-	0xd28d0003, 0x000204c1,
-	0x867aff78, 0x00400000,
-	0xbf850003, 0xb8faf803,
-	0x897a7aff, 0x10000000,
-	0xbf850030, 0x24040682,
-	0xd86e4000, 0x00000002,
-	0xbf8cc07f, 0xbe840080,
-	0xd2890000, 0x00000900,
-	0x80048104, 0xd2890001,
-	0x00000900, 0x80048104,
-	0xd2890002, 0x00000900,
-	0x80048104, 0xd2890003,
-	0x00000900, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbe840080, 0xd2890000,
-	0x00000901, 0x80048104,
-	0xd2890001, 0x00000901,
-	0x80048104, 0xd2890002,
-	0x00000901, 0x80048104,
-	0xd2890003, 0x00000901,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0x680404ff,
-	0x00000200, 0xd0c9006a,
-	0x0000f702, 0xbf87ffd2,
-	0xbf820015, 0xd1060002,
-	0x00011103, 0x7e0602ff,
-	0x00000200, 0xbefc00ff,
-	0x00010000, 0xbe800077,
-	0x8677ff77, 0xff7fffff,
-	0x8777ff77, 0x00058000,
-	0xd8ec0000, 0x00000002,
-	0xbf8cc07f, 0xe0765000,
-	0x701d0002, 0x68040702,
-	0xd0c9006a, 0x0000f702,
-	0xbf87fff7, 0xbef70000,
-	0xbef000ff, 0x00000400,
+	0xe0510000, 0x781d0000,
+	0xe0510100, 0x781d0000,
+	0x807cff7c, 0x00000200,
+	0x8078ff78, 0x00000200,
+	0xbf0a6f7c, 0xbf85fff6,
 	0xbefe00c1, 0xbeff00c1,
-	0xb8fb2a05, 0x807b817b,
-	0x8e7b827b, 0xbef600ff,
-	0x01000000, 0xbefc0084,
-	0xbf0a7b7c, 0xbf84006d,
-	0xbf11017c, 0x807bff7b,
-	0x00001000, 0x867aff78,
-	0x00400000, 0xbf850003,
-	0xb8faf803, 0x897a7aff,
-	0x10000000, 0xbf850051,
-	0xbe840080, 0xd2890000,
-	0x00000900, 0x80048104,
-	0xd2890001, 0x00000900,
-	0x80048104, 0xd2890002,
-	0x00000900, 0x80048104,
-	0xd2890003, 0x00000900,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000901,
-	0x80048104, 0xd2890001,
-	0x00000901, 0x80048104,
-	0xd2890002, 0x00000901,
-	0x80048104, 0xd2890003,
-	0x00000901, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbe840080, 0xd2890000,
-	0x00000902, 0x80048104,
-	0xd2890001, 0x00000902,
-	0x80048104, 0xd2890002,
-	0x00000902, 0x80048104,
-	0xd2890003, 0x00000902,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000903,
-	0x80048104, 0xd2890001,
-	0x00000903, 0x80048104,
-	0xd2890002, 0x00000903,
-	0x80048104, 0xd2890003,
-	0x00000903, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0x807c847c, 0xbf0a7b7c,
-	0xbf85ffb1, 0xbf9c0000,
-	0xbf820012, 0x7e000300,
-	0x7e020301, 0x7e040302,
-	0x7e060303, 0xe0724000,
-	0x701d0000, 0xe0724100,
-	0x701d0100, 0xe0724200,
-	0x701d0200, 0xe0724300,
-	0x701d0300, 0x807c847c,
-	0x8070ff70, 0x00000400,
-	0xbf0a7b7c, 0xbf85ffef,
-	0xbf9c0000, 0xbf8200c7,
-	0xbef4007e, 0x8675ff7f,
-	0x0000ffff, 0x8775ff75,
-	0x00040000, 0xbef60080,
-	0xbef700ff, 0x00807fac,
-	0x866eff7f, 0x04000000,
-	0xbf84001e, 0xbefe00c1,
-	0xbeff00c1, 0xb8ef4306,
-	0x866fc16f, 0xbf840019,
-	0x8e6f866f, 0x8e6f826f,
-	0xbef6006f, 0xb8f82a05,
+	0xbef600ff, 0x01000000,
+	0xb8ef2a05, 0x806f816f,
+	0x8e6f826f, 0x806fff6f,
+	0x00008000, 0xbef80080,
+	0xbeee0078, 0x8078ff78,
+	0x00000400, 0xbefc0084,
+	0xbf11087c, 0xe0524000,
+	0x781d0000, 0xe0524100,
+	0x781d0100, 0xe0524200,
+	0x781d0200, 0xe0524300,
+	0x781d0300, 0xbf8c0f70,
+	0x7e000300, 0x7e020301,
+	0x7e040302, 0x7e060303,
+	0x807c847c, 0x8078ff78,
+	0x00000400, 0xbf0a6f7c,
+	0xbf85ffee, 0xbf9c0000,
+	0xe0524000, 0x6e1d0000,
+	0xe0524100, 0x6e1d0100,
+	0xe0524200, 0x6e1d0200,
+	0xe0524300, 0x6e1d0300,
+	0xbf8c0f70, 0xb8f82a05,
 	0x80788178, 0x8e788a78,
 	0xb8ee1605, 0x806e816e,
 	0x8e6e866e, 0x80786e78,
-	0x8078ff78, 0x00000080,
-	0xbef600ff, 0x01000000,
-	0xbefc0080, 0xe0510000,
-	0x781d0000, 0xe0510100,
-	0x781d0000, 0x807cff7c,
-	0x00000200, 0x8078ff78,
-	0x00000200, 0xbf0a6f7c,
-	0xbf85fff6, 0xbefe00c1,
-	0xbeff00c1, 0xbef600ff,
-	0x01000000, 0xb8ef2a05,
-	0x806f816f, 0x8e6f826f,
-	0x806fff6f, 0x00008000,
-	0xbef80080, 0xbeee0078,
-	0x8078ff78, 0x00000400,
-	0xbefc0084, 0xbf11087c,
-	0xe0524000, 0x781d0000,
-	0xe0524100, 0x781d0100,
-	0xe0524200, 0x781d0200,
-	0xe0524300, 0x781d0300,
-	0xbf8c0f70, 0x7e000300,
-	0x7e020301, 0x7e040302,
-	0x7e060303, 0x807c847c,
-	0x8078ff78, 0x00000400,
-	0xbf0a6f7c, 0xbf85ffee,
-	0xbf9c0000, 0xe0524000,
-	0x6e1d0000, 0xe0524100,
-	0x6e1d0100, 0xe0524200,
-	0x6e1d0200, 0xe0524300,
-	0x6e1d0300, 0xbf8c0f70,
+	0x80f8c078, 0xb8ef1605,
+	0x806f816f, 0x8e6f846f,
+	0x8e76826f, 0xbef600ff,
+	0x01000000, 0xbefc006f,
+	0xc031003a, 0x00000078,
+	0x80f8c078, 0xbf8cc07f,
+	0x80fc907c, 0xbf800000,
+	0xbe802d00, 0xbe822d02,
+	0xbe842d04, 0xbe862d06,
+	0xbe882d08, 0xbe8a2d0a,
+	0xbe8c2d0c, 0xbe8e2d0e,
+	0xbf06807c, 0xbf84fff0,
 	0xb8f82a05, 0x80788178,
 	0x8e788a78, 0xb8ee1605,
 	0x806e816e, 0x8e6e866e,
-	0x80786e78, 0x80f8c078,
-	0xb8ef1605, 0x806f816f,
-	0x8e6f846f, 0x8e76826f,
+	0x80786e78, 0xbef60084,
 	0xbef600ff, 0x01000000,
-	0xbefc006f, 0xc031003a,
-	0x00000078, 0x80f8c078,
-	0xbf8cc07f, 0x80fc907c,
-	0xbf800000, 0xbe802d00,
-	0xbe822d02, 0xbe842d04,
-	0xbe862d06, 0xbe882d08,
-	0xbe8a2d0a, 0xbe8c2d0c,
-	0xbe8e2d0e, 0xbf06807c,
-	0xbf84fff0, 0xb8f82a05,
-	0x80788178, 0x8e788a78,
-	0xb8ee1605, 0x806e816e,
-	0x8e6e866e, 0x80786e78,
-	0xbef60084, 0xbef600ff,
-	0x01000000, 0xc0211bfa,
+	0xc0211bfa, 0x00000078,
+	0x80788478, 0xc0211b3a,
 	0x00000078, 0x80788478,
-	0xc0211b3a, 0x00000078,
-	0x80788478, 0xc0211b7a,
+	0xc0211b7a, 0x00000078,
+	0x80788478, 0xc0211c3a,
 	0x00000078, 0x80788478,
-	0xc0211c3a, 0x00000078,
-	0x80788478, 0xc0211c7a,
+	0xc0211c7a, 0x00000078,
+	0x80788478, 0xc0211eba,
 	0x00000078, 0x80788478,
-	0xc0211eba, 0x00000078,
-	0x80788478, 0xc0211efa,
+	0xc0211efa, 0x00000078,
+	0x80788478, 0xc0211a3a,
 	0x00000078, 0x80788478,
-	0xc0211a3a, 0x00000078,
-	0x80788478, 0xc0211a7a,
+	0xc0211a7a, 0x00000078,
+	0x80788478, 0xc0211cfa,
 	0x00000078, 0x80788478,
-	0xc0211cfa, 0x00000078,
-	0x80788478, 0xbf8cc07f,
-	0xbefc006f, 0xbefe0070,
-	0xbeff0071, 0x866f7bff,
-	0x000003ff, 0xb96f4803,
-	0x866f7bff, 0xfffff800,
-	0x8f6f8b6f, 0xb96fa2c3,
-	0xb973f801, 0xb8ee2a05,
-	0x806e816e, 0x8e6e8a6e,
-	0xb8ef1605, 0x806f816f,
-	0x8e6f866f, 0x806e6f6e,
-	0x806e746e, 0x826f8075,
-	0x866fff6f, 0x0000ffff,
-	0xc00b1c37, 0x00000050,
-	0xc00b1d37, 0x00000060,
-	0xc0031e77, 0x00000074,
-	0xbf8cc07f, 0x8f6e8b77,
-	0x866eff6e, 0x001f8000,
-	0xb96ef807, 0x866dff6d,
-	0x0000ffff, 0x86fe7e7e,
-	0x86ea6a6a, 0x8f6e837a,
-	0xb96ee0c2, 0xbf800002,
-	0xb97a0002, 0xbf8a0000,
-	0xbe801f6c, 0xbf810000,
+	0xbf8cc07f, 0xbefc006f,
+	0xbefe0070, 0xbeff0071,
+	0x866f7bff, 0x000003ff,
+	0xb96f4803, 0x866f7bff,
+	0xfffff800, 0x8f6f8b6f,
+	0xb96fa2c3, 0xb973f801,
+	0xb8ee2a05, 0x806e816e,
+	0x8e6e8a6e, 0xb8ef1605,
+	0x806f816f, 0x8e6f866f,
+	0x806e6f6e, 0x806e746e,
+	0x826f8075, 0x866fff6f,
+	0x0000ffff, 0xc00b1c37,
+	0x00000050, 0xc00b1d37,
+	0x00000060, 0xc0031e77,
+	0x00000074, 0xbf8cc07f,
+	0x8f6e8b77, 0x866eff6e,
+	0x001f8000, 0xb96ef807,
+	0x866dff6d, 0x0000ffff,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e837a, 0xb96ee0c2,
+	0xbf800002, 0xb97a0002,
+	0xbf8a0000, 0xbe801f6c,
+	0xbf810000, 0x00000000,
 };
 
 static const uint32_t cwsr_trap_nv1x_hex[] = {
@@ -1101,219 +1104,159 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 };
 
 static const uint32_t cwsr_trap_arcturus_hex[] = {
-	0xbf820001, 0xbf8202da,
+	0xbf820001, 0xbf8202df,
 	0xb8f8f802, 0x8978ff78,
 	0x00020006, 0xb8fbf803,
 	0x866eff78, 0x00002000,
 	0xbf840009, 0x866eff6d,
-	0x00ff0000, 0xbf85001e,
+	0x00ff0000, 0xbf850023,
 	0x866eff7b, 0x00000400,
-	0xbf85005b, 0xbf8e0010,
+	0xbf850060, 0xbf8e0010,
 	0xb8fbf803, 0xbf82fffa,
 	0x866eff7b, 0x03c00900,
-	0xbf850015, 0x866eff7b,
-	0x000071ff, 0xbf840008,
-	0x866fff7b, 0x00007080,
-	0xbf840001, 0xbeee1a87,
-	0xb8eff801, 0x8e6e8c6e,
-	0x866e6f6e, 0xbf85000a,
-	0x866eff6d, 0x00ff0000,
-	0xbf850007, 0xb8eef801,
-	0x866eff6e, 0x00000800,
-	0xbf850003, 0x866eff7b,
-	0x00000400, 0xbf850040,
-	0xb8faf807, 0x867aff7a,
-	0x001f8000, 0x8e7a8b7a,
-	0x8977ff77, 0xfc000000,
-	0x87777a77, 0xba7ff807,
-	0x00000000, 0xb8faf812,
-	0xb8fbf813, 0x8efa887a,
-	0xbf0d8f7b, 0xbf840002,
-	0x877bff7b, 0xffff0000,
-	0xc0031c3d, 0x00000010,
-	0xc0071bbd, 0x00000000,
-	0xc0071ebd, 0x00000008,
-	0xbf8cc07f, 0x8671ff6d,
-	0x01000000, 0xbf840004,
-	0x92f1ff70, 0x00010001,
-	0xbf840016, 0xbf820005,
-	0x86708170, 0x8e709770,
-	0x8977ff77, 0x00800000,
-	0x87777077, 0x86ee6e6e,
-	0xbf840001, 0xbe801d6e,
-	0x866eff6d, 0x01ff0000,
-	0xbf850005, 0x8778ff78,
-	0x00002000, 0x80ec886c,
-	0x82ed806d, 0xbf820005,
-	0x866eff6d, 0x01000000,
-	0xbf850002, 0x806c846c,
-	0x826d806d, 0x866dff6d,
-	0x0000ffff, 0x8f7a8b77,
+	0xbf85001a, 0x866eff6d,
+	0x01ff0000, 0xbf06ff6e,
+	0x01040000, 0xbf850015,
+	0x866eff7b, 0x000071ff,
+	0xbf840008, 0x866fff7b,
+	0x00007080, 0xbf840001,
+	0xbeee1a87, 0xb8eff801,
+	0x8e6e8c6e, 0x866e6f6e,
+	0xbf85000a, 0x866eff6d,
+	0x00ff0000, 0xbf850007,
+	0xb8eef801, 0x866eff6e,
+	0x00000800, 0xbf850003,
+	0x866eff7b, 0x00000400,
+	0xbf850040, 0xb8faf807,
 	0x867aff7a, 0x001f8000,
-	0xb97af807, 0x86fe7e7e,
-	0x86ea6a6a, 0x8f6e8378,
-	0xb96ee0c2, 0xbf800002,
-	0xb9780002, 0xbe801f6c,
+	0x8e7a8b7a, 0x8977ff77,
+	0xfc000000, 0x87777a77,
+	0xba7ff807, 0x00000000,
+	0xb8faf812, 0xb8fbf813,
+	0x8efa887a, 0xbf0d8f7b,
+	0xbf840002, 0x877bff7b,
+	0xffff0000, 0xc0031c3d,
+	0x00000010, 0xc0071bbd,
+	0x00000000, 0xc0071ebd,
+	0x00000008, 0xbf8cc07f,
+	0x8671ff6d, 0x01000000,
+	0xbf840004, 0x92f1ff70,
+	0x00010001, 0xbf840016,
+	0xbf820005, 0x86708170,
+	0x8e709770, 0x8977ff77,
+	0x00800000, 0x87777077,
+	0x86ee6e6e, 0xbf840001,
+	0xbe801d6e, 0x866eff6d,
+	0x01ff0000, 0xbf850005,
+	0x8778ff78, 0x00002000,
+	0x80ec886c, 0x82ed806d,
+	0xbf820005, 0x866eff6d,
+	0x01000000, 0xbf850002,
+	0x806c846c, 0x826d806d,
 	0x866dff6d, 0x0000ffff,
-	0xbefa0080, 0xb97a0283,
-	0xb8faf807, 0x867aff7a,
-	0x001f8000, 0x8e7a8b7a,
-	0x8977ff77, 0xfc000000,
-	0x87777a77, 0xba7ff807,
-	0x00000000, 0xbeee007e,
-	0xbeef007f, 0xbefe0180,
-	0xbf900004, 0x877a8478,
-	0xb97af802, 0xbf8e0002,
-	0xbf88fffe, 0xb8fa2a05,
-	0x807a817a, 0x8e7a8a7a,
-	0x8e7a817a, 0xb8fb1605,
-	0x807b817b, 0x8e7b867b,
-	0x807a7b7a, 0x807a7e7a,
-	0x827b807f, 0x867bff7b,
-	0x0000ffff, 0xc04b1c3d,
-	0x00000050, 0xbf8cc07f,
-	0xc04b1d3d, 0x00000060,
-	0xbf8cc07f, 0xc0431e7d,
-	0x00000074, 0xbf8cc07f,
-	0xbef4007e, 0x8675ff7f,
-	0x0000ffff, 0x8775ff75,
-	0x00040000, 0xbef60080,
-	0xbef700ff, 0x00807fac,
-	0xbef1007c, 0xbef00080,
-	0xb8f02a05, 0x80708170,
-	0x8e708a70, 0x8e708170,
-	0xb8fa1605, 0x807a817a,
-	0x8e7a867a, 0x80707a70,
-	0xbef60084, 0xbef600ff,
-	0x01000000, 0xbefe007c,
-	0xbefc0070, 0xc0611c7a,
-	0x0000007c, 0xbf8cc07f,
-	0x80708470, 0xbefc007e,
+	0x8f7a8b77, 0x867aff7a,
+	0x001f8000, 0xb97af807,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e8378, 0xb96ee0c2,
+	0xbf800002, 0xb9780002,
+	0xbe801f6c, 0x866dff6d,
+	0x0000ffff, 0xbefa0080,
+	0xb97a0283, 0xb8faf807,
+	0x867aff7a, 0x001f8000,
+	0x8e7a8b7a, 0x8977ff77,
+	0xfc000000, 0x87777a77,
+	0xba7ff807, 0x00000000,
+	0xbeee007e, 0xbeef007f,
+	0xbefe0180, 0xbf900004,
+	0x877a8478, 0xb97af802,
+	0xbf8e0002, 0xbf88fffe,
+	0xb8fa2a05, 0x807a817a,
+	0x8e7a8a7a, 0x8e7a817a,
+	0xb8fb1605, 0x807b817b,
+	0x8e7b867b, 0x807a7b7a,
+	0x807a7e7a, 0x827b807f,
+	0x867bff7b, 0x0000ffff,
+	0xc04b1c3d, 0x00000050,
+	0xbf8cc07f, 0xc04b1d3d,
+	0x00000060, 0xbf8cc07f,
+	0xc0431e7d, 0x00000074,
+	0xbf8cc07f, 0xbef4007e,
+	0x8675ff7f, 0x0000ffff,
+	0x8775ff75, 0x00040000,
+	0xbef60080, 0xbef700ff,
+	0x00807fac, 0xbef1007c,
+	0xbef00080, 0xb8f02a05,
+	0x80708170, 0x8e708a70,
+	0x8e708170, 0xb8fa1605,
+	0x807a817a, 0x8e7a867a,
+	0x80707a70, 0xbef60084,
+	0xbef600ff, 0x01000000,
 	0xbefe007c, 0xbefc0070,
-	0xc0611b3a, 0x0000007c,
+	0xc0611c7a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611b7a,
+	0xbefc0070, 0xc0611b3a,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611bba, 0x0000007c,
+	0xc0611b7a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611bfa,
+	0xbefc0070, 0xc0611bba,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611e3a, 0x0000007c,
-	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0xb8fbf803,
-	0xbefe007c, 0xbefc0070,
-	0xc0611efa, 0x0000007c,
+	0xc0611bfa, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611a3a,
+	0xbefc0070, 0xc0611e3a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8fbf803, 0xbefe007c,
+	0xbefc0070, 0xc0611efa,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611a7a, 0x0000007c,
-	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0xb8f1f801,
-	0xbefe007c, 0xbefc0070,
-	0xc0611c7a, 0x0000007c,
+	0xc0611a3a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0x867aff7f,
-	0x04000000, 0xbeef0080,
-	0x876f6f7a, 0xb8f02a05,
-	0x80708170, 0x8e708a70,
-	0x8e708170, 0xb8fb1605,
-	0x807b817b, 0x8e7b847b,
-	0x8e76827b, 0xbef600ff,
-	0x01000000, 0xbef20174,
-	0x80747074, 0x82758075,
-	0xbefc0080, 0xbf800000,
-	0xbe802b00, 0xbe822b02,
-	0xbe842b04, 0xbe862b06,
-	0xbe882b08, 0xbe8a2b0a,
-	0xbe8c2b0c, 0xbe8e2b0e,
-	0xc06b003a, 0x00000000,
-	0xbf8cc07f, 0xc06b013a,
-	0x00000010, 0xbf8cc07f,
-	0xc06b023a, 0x00000020,
-	0xbf8cc07f, 0xc06b033a,
-	0x00000030, 0xbf8cc07f,
-	0x8074c074, 0x82758075,
-	0x807c907c, 0xbf0a7b7c,
-	0xbf85ffe7, 0xbef40172,
-	0xbef00080, 0xbefe00c1,
-	0xbeff00c1, 0xbee80080,
-	0xbee90080, 0xbef600ff,
-	0x01000000, 0x867aff78,
-	0x00400000, 0xbf850003,
-	0xb8faf803, 0x897a7aff,
-	0x10000000, 0xbf85004d,
-	0xbe840080, 0xd2890000,
-	0x00000900, 0x80048104,
-	0xd2890001, 0x00000900,
-	0x80048104, 0xd2890002,
-	0x00000900, 0x80048104,
-	0xd2890003, 0x00000900,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000901,
-	0x80048104, 0xd2890001,
-	0x00000901, 0x80048104,
-	0xd2890002, 0x00000901,
-	0x80048104, 0xd2890003,
-	0x00000901, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbe840080, 0xd2890000,
-	0x00000902, 0x80048104,
-	0xd2890001, 0x00000902,
-	0x80048104, 0xd2890002,
-	0x00000902, 0x80048104,
-	0xd2890003, 0x00000902,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000903,
-	0x80048104, 0xd2890001,
-	0x00000903, 0x80048104,
-	0xd2890002, 0x00000903,
-	0x80048104, 0xd2890003,
-	0x00000903, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbf820008, 0xe0724000,
-	0x701d0000, 0xe0724100,
-	0x701d0100, 0xe0724200,
-	0x701d0200, 0xe0724300,
-	0x701d0300, 0xbefe00c1,
-	0xbeff00c1, 0xb8fb4306,
-	0x867bc17b, 0xbf840064,
-	0xbf8a0000, 0x867aff6f,
-	0x04000000, 0xbf840060,
-	0x8e7b867b, 0x8e7b827b,
-	0xbef6007b, 0xb8f02a05,
-	0x80708170, 0x8e708a70,
-	0x8e708170, 0xb8fa1605,
-	0x807a817a, 0x8e7a867a,
-	0x80707a70, 0x8070ff70,
-	0x00000080, 0xbef600ff,
-	0x01000000, 0xbefc0080,
-	0xd28c0002, 0x000100c1,
-	0xd28d0003, 0x000204c1,
+	0xbefc007e, 0xbefe007c,
+	0xbefc0070, 0xc0611a7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8f1f801, 0xbefe007c,
+	0xbefc0070, 0xc0611c7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0x867aff7f, 0x04000000,
+	0xbeef0080, 0x876f6f7a,
+	0xb8f02a05, 0x80708170,
+	0x8e708a70, 0x8e708170,
+	0xb8fb1605, 0x807b817b,
+	0x8e7b847b, 0x8e76827b,
+	0xbef600ff, 0x01000000,
+	0xbef20174, 0x80747074,
+	0x82758075, 0xbefc0080,
+	0xbf800000, 0xbe802b00,
+	0xbe822b02, 0xbe842b04,
+	0xbe862b06, 0xbe882b08,
+	0xbe8a2b0a, 0xbe8c2b0c,
+	0xbe8e2b0e, 0xc06b003a,
+	0x00000000, 0xbf8cc07f,
+	0xc06b013a, 0x00000010,
+	0xbf8cc07f, 0xc06b023a,
+	0x00000020, 0xbf8cc07f,
+	0xc06b033a, 0x00000030,
+	0xbf8cc07f, 0x8074c074,
+	0x82758075, 0x807c907c,
+	0xbf0a7b7c, 0xbf85ffe7,
+	0xbef40172, 0xbef00080,
+	0xbefe00c1, 0xbeff00c1,
+	0xbee80080, 0xbee90080,
+	0xbef600ff, 0x01000000,
 	0x867aff78, 0x00400000,
 	0xbf850003, 0xb8faf803,
 	0x897a7aff, 0x10000000,
-	0xbf850030, 0x24040682,
-	0xd86e4000, 0x00000002,
-	0xbf8cc07f, 0xbe840080,
+	0xbf85004d, 0xbe840080,
 	0xd2890000, 0x00000900,
 	0x80048104, 0xd2890001,
 	0x00000900, 0x80048104,
@@ -1332,31 +1275,50 @@ static const uint32_t cwsr_trap_arcturus_hex[] = {
 	0x80048104, 0xc069003a,
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0x680404ff,
-	0x00000200, 0xd0c9006a,
-	0x0000f702, 0xbf87ffd2,
-	0xbf820015, 0xd1060002,
-	0x00011103, 0x7e0602ff,
-	0x00000200, 0xbefc00ff,
-	0x00010000, 0xbe800077,
-	0x8677ff77, 0xff7fffff,
-	0x8777ff77, 0x00058000,
-	0xd8ec0000, 0x00000002,
-	0xbf8cc07f, 0xe0765000,
-	0x701d0002, 0x68040702,
-	0xd0c9006a, 0x0000f702,
-	0xbf87fff7, 0xbef70000,
-	0xbef000ff, 0x00000400,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000902,
+	0x80048104, 0xd2890001,
+	0x00000902, 0x80048104,
+	0xd2890002, 0x00000902,
+	0x80048104, 0xd2890003,
+	0x00000902, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000903, 0x80048104,
+	0xd2890001, 0x00000903,
+	0x80048104, 0xd2890002,
+	0x00000903, 0x80048104,
+	0xd2890003, 0x00000903,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbf820008,
+	0xe0724000, 0x701d0000,
+	0xe0724100, 0x701d0100,
+	0xe0724200, 0x701d0200,
+	0xe0724300, 0x701d0300,
 	0xbefe00c1, 0xbeff00c1,
-	0xb8fb2a05, 0x807b817b,
-	0x8e7b827b, 0xbef600ff,
-	0x01000000, 0xbefc0084,
-	0xbf0a7b7c, 0xbf84006d,
-	0xbf11017c, 0x807bff7b,
-	0x00001000, 0x867aff78,
+	0xb8fb4306, 0x867bc17b,
+	0xbf840064, 0xbf8a0000,
+	0x867aff6f, 0x04000000,
+	0xbf840060, 0x8e7b867b,
+	0x8e7b827b, 0xbef6007b,
+	0xb8f02a05, 0x80708170,
+	0x8e708a70, 0x8e708170,
+	0xb8fa1605, 0x807a817a,
+	0x8e7a867a, 0x80707a70,
+	0x8070ff70, 0x00000080,
+	0xbef600ff, 0x01000000,
+	0xbefc0080, 0xd28c0002,
+	0x000100c1, 0xd28d0003,
+	0x000204c1, 0x867aff78,
 	0x00400000, 0xbf850003,
 	0xb8faf803, 0x897a7aff,
-	0x10000000, 0xbf850051,
+	0x10000000, 0xbf850030,
+	0x24040682, 0xd86e4000,
+	0x00000002, 0xbf8cc07f,
 	0xbe840080, 0xd2890000,
 	0x00000900, 0x80048104,
 	0xd2890001, 0x00000900,
@@ -1376,427 +1338,411 @@ static const uint32_t cwsr_trap_arcturus_hex[] = {
 	0xc069003a, 0x00000070,
 	0xbf8cc07f, 0x80709070,
 	0xbf06c004, 0xbf84ffee,
-	0xbe840080, 0xd2890000,
-	0x00000902, 0x80048104,
-	0xd2890001, 0x00000902,
-	0x80048104, 0xd2890002,
-	0x00000902, 0x80048104,
-	0xd2890003, 0x00000902,
-	0x80048104, 0xc069003a,
+	0x680404ff, 0x00000200,
+	0xd0c9006a, 0x0000f702,
+	0xbf87ffd2, 0xbf820015,
+	0xd1060002, 0x00011103,
+	0x7e0602ff, 0x00000200,
+	0xbefc00ff, 0x00010000,
+	0xbe800077, 0x8677ff77,
+	0xff7fffff, 0x8777ff77,
+	0x00058000, 0xd8ec0000,
+	0x00000002, 0xbf8cc07f,
+	0xe0765000, 0x701d0002,
+	0x68040702, 0xd0c9006a,
+	0x0000f702, 0xbf87fff7,
+	0xbef70000, 0xbef000ff,
+	0x00000400, 0xbefe00c1,
+	0xbeff00c1, 0xb8fb2a05,
+	0x807b817b, 0x8e7b827b,
+	0xbef600ff, 0x01000000,
+	0xbefc0084, 0xbf0a7b7c,
+	0xbf84006d, 0xbf11017c,
+	0x807bff7b, 0x00001000,
+	0x867aff78, 0x00400000,
+	0xbf850003, 0xb8faf803,
+	0x897a7aff, 0x10000000,
+	0xbf850051, 0xbe840080,
+	0xd2890000, 0x00000900,
+	0x80048104, 0xd2890001,
+	0x00000900, 0x80048104,
+	0xd2890002, 0x00000900,
+	0x80048104, 0xd2890003,
+	0x00000900, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000901, 0x80048104,
+	0xd2890001, 0x00000901,
+	0x80048104, 0xd2890002,
+	0x00000901, 0x80048104,
+	0xd2890003, 0x00000901,
+	0x80048104, 0xc069003a,
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
 	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000903,
+	0xd2890000, 0x00000902,
 	0x80048104, 0xd2890001,
-	0x00000903, 0x80048104,
-	0xd2890002, 0x00000903,
+	0x00000902, 0x80048104,
+	0xd2890002, 0x00000902,
 	0x80048104, 0xd2890003,
-	0x00000903, 0x80048104,
+	0x00000902, 0x80048104,
 	0xc069003a, 0x00000070,
 	0xbf8cc07f, 0x80709070,
 	0xbf06c004, 0xbf84ffee,
-	0x807c847c, 0xbf0a7b7c,
-	0xbf85ffb1, 0xbf9c0000,
-	0xbf820012, 0x7e000300,
-	0x7e020301, 0x7e040302,
-	0x7e060303, 0xe0724000,
-	0x701d0000, 0xe0724100,
-	0x701d0100, 0xe0724200,
-	0x701d0200, 0xe0724300,
-	0x701d0300, 0x807c847c,
-	0x8070ff70, 0x00000400,
-	0xbf0a7b7c, 0xbf85ffef,
-	0xbf9c0000, 0xbefc0080,
-	0xbf11017c, 0x867aff78,
-	0x00400000, 0xbf850003,
-	0xb8faf803, 0x897a7aff,
-	0x10000000, 0xbf850059,
-	0xd3d84000, 0x18000100,
-	0xd3d84001, 0x18000101,
-	0xd3d84002, 0x18000102,
-	0xd3d84003, 0x18000103,
 	0xbe840080, 0xd2890000,
-	0x00000900, 0x80048104,
-	0xd2890001, 0x00000900,
+	0x00000903, 0x80048104,
+	0xd2890001, 0x00000903,
 	0x80048104, 0xd2890002,
-	0x00000900, 0x80048104,
-	0xd2890003, 0x00000900,
+	0x00000903, 0x80048104,
+	0xd2890003, 0x00000903,
 	0x80048104, 0xc069003a,
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000901,
+	0xbf84ffee, 0x807c847c,
+	0xbf0a7b7c, 0xbf85ffb1,
+	0xbf9c0000, 0xbf820012,
+	0x7e000300, 0x7e020301,
+	0x7e040302, 0x7e060303,
+	0xe0724000, 0x701d0000,
+	0xe0724100, 0x701d0100,
+	0xe0724200, 0x701d0200,
+	0xe0724300, 0x701d0300,
+	0x807c847c, 0x8070ff70,
+	0x00000400, 0xbf0a7b7c,
+	0xbf85ffef, 0xbf9c0000,
+	0xbefc0080, 0xbf11017c,
+	0x867aff78, 0x00400000,
+	0xbf850003, 0xb8faf803,
+	0x897a7aff, 0x10000000,
+	0xbf850059, 0xd3d84000,
+	0x18000100, 0xd3d84001,
+	0x18000101, 0xd3d84002,
+	0x18000102, 0xd3d84003,
+	0x18000103, 0xbe840080,
+	0xd2890000, 0x00000900,
 	0x80048104, 0xd2890001,
-	0x00000901, 0x80048104,
-	0xd2890002, 0x00000901,
+	0x00000900, 0x80048104,
+	0xd2890002, 0x00000900,
 	0x80048104, 0xd2890003,
-	0x00000901, 0x80048104,
+	0x00000900, 0x80048104,
 	0xc069003a, 0x00000070,
 	0xbf8cc07f, 0x80709070,
 	0xbf06c004, 0xbf84ffee,
 	0xbe840080, 0xd2890000,
-	0x00000902, 0x80048104,
-	0xd2890001, 0x00000902,
+	0x00000901, 0x80048104,
+	0xd2890001, 0x00000901,
 	0x80048104, 0xd2890002,
-	0x00000902, 0x80048104,
-	0xd2890003, 0x00000902,
+	0x00000901, 0x80048104,
+	0xd2890003, 0x00000901,
 	0x80048104, 0xc069003a,
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
 	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000903,
+	0xd2890000, 0x00000902,
 	0x80048104, 0xd2890001,
-	0x00000903, 0x80048104,
-	0xd2890002, 0x00000903,
+	0x00000902, 0x80048104,
+	0xd2890002, 0x00000902,
 	0x80048104, 0xd2890003,
-	0x00000903, 0x80048104,
+	0x00000902, 0x80048104,
 	0xc069003a, 0x00000070,
 	0xbf8cc07f, 0x80709070,
 	0xbf06c004, 0xbf84ffee,
-	0x807c847c, 0xbf0a7b7c,
-	0xbf85ffa9, 0xbf9c0000,
-	0xbf820016, 0xd3d84000,
-	0x18000100, 0xd3d84001,
-	0x18000101, 0xd3d84002,
-	0x18000102, 0xd3d84003,
-	0x18000103, 0xe0724000,
-	0x701d0000, 0xe0724100,
-	0x701d0100, 0xe0724200,
-	0x701d0200, 0xe0724300,
-	0x701d0300, 0x807c847c,
-	0x8070ff70, 0x00000400,
-	0xbf0a7b7c, 0xbf85ffeb,
-	0xbf9c0000, 0xbf8200e3,
-	0xbef4007e, 0x8675ff7f,
-	0x0000ffff, 0x8775ff75,
-	0x00040000, 0xbef60080,
-	0xbef700ff, 0x00807fac,
-	0x866eff7f, 0x04000000,
-	0xbf84001f, 0xbefe00c1,
-	0xbeff00c1, 0xb8ef4306,
-	0x866fc16f, 0xbf84001a,
-	0x8e6f866f, 0x8e6f826f,
-	0xbef6006f, 0xb8f82a05,
-	0x80788178, 0x8e788a78,
-	0x8e788178, 0xb8ee1605,
-	0x806e816e, 0x8e6e866e,
-	0x80786e78, 0x8078ff78,
-	0x00000080, 0xbef600ff,
-	0x01000000, 0xbefc0080,
-	0xe0510000, 0x781d0000,
-	0xe0510100, 0x781d0000,
-	0x807cff7c, 0x00000200,
-	0x8078ff78, 0x00000200,
-	0xbf0a6f7c, 0xbf85fff6,
+	0xbe840080, 0xd2890000,
+	0x00000903, 0x80048104,
+	0xd2890001, 0x00000903,
+	0x80048104, 0xd2890002,
+	0x00000903, 0x80048104,
+	0xd2890003, 0x00000903,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0x807c847c,
+	0xbf0a7b7c, 0xbf85ffa9,
+	0xbf9c0000, 0xbf820016,
+	0xd3d84000, 0x18000100,
+	0xd3d84001, 0x18000101,
+	0xd3d84002, 0x18000102,
+	0xd3d84003, 0x18000103,
+	0xe0724000, 0x701d0000,
+	0xe0724100, 0x701d0100,
+	0xe0724200, 0x701d0200,
+	0xe0724300, 0x701d0300,
+	0x807c847c, 0x8070ff70,
+	0x00000400, 0xbf0a7b7c,
+	0xbf85ffeb, 0xbf9c0000,
+	0xbf8200e3, 0xbef4007e,
+	0x8675ff7f, 0x0000ffff,
+	0x8775ff75, 0x00040000,
+	0xbef60080, 0xbef700ff,
+	0x00807fac, 0x866eff7f,
+	0x04000000, 0xbf84001f,
 	0xbefe00c1, 0xbeff00c1,
+	0xb8ef4306, 0x866fc16f,
+	0xbf84001a, 0x8e6f866f,
+	0x8e6f826f, 0xbef6006f,
+	0xb8f82a05, 0x80788178,
+	0x8e788a78, 0x8e788178,
+	0xb8ee1605, 0x806e816e,
+	0x8e6e866e, 0x80786e78,
+	0x8078ff78, 0x00000080,
 	0xbef600ff, 0x01000000,
-	0xb8ef2a05, 0x806f816f,
-	0x8e6f826f, 0x806fff6f,
-	0x00008000, 0xbef80080,
-	0xbeee0078, 0x8078ff78,
-	0x00000400, 0xbefc0084,
-	0xbf11087c, 0xe0524000,
-	0x781d0000, 0xe0524100,
-	0x781d0100, 0xe0524200,
-	0x781d0200, 0xe0524300,
-	0x781d0300, 0xbf8c0f70,
-	0x7e000300, 0x7e020301,
-	0x7e040302, 0x7e060303,
-	0x807c847c, 0x8078ff78,
-	0x00000400, 0xbf0a6f7c,
-	0xbf85ffee, 0xbefc0080,
-	0xbf11087c, 0xe0524000,
-	0x781d0000, 0xe0524100,
-	0x781d0100, 0xe0524200,
-	0x781d0200, 0xe0524300,
-	0x781d0300, 0xbf8c0f70,
-	0xd3d94000, 0x18000100,
-	0xd3d94001, 0x18000101,
-	0xd3d94002, 0x18000102,
-	0xd3d94003, 0x18000103,
-	0x807c847c, 0x8078ff78,
-	0x00000400, 0xbf0a6f7c,
-	0xbf85ffea, 0xbf9c0000,
-	0xe0524000, 0x6e1d0000,
-	0xe0524100, 0x6e1d0100,
-	0xe0524200, 0x6e1d0200,
-	0xe0524300, 0x6e1d0300,
-	0xbf8c0f70, 0xb8f82a05,
-	0x80788178, 0x8e788a78,
-	0x8e788178, 0xb8ee1605,
-	0x806e816e, 0x8e6e866e,
-	0x80786e78, 0x80f8c078,
-	0xb8ef1605, 0x806f816f,
-	0x8e6f846f, 0x8e76826f,
-	0xbef600ff, 0x01000000,
-	0xbefc006f, 0xc031003a,
-	0x00000078, 0x80f8c078,
-	0xbf8cc07f, 0x80fc907c,
-	0xbf800000, 0xbe802d00,
-	0xbe822d02, 0xbe842d04,
-	0xbe862d06, 0xbe882d08,
-	0xbe8a2d0a, 0xbe8c2d0c,
-	0xbe8e2d0e, 0xbf06807c,
-	0xbf84fff0, 0xb8f82a05,
-	0x80788178, 0x8e788a78,
-	0x8e788178, 0xb8ee1605,
-	0x806e816e, 0x8e6e866e,
-	0x80786e78, 0xbef60084,
-	0xbef600ff, 0x01000000,
-	0xc0211bfa, 0x00000078,
-	0x80788478, 0xc0211b3a,
+	0xbefc0080, 0xe0510000,
+	0x781d0000, 0xe0510100,
+	0x781d0000, 0x807cff7c,
+	0x00000200, 0x8078ff78,
+	0x00000200, 0xbf0a6f7c,
+	0xbf85fff6, 0xbefe00c1,
+	0xbeff00c1, 0xbef600ff,
+	0x01000000, 0xb8ef2a05,
+	0x806f816f, 0x8e6f826f,
+	0x806fff6f, 0x00008000,
+	0xbef80080, 0xbeee0078,
+	0x8078ff78, 0x00000400,
+	0xbefc0084, 0xbf11087c,
+	0xe0524000, 0x781d0000,
+	0xe0524100, 0x781d0100,
+	0xe0524200, 0x781d0200,
+	0xe0524300, 0x781d0300,
+	0xbf8c0f70, 0x7e000300,
+	0x7e020301, 0x7e040302,
+	0x7e060303, 0x807c847c,
+	0x8078ff78, 0x00000400,
+	0xbf0a6f7c, 0xbf85ffee,
+	0xbefc0080, 0xbf11087c,
+	0xe0524000, 0x781d0000,
+	0xe0524100, 0x781d0100,
+	0xe0524200, 0x781d0200,
+	0xe0524300, 0x781d0300,
+	0xbf8c0f70, 0xd3d94000,
+	0x18000100, 0xd3d94001,
+	0x18000101, 0xd3d94002,
+	0x18000102, 0xd3d94003,
+	0x18000103, 0x807c847c,
+	0x8078ff78, 0x00000400,
+	0xbf0a6f7c, 0xbf85ffea,
+	0xbf9c0000, 0xe0524000,
+	0x6e1d0000, 0xe0524100,
+	0x6e1d0100, 0xe0524200,
+	0x6e1d0200, 0xe0524300,
+	0x6e1d0300, 0xbf8c0f70,
+	0xb8f82a05, 0x80788178,
+	0x8e788a78, 0x8e788178,
+	0xb8ee1605, 0x806e816e,
+	0x8e6e866e, 0x80786e78,
+	0x80f8c078, 0xb8ef1605,
+	0x806f816f, 0x8e6f846f,
+	0x8e76826f, 0xbef600ff,
+	0x01000000, 0xbefc006f,
+	0xc031003a, 0x00000078,
+	0x80f8c078, 0xbf8cc07f,
+	0x80fc907c, 0xbf800000,
+	0xbe802d00, 0xbe822d02,
+	0xbe842d04, 0xbe862d06,
+	0xbe882d08, 0xbe8a2d0a,
+	0xbe8c2d0c, 0xbe8e2d0e,
+	0xbf06807c, 0xbf84fff0,
+	0xb8f82a05, 0x80788178,
+	0x8e788a78, 0x8e788178,
+	0xb8ee1605, 0x806e816e,
+	0x8e6e866e, 0x80786e78,
+	0xbef60084, 0xbef600ff,
+	0x01000000, 0xc0211bfa,
 	0x00000078, 0x80788478,
-	0xc0211b7a, 0x00000078,
-	0x80788478, 0xc0211c3a,
+	0xc0211b3a, 0x00000078,
+	0x80788478, 0xc0211b7a,
 	0x00000078, 0x80788478,
-	0xc0211c7a, 0x00000078,
-	0x80788478, 0xc0211eba,
+	0xc0211c3a, 0x00000078,
+	0x80788478, 0xc0211c7a,
 	0x00000078, 0x80788478,
-	0xc0211efa, 0x00000078,
-	0x80788478, 0xc0211a3a,
+	0xc0211eba, 0x00000078,
+	0x80788478, 0xc0211efa,
 	0x00000078, 0x80788478,
-	0xc0211a7a, 0x00000078,
-	0x80788478, 0xc0211cfa,
+	0xc0211a3a, 0x00000078,
+	0x80788478, 0xc0211a7a,
 	0x00000078, 0x80788478,
-	0xbf8cc07f, 0xbefc006f,
-	0xbefe0070, 0xbeff0071,
-	0x866f7bff, 0x000003ff,
-	0xb96f4803, 0x866f7bff,
-	0xfffff800, 0x8f6f8b6f,
-	0xb96fa2c3, 0xb973f801,
-	0xb8ee2a05, 0x806e816e,
-	0x8e6e8a6e, 0x8e6e816e,
-	0xb8ef1605, 0x806f816f,
-	0x8e6f866f, 0x806e6f6e,
-	0x806e746e, 0x826f8075,
-	0x866fff6f, 0x0000ffff,
-	0xc00b1c37, 0x00000050,
-	0xc00b1d37, 0x00000060,
-	0xc0031e77, 0x00000074,
-	0xbf8cc07f, 0x8f6e8b77,
-	0x866eff6e, 0x001f8000,
-	0xb96ef807, 0x866dff6d,
-	0x0000ffff, 0x86fe7e7e,
-	0x86ea6a6a, 0x8f6e837a,
-	0xb96ee0c2, 0xbf800002,
-	0xb97a0002, 0xbf8a0000,
-	0xbe801f6c, 0xbf810000,
+	0xc0211cfa, 0x00000078,
+	0x80788478, 0xbf8cc07f,
+	0xbefc006f, 0xbefe0070,
+	0xbeff0071, 0x866f7bff,
+	0x000003ff, 0xb96f4803,
+	0x866f7bff, 0xfffff800,
+	0x8f6f8b6f, 0xb96fa2c3,
+	0xb973f801, 0xb8ee2a05,
+	0x806e816e, 0x8e6e8a6e,
+	0x8e6e816e, 0xb8ef1605,
+	0x806f816f, 0x8e6f866f,
+	0x806e6f6e, 0x806e746e,
+	0x826f8075, 0x866fff6f,
+	0x0000ffff, 0xc00b1c37,
+	0x00000050, 0xc00b1d37,
+	0x00000060, 0xc0031e77,
+	0x00000074, 0xbf8cc07f,
+	0x8f6e8b77, 0x866eff6e,
+	0x001f8000, 0xb96ef807,
+	0x866dff6d, 0x0000ffff,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e837a, 0xb96ee0c2,
+	0xbf800002, 0xb97a0002,
+	0xbf8a0000, 0xbe801f6c,
+	0xbf810000, 0x00000000,
 };
 
 static const uint32_t cwsr_trap_aldebaran_hex[] = {
-	0xbf820001, 0xbf8202e5,
+	0xbf820001, 0xbf8202ea,
 	0xb8f8f802, 0x8978ff78,
 	0x00020006, 0xb8fbf803,
 	0x866eff78, 0x00002000,
 	0xbf840009, 0x866eff6d,
-	0x00ff0000, 0xbf85001e,
+	0x00ff0000, 0xbf850023,
 	0x866eff7b, 0x00000400,
-	0xbf85005b, 0xbf8e0010,
+	0xbf850060, 0xbf8e0010,
 	0xb8fbf803, 0xbf82fffa,
 	0x866eff7b, 0x03c00900,
-	0xbf850015, 0x866eff7b,
-	0x000071ff, 0xbf840008,
-	0x866fff7b, 0x00007080,
-	0xbf840001, 0xbeee1a87,
-	0xb8eff801, 0x8e6e8c6e,
-	0x866e6f6e, 0xbf85000a,
-	0x866eff6d, 0x00ff0000,
-	0xbf850007, 0xb8eef801,
-	0x866eff6e, 0x00000800,
-	0xbf850003, 0x866eff7b,
-	0x00000400, 0xbf850040,
-	0xb8faf807, 0x867aff7a,
-	0x001f8000, 0x8e7a8b7a,
-	0x8977ff77, 0xfc000000,
-	0x87777a77, 0xba7ff807,
-	0x00000000, 0xb8faf812,
-	0xb8fbf813, 0x8efa887a,
-	0xbf0d8f7b, 0xbf840002,
-	0x877bff7b, 0xffff0000,
-	0xc0031c3d, 0x00000010,
-	0xc0071bbd, 0x00000000,
-	0xc0071ebd, 0x00000008,
-	0xbf8cc07f, 0x8671ff6d,
-	0x01000000, 0xbf840004,
-	0x92f1ff70, 0x00010001,
-	0xbf840016, 0xbf820005,
-	0x86708170, 0x8e709770,
-	0x8977ff77, 0x00800000,
-	0x87777077, 0x86ee6e6e,
-	0xbf840001, 0xbe801d6e,
-	0x866eff6d, 0x01ff0000,
-	0xbf850005, 0x8778ff78,
-	0x00002000, 0x80ec886c,
-	0x82ed806d, 0xbf820005,
-	0x866eff6d, 0x01000000,
-	0xbf850002, 0x806c846c,
-	0x826d806d, 0x866dff6d,
-	0x0000ffff, 0x8f7a8b77,
+	0xbf85001a, 0x866eff6d,
+	0x01ff0000, 0xbf06ff6e,
+	0x01040000, 0xbf850015,
+	0x866eff7b, 0x000071ff,
+	0xbf840008, 0x866fff7b,
+	0x00007080, 0xbf840001,
+	0xbeee1a87, 0xb8eff801,
+	0x8e6e8c6e, 0x866e6f6e,
+	0xbf85000a, 0x866eff6d,
+	0x00ff0000, 0xbf850007,
+	0xb8eef801, 0x866eff6e,
+	0x00000800, 0xbf850003,
+	0x866eff7b, 0x00000400,
+	0xbf850040, 0xb8faf807,
 	0x867aff7a, 0x001f8000,
-	0xb97af807, 0x86fe7e7e,
-	0x86ea6a6a, 0x8f6e8378,
-	0xb96ee0c2, 0xbf800002,
-	0xb9780002, 0xbe801f6c,
-	0x866dff6d, 0x0000ffff,
-	0xbefa0080, 0xb97a0283,
-	0xb8faf807, 0x867aff7a,
-	0x001f8000, 0x8e7a8b7a,
-	0x8977ff77, 0xfc000000,
-	0x87777a77, 0xba7ff807,
-	0x00000000, 0xbeee007e,
-	0xbeef007f, 0xbefe0180,
-	0xbf900004, 0x877a8478,
-	0xb97af802, 0xbf8e0002,
-	0xbf88fffe, 0xb8fa2985,
-	0x807a817a, 0x8e7a8a7a,
-	0x8e7a817a, 0xb8fb1605,
-	0x807b817b, 0x8e7b867b,
-	0x807a7b7a, 0x807a7e7a,
-	0x827b807f, 0x867bff7b,
-	0x0000ffff, 0xc04b1c3d,
-	0x00000050, 0xbf8cc07f,
-	0xc04b1d3d, 0x00000060,
-	0xbf8cc07f, 0xc0431e7d,
-	0x00000074, 0xbf8cc07f,
-	0xbef4007e, 0x8675ff7f,
-	0x0000ffff, 0x8775ff75,
-	0x00040000, 0xbef60080,
-	0xbef700ff, 0x00807fac,
-	0xbef1007c, 0xbef00080,
-	0xb8f02985, 0x80708170,
-	0x8e708a70, 0x8e708170,
-	0xb8fa1605, 0x807a817a,
-	0x8e7a867a, 0x80707a70,
-	0xbef60084, 0xbef600ff,
-	0x01000000, 0xbefe007c,
-	0xbefc0070, 0xc0611c7a,
-	0x0000007c, 0xbf8cc07f,
-	0x80708470, 0xbefc007e,
+	0x8e7a8b7a, 0x8977ff77,
+	0xfc000000, 0x87777a77,
+	0xba7ff807, 0x00000000,
+	0xb8faf812, 0xb8fbf813,
+	0x8efa887a, 0xbf0d8f7b,
+	0xbf840002, 0x877bff7b,
+	0xffff0000, 0xc0031c3d,
+	0x00000010, 0xc0071bbd,
+	0x00000000, 0xc0071ebd,
+	0x00000008, 0xbf8cc07f,
+	0x8671ff6d, 0x01000000,
+	0xbf840004, 0x92f1ff70,
+	0x00010001, 0xbf840016,
+	0xbf820005, 0x86708170,
+	0x8e709770, 0x8977ff77,
+	0x00800000, 0x87777077,
+	0x86ee6e6e, 0xbf840001,
+	0xbe801d6e, 0x866eff6d,
+	0x01ff0000, 0xbf850005,
+	0x8778ff78, 0x00002000,
+	0x80ec886c, 0x82ed806d,
+	0xbf820005, 0x866eff6d,
+	0x01000000, 0xbf850002,
+	0x806c846c, 0x826d806d,
+	0x866dff6d, 0x0000ffff,
+	0x8f7a8b77, 0x867aff7a,
+	0x001f8000, 0xb97af807,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e8378, 0xb96ee0c2,
+	0xbf800002, 0xb9780002,
+	0xbe801f6c, 0x866dff6d,
+	0x0000ffff, 0xbefa0080,
+	0xb97a0283, 0xb8faf807,
+	0x867aff7a, 0x001f8000,
+	0x8e7a8b7a, 0x8977ff77,
+	0xfc000000, 0x87777a77,
+	0xba7ff807, 0x00000000,
+	0xbeee007e, 0xbeef007f,
+	0xbefe0180, 0xbf900004,
+	0x877a8478, 0xb97af802,
+	0xbf8e0002, 0xbf88fffe,
+	0xb8fa2985, 0x807a817a,
+	0x8e7a8a7a, 0x8e7a817a,
+	0xb8fb1605, 0x807b817b,
+	0x8e7b867b, 0x807a7b7a,
+	0x807a7e7a, 0x827b807f,
+	0x867bff7b, 0x0000ffff,
+	0xc04b1c3d, 0x00000050,
+	0xbf8cc07f, 0xc04b1d3d,
+	0x00000060, 0xbf8cc07f,
+	0xc0431e7d, 0x00000074,
+	0xbf8cc07f, 0xbef4007e,
+	0x8675ff7f, 0x0000ffff,
+	0x8775ff75, 0x00040000,
+	0xbef60080, 0xbef700ff,
+	0x00807fac, 0xbef1007c,
+	0xbef00080, 0xb8f02985,
+	0x80708170, 0x8e708a70,
+	0x8e708170, 0xb8fa1605,
+	0x807a817a, 0x8e7a867a,
+	0x80707a70, 0xbef60084,
+	0xbef600ff, 0x01000000,
 	0xbefe007c, 0xbefc0070,
-	0xc0611b3a, 0x0000007c,
+	0xc0611c7a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611b7a,
+	0xbefc0070, 0xc0611b3a,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611bba, 0x0000007c,
+	0xc0611b7a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611bfa,
+	0xbefc0070, 0xc0611bba,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611e3a, 0x0000007c,
-	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0xb8fbf803,
-	0xbefe007c, 0xbefc0070,
-	0xc0611efa, 0x0000007c,
+	0xc0611bfa, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc0070, 0xc0611a3a,
+	0xbefc0070, 0xc0611e3a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8fbf803, 0xbefe007c,
+	0xbefc0070, 0xc0611efa,
 	0x0000007c, 0xbf8cc07f,
 	0x80708470, 0xbefc007e,
 	0xbefe007c, 0xbefc0070,
-	0xc0611a7a, 0x0000007c,
-	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0xb8f1f801,
-	0xbefe007c, 0xbefc0070,
-	0xc0611c7a, 0x0000007c,
+	0xc0611a3a, 0x0000007c,
 	0xbf8cc07f, 0x80708470,
-	0xbefc007e, 0x867aff7f,
-	0x04000000, 0xbeef0080,
-	0x876f6f7a, 0xb8f02985,
-	0x80708170, 0x8e708a70,
-	0x8e708170, 0xb8fb1605,
-	0x807b817b, 0x8e7b847b,
-	0x8e76827b, 0xbef600ff,
-	0x01000000, 0xbef20174,
-	0x80747074, 0x82758075,
-	0xbefc0080, 0xbf800000,
-	0xbe802b00, 0xbe822b02,
-	0xbe842b04, 0xbe862b06,
-	0xbe882b08, 0xbe8a2b0a,
-	0xbe8c2b0c, 0xbe8e2b0e,
-	0xc06b003a, 0x00000000,
-	0xbf8cc07f, 0xc06b013a,
-	0x00000010, 0xbf8cc07f,
-	0xc06b023a, 0x00000020,
-	0xbf8cc07f, 0xc06b033a,
-	0x00000030, 0xbf8cc07f,
-	0x8074c074, 0x82758075,
-	0x807c907c, 0xbf0a7b7c,
-	0xbf85ffe7, 0xbef40172,
-	0xbef00080, 0xbefe00c1,
-	0xbeff00c1, 0xbee80080,
-	0xbee90080, 0xbef600ff,
-	0x01000000, 0x867aff78,
-	0x00400000, 0xbf850003,
-	0xb8faf803, 0x897a7aff,
-	0x10000000, 0xbf85004d,
-	0xbe840080, 0xd2890000,
-	0x00000900, 0x80048104,
-	0xd2890001, 0x00000900,
-	0x80048104, 0xd2890002,
-	0x00000900, 0x80048104,
-	0xd2890003, 0x00000900,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000901,
-	0x80048104, 0xd2890001,
-	0x00000901, 0x80048104,
-	0xd2890002, 0x00000901,
-	0x80048104, 0xd2890003,
-	0x00000901, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbe840080, 0xd2890000,
-	0x00000902, 0x80048104,
-	0xd2890001, 0x00000902,
-	0x80048104, 0xd2890002,
-	0x00000902, 0x80048104,
-	0xd2890003, 0x00000902,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000903,
-	0x80048104, 0xd2890001,
-	0x00000903, 0x80048104,
-	0xd2890002, 0x00000903,
-	0x80048104, 0xd2890003,
-	0x00000903, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbf820008, 0xe0724000,
-	0x701d0000, 0xe0724100,
-	0x701d0100, 0xe0724200,
-	0x701d0200, 0xe0724300,
-	0x701d0300, 0xbefe00c1,
-	0xbeff00c1, 0xb8fb4306,
-	0x867bc17b, 0xbf840064,
-	0xbf8a0000, 0x867aff6f,
-	0x04000000, 0xbf840060,
-	0x8e7b867b, 0x8e7b827b,
-	0xbef6007b, 0xb8f02985,
-	0x80708170, 0x8e708a70,
-	0x8e708170, 0xb8fa1605,
-	0x807a817a, 0x8e7a867a,
-	0x80707a70, 0x8070ff70,
-	0x00000080, 0xbef600ff,
-	0x01000000, 0xbefc0080,
-	0xd28c0002, 0x000100c1,
-	0xd28d0003, 0x000204c1,
+	0xbefc007e, 0xbefe007c,
+	0xbefc0070, 0xc0611a7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8f1f801, 0xbefe007c,
+	0xbefc0070, 0xc0611c7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0x867aff7f, 0x04000000,
+	0xbeef0080, 0x876f6f7a,
+	0xb8f02985, 0x80708170,
+	0x8e708a70, 0x8e708170,
+	0xb8fb1605, 0x807b817b,
+	0x8e7b847b, 0x8e76827b,
+	0xbef600ff, 0x01000000,
+	0xbef20174, 0x80747074,
+	0x82758075, 0xbefc0080,
+	0xbf800000, 0xbe802b00,
+	0xbe822b02, 0xbe842b04,
+	0xbe862b06, 0xbe882b08,
+	0xbe8a2b0a, 0xbe8c2b0c,
+	0xbe8e2b0e, 0xc06b003a,
+	0x00000000, 0xbf8cc07f,
+	0xc06b013a, 0x00000010,
+	0xbf8cc07f, 0xc06b023a,
+	0x00000020, 0xbf8cc07f,
+	0xc06b033a, 0x00000030,
+	0xbf8cc07f, 0x8074c074,
+	0x82758075, 0x807c907c,
+	0xbf0a7b7c, 0xbf85ffe7,
+	0xbef40172, 0xbef00080,
+	0xbefe00c1, 0xbeff00c1,
+	0xbee80080, 0xbee90080,
+	0xbef600ff, 0x01000000,
 	0x867aff78, 0x00400000,
 	0xbf850003, 0xb8faf803,
 	0x897a7aff, 0x10000000,
-	0xbf850030, 0x24040682,
-	0xd86e4000, 0x00000002,
-	0xbf8cc07f, 0xbe840080,
+	0xbf85004d, 0xbe840080,
 	0xd2890000, 0x00000900,
 	0x80048104, 0xd2890001,
 	0x00000900, 0x80048104,
@@ -1815,31 +1761,50 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = {
 	0x80048104, 0xc069003a,
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0x680404ff,
-	0x00000200, 0xd0c9006a,
-	0x0000f702, 0xbf87ffd2,
-	0xbf820015, 0xd1060002,
-	0x00011103, 0x7e0602ff,
-	0x00000200, 0xbefc00ff,
-	0x00010000, 0xbe800077,
-	0x8677ff77, 0xff7fffff,
-	0x8777ff77, 0x00058000,
-	0xd8ec0000, 0x00000002,
-	0xbf8cc07f, 0xe0765000,
-	0x701d0002, 0x68040702,
-	0xd0c9006a, 0x0000f702,
-	0xbf87fff7, 0xbef70000,
-	0xbef000ff, 0x00000400,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000902,
+	0x80048104, 0xd2890001,
+	0x00000902, 0x80048104,
+	0xd2890002, 0x00000902,
+	0x80048104, 0xd2890003,
+	0x00000902, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000903, 0x80048104,
+	0xd2890001, 0x00000903,
+	0x80048104, 0xd2890002,
+	0x00000903, 0x80048104,
+	0xd2890003, 0x00000903,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbf820008,
+	0xe0724000, 0x701d0000,
+	0xe0724100, 0x701d0100,
+	0xe0724200, 0x701d0200,
+	0xe0724300, 0x701d0300,
 	0xbefe00c1, 0xbeff00c1,
-	0xb8fb2b05, 0x807b817b,
-	0x8e7b827b, 0xbef600ff,
-	0x01000000, 0xbefc0084,
-	0xbf0a7b7c, 0xbf84006d,
-	0xbf11017c, 0x807bff7b,
-	0x00001000, 0x867aff78,
+	0xb8fb4306, 0x867bc17b,
+	0xbf840064, 0xbf8a0000,
+	0x867aff6f, 0x04000000,
+	0xbf840060, 0x8e7b867b,
+	0x8e7b827b, 0xbef6007b,
+	0xb8f02985, 0x80708170,
+	0x8e708a70, 0x8e708170,
+	0xb8fa1605, 0x807a817a,
+	0x8e7a867a, 0x80707a70,
+	0x8070ff70, 0x00000080,
+	0xbef600ff, 0x01000000,
+	0xbefc0080, 0xd28c0002,
+	0x000100c1, 0xd28d0003,
+	0x000204c1, 0x867aff78,
 	0x00400000, 0xbf850003,
 	0xb8faf803, 0x897a7aff,
-	0x10000000, 0xbf850051,
+	0x10000000, 0xbf850030,
+	0x24040682, 0xd86e4000,
+	0x00000002, 0xbf8cc07f,
 	0xbe840080, 0xd2890000,
 	0x00000900, 0x80048104,
 	0xd2890001, 0x00000900,
@@ -1856,54 +1821,34 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = {
 	0xd2890002, 0x00000901,
 	0x80048104, 0xd2890003,
 	0x00000901, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0xbe840080, 0xd2890000,
-	0x00000902, 0x80048104,
-	0xd2890001, 0x00000902,
-	0x80048104, 0xd2890002,
-	0x00000902, 0x80048104,
-	0xd2890003, 0x00000902,
-	0x80048104, 0xc069003a,
-	0x00000070, 0xbf8cc07f,
-	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000903,
-	0x80048104, 0xd2890001,
-	0x00000903, 0x80048104,
-	0xd2890002, 0x00000903,
-	0x80048104, 0xd2890003,
-	0x00000903, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0x807c847c, 0xbf0a7b7c,
-	0xbf85ffb1, 0xbf9c0000,
-	0xbf820012, 0x7e000300,
-	0x7e020301, 0x7e040302,
-	0x7e060303, 0xe0724000,
-	0x701d0000, 0xe0724100,
-	0x701d0100, 0xe0724200,
-	0x701d0200, 0xe0724300,
-	0x701d0300, 0x807c847c,
-	0x8070ff70, 0x00000400,
-	0xbf0a7b7c, 0xbf85ffef,
-	0xbf9c0000, 0xb8fb2985,
-	0x807b817b, 0x8e7b837b,
-	0xb8fa2b05, 0x807a817a,
-	0x8e7a827a, 0x80fb7a7b,
-	0x867b7b7b, 0xbf84007a,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0x680404ff, 0x00000200,
+	0xd0c9006a, 0x0000f702,
+	0xbf87ffd2, 0xbf820015,
+	0xd1060002, 0x00011103,
+	0x7e0602ff, 0x00000200,
+	0xbefc00ff, 0x00010000,
+	0xbe800077, 0x8677ff77,
+	0xff7fffff, 0x8777ff77,
+	0x00058000, 0xd8ec0000,
+	0x00000002, 0xbf8cc07f,
+	0xe0765000, 0x701d0002,
+	0x68040702, 0xd0c9006a,
+	0x0000f702, 0xbf87fff7,
+	0xbef70000, 0xbef000ff,
+	0x00000400, 0xbefe00c1,
+	0xbeff00c1, 0xb8fb2b05,
+	0x807b817b, 0x8e7b827b,
+	0xbef600ff, 0x01000000,
+	0xbefc0084, 0xbf0a7b7c,
+	0xbf84006d, 0xbf11017c,
 	0x807bff7b, 0x00001000,
-	0xbefc0080, 0xbf11017c,
 	0x867aff78, 0x00400000,
 	0xbf850003, 0xb8faf803,
 	0x897a7aff, 0x10000000,
-	0xbf850059, 0xd3d84000,
-	0x18000100, 0xd3d84001,
-	0x18000101, 0xd3d84002,
-	0x18000102, 0xd3d84003,
-	0x18000103, 0xbe840080,
+	0xbf850051, 0xbe840080,
 	0xd2890000, 0x00000900,
 	0x80048104, 0xd2890001,
 	0x00000900, 0x80048104,
@@ -1942,139 +1887,203 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = {
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
 	0xbf84ffee, 0x807c847c,
-	0xbf0a7b7c, 0xbf85ffa9,
-	0xbf9c0000, 0xbf820016,
-	0xd3d84000, 0x18000100,
-	0xd3d84001, 0x18000101,
-	0xd3d84002, 0x18000102,
-	0xd3d84003, 0x18000103,
+	0xbf0a7b7c, 0xbf85ffb1,
+	0xbf9c0000, 0xbf820012,
+	0x7e000300, 0x7e020301,
+	0x7e040302, 0x7e060303,
 	0xe0724000, 0x701d0000,
 	0xe0724100, 0x701d0100,
 	0xe0724200, 0x701d0200,
 	0xe0724300, 0x701d0300,
 	0x807c847c, 0x8070ff70,
 	0x00000400, 0xbf0a7b7c,
-	0xbf85ffeb, 0xbf9c0000,
-	0xbf8200ee, 0xbef4007e,
-	0x8675ff7f, 0x0000ffff,
-	0x8775ff75, 0x00040000,
-	0xbef60080, 0xbef700ff,
-	0x00807fac, 0x866eff7f,
-	0x04000000, 0xbf84001f,
+	0xbf85ffef, 0xbf9c0000,
+	0xb8fb2985, 0x807b817b,
+	0x8e7b837b, 0xb8fa2b05,
+	0x807a817a, 0x8e7a827a,
+	0x80fb7a7b, 0x867b7b7b,
+	0xbf84007a, 0x807bff7b,
+	0x00001000, 0xbefc0080,
+	0xbf11017c, 0x867aff78,
+	0x00400000, 0xbf850003,
+	0xb8faf803, 0x897a7aff,
+	0x10000000, 0xbf850059,
+	0xd3d84000, 0x18000100,
+	0xd3d84001, 0x18000101,
+	0xd3d84002, 0x18000102,
+	0xd3d84003, 0x18000103,
+	0xbe840080, 0xd2890000,
+	0x00000900, 0x80048104,
+	0xd2890001, 0x00000900,
+	0x80048104, 0xd2890002,
+	0x00000900, 0x80048104,
+	0xd2890003, 0x00000900,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000901,
+	0x80048104, 0xd2890001,
+	0x00000901, 0x80048104,
+	0xd2890002, 0x00000901,
+	0x80048104, 0xd2890003,
+	0x00000901, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000902, 0x80048104,
+	0xd2890001, 0x00000902,
+	0x80048104, 0xd2890002,
+	0x00000902, 0x80048104,
+	0xd2890003, 0x00000902,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000903,
+	0x80048104, 0xd2890001,
+	0x00000903, 0x80048104,
+	0xd2890002, 0x00000903,
+	0x80048104, 0xd2890003,
+	0x00000903, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0x807c847c, 0xbf0a7b7c,
+	0xbf85ffa9, 0xbf9c0000,
+	0xbf820016, 0xd3d84000,
+	0x18000100, 0xd3d84001,
+	0x18000101, 0xd3d84002,
+	0x18000102, 0xd3d84003,
+	0x18000103, 0xe0724000,
+	0x701d0000, 0xe0724100,
+	0x701d0100, 0xe0724200,
+	0x701d0200, 0xe0724300,
+	0x701d0300, 0x807c847c,
+	0x8070ff70, 0x00000400,
+	0xbf0a7b7c, 0xbf85ffeb,
+	0xbf9c0000, 0xbf8200ee,
+	0xbef4007e, 0x8675ff7f,
+	0x0000ffff, 0x8775ff75,
+	0x00040000, 0xbef60080,
+	0xbef700ff, 0x00807fac,
+	0x866eff7f, 0x04000000,
+	0xbf84001f, 0xbefe00c1,
+	0xbeff00c1, 0xb8ef4306,
+	0x866fc16f, 0xbf84001a,
+	0x8e6f866f, 0x8e6f826f,
+	0xbef6006f, 0xb8f82985,
+	0x80788178, 0x8e788a78,
+	0x8e788178, 0xb8ee1605,
+	0x806e816e, 0x8e6e866e,
+	0x80786e78, 0x8078ff78,
+	0x00000080, 0xbef600ff,
+	0x01000000, 0xbefc0080,
+	0xe0510000, 0x781d0000,
+	0xe0510100, 0x781d0000,
+	0x807cff7c, 0x00000200,
+	0x8078ff78, 0x00000200,
+	0xbf0a6f7c, 0xbf85fff6,
 	0xbefe00c1, 0xbeff00c1,
-	0xb8ef4306, 0x866fc16f,
-	0xbf84001a, 0x8e6f866f,
-	0x8e6f826f, 0xbef6006f,
-	0xb8f82985, 0x80788178,
-	0x8e788a78, 0x8e788178,
-	0xb8ee1605, 0x806e816e,
-	0x8e6e866e, 0x80786e78,
-	0x8078ff78, 0x00000080,
 	0xbef600ff, 0x01000000,
-	0xbefc0080, 0xe0510000,
-	0x781d0000, 0xe0510100,
-	0x781d0000, 0x807cff7c,
-	0x00000200, 0x8078ff78,
-	0x00000200, 0xbf0a6f7c,
-	0xbf85fff6, 0xbefe00c1,
-	0xbeff00c1, 0xbef600ff,
-	0x01000000, 0xb8ef2b05,
-	0x806f816f, 0x8e6f826f,
-	0x806fff6f, 0x00008000,
-	0xbef80080, 0xbeee0078,
-	0x8078ff78, 0x00000400,
-	0xbefc0084, 0xbf11087c,
-	0xe0524000, 0x781d0000,
-	0xe0524100, 0x781d0100,
-	0xe0524200, 0x781d0200,
-	0xe0524300, 0x781d0300,
-	0xbf8c0f70, 0x7e000300,
-	0x7e020301, 0x7e040302,
-	0x7e060303, 0x807c847c,
-	0x8078ff78, 0x00000400,
-	0xbf0a6f7c, 0xbf85ffee,
-	0xb8ef2985, 0x806f816f,
-	0x8e6f836f, 0xb8f92b05,
-	0x80798179, 0x8e798279,
-	0x80ef796f, 0x866f6f6f,
-	0xbf84001a, 0x806fff6f,
-	0x00008000, 0xbefc0080,
+	0xb8ef2b05, 0x806f816f,
+	0x8e6f826f, 0x806fff6f,
+	0x00008000, 0xbef80080,
+	0xbeee0078, 0x8078ff78,
+	0x00000400, 0xbefc0084,
 	0xbf11087c, 0xe0524000,
 	0x781d0000, 0xe0524100,
 	0x781d0100, 0xe0524200,
 	0x781d0200, 0xe0524300,
 	0x781d0300, 0xbf8c0f70,
-	0xd3d94000, 0x18000100,
-	0xd3d94001, 0x18000101,
-	0xd3d94002, 0x18000102,
-	0xd3d94003, 0x18000103,
+	0x7e000300, 0x7e020301,
+	0x7e040302, 0x7e060303,
 	0x807c847c, 0x8078ff78,
 	0x00000400, 0xbf0a6f7c,
-	0xbf85ffea, 0xbf9c0000,
-	0xe0524000, 0x6e1d0000,
-	0xe0524100, 0x6e1d0100,
-	0xe0524200, 0x6e1d0200,
-	0xe0524300, 0x6e1d0300,
-	0xbf8c0f70, 0xb8f82985,
-	0x80788178, 0x8e788a78,
-	0x8e788178, 0xb8ee1605,
-	0x806e816e, 0x8e6e866e,
-	0x80786e78, 0x80f8c078,
-	0xb8ef1605, 0x806f816f,
-	0x8e6f846f, 0x8e76826f,
-	0xbef600ff, 0x01000000,
-	0xbefc006f, 0xc031003a,
-	0x00000078, 0x80f8c078,
-	0xbf8cc07f, 0x80fc907c,
-	0xbf800000, 0xbe802d00,
-	0xbe822d02, 0xbe842d04,
-	0xbe862d06, 0xbe882d08,
-	0xbe8a2d0a, 0xbe8c2d0c,
-	0xbe8e2d0e, 0xbf06807c,
-	0xbf84fff0, 0xb8f82985,
-	0x80788178, 0x8e788a78,
-	0x8e788178, 0xb8ee1605,
-	0x806e816e, 0x8e6e866e,
-	0x80786e78, 0xbef60084,
-	0xbef600ff, 0x01000000,
-	0xc0211bfa, 0x00000078,
-	0x80788478, 0xc0211b3a,
+	0xbf85ffee, 0xb8ef2985,
+	0x806f816f, 0x8e6f836f,
+	0xb8f92b05, 0x80798179,
+	0x8e798279, 0x80ef796f,
+	0x866f6f6f, 0xbf84001a,
+	0x806fff6f, 0x00008000,
+	0xbefc0080, 0xbf11087c,
+	0xe0524000, 0x781d0000,
+	0xe0524100, 0x781d0100,
+	0xe0524200, 0x781d0200,
+	0xe0524300, 0x781d0300,
+	0xbf8c0f70, 0xd3d94000,
+	0x18000100, 0xd3d94001,
+	0x18000101, 0xd3d94002,
+	0x18000102, 0xd3d94003,
+	0x18000103, 0x807c847c,
+	0x8078ff78, 0x00000400,
+	0xbf0a6f7c, 0xbf85ffea,
+	0xbf9c0000, 0xe0524000,
+	0x6e1d0000, 0xe0524100,
+	0x6e1d0100, 0xe0524200,
+	0x6e1d0200, 0xe0524300,
+	0x6e1d0300, 0xbf8c0f70,
+	0xb8f82985, 0x80788178,
+	0x8e788a78, 0x8e788178,
+	0xb8ee1605, 0x806e816e,
+	0x8e6e866e, 0x80786e78,
+	0x80f8c078, 0xb8ef1605,
+	0x806f816f, 0x8e6f846f,
+	0x8e76826f, 0xbef600ff,
+	0x01000000, 0xbefc006f,
+	0xc031003a, 0x00000078,
+	0x80f8c078, 0xbf8cc07f,
+	0x80fc907c, 0xbf800000,
+	0xbe802d00, 0xbe822d02,
+	0xbe842d04, 0xbe862d06,
+	0xbe882d08, 0xbe8a2d0a,
+	0xbe8c2d0c, 0xbe8e2d0e,
+	0xbf06807c, 0xbf84fff0,
+	0xb8f82985, 0x80788178,
+	0x8e788a78, 0x8e788178,
+	0xb8ee1605, 0x806e816e,
+	0x8e6e866e, 0x80786e78,
+	0xbef60084, 0xbef600ff,
+	0x01000000, 0xc0211bfa,
 	0x00000078, 0x80788478,
-	0xc0211b7a, 0x00000078,
-	0x80788478, 0xc0211c3a,
+	0xc0211b3a, 0x00000078,
+	0x80788478, 0xc0211b7a,
 	0x00000078, 0x80788478,
-	0xc0211c7a, 0x00000078,
-	0x80788478, 0xc0211eba,
+	0xc0211c3a, 0x00000078,
+	0x80788478, 0xc0211c7a,
 	0x00000078, 0x80788478,
-	0xc0211efa, 0x00000078,
-	0x80788478, 0xc0211a3a,
+	0xc0211eba, 0x00000078,
+	0x80788478, 0xc0211efa,
 	0x00000078, 0x80788478,
-	0xc0211a7a, 0x00000078,
-	0x80788478, 0xc0211cfa,
+	0xc0211a3a, 0x00000078,
+	0x80788478, 0xc0211a7a,
 	0x00000078, 0x80788478,
-	0xbf8cc07f, 0xbefc006f,
-	0xbefe0070, 0xbeff0071,
-	0x866f7bff, 0x000003ff,
-	0xb96f4803, 0x866f7bff,
-	0xfffff800, 0x8f6f8b6f,
-	0xb96fa2c3, 0xb973f801,
-	0xb8ee2985, 0x806e816e,
-	0x8e6e8a6e, 0x8e6e816e,
-	0xb8ef1605, 0x806f816f,
-	0x8e6f866f, 0x806e6f6e,
-	0x806e746e, 0x826f8075,
-	0x866fff6f, 0x0000ffff,
-	0xc00b1c37, 0x00000050,
-	0xc00b1d37, 0x00000060,
-	0xc0031e77, 0x00000074,
-	0xbf8cc07f, 0x8f6e8b77,
-	0x866eff6e, 0x001f8000,
-	0xb96ef807, 0x866dff6d,
-	0x0000ffff, 0x86fe7e7e,
-	0x86ea6a6a, 0x8f6e837a,
-	0xb96ee0c2, 0xbf800002,
-	0xb97a0002, 0xbf8a0000,
-	0xbe801f6c, 0xbf810000,
+	0xc0211cfa, 0x00000078,
+	0x80788478, 0xbf8cc07f,
+	0xbefc006f, 0xbefe0070,
+	0xbeff0071, 0x866f7bff,
+	0x000003ff, 0xb96f4803,
+	0x866f7bff, 0xfffff800,
+	0x8f6f8b6f, 0xb96fa2c3,
+	0xb973f801, 0xb8ee2985,
+	0x806e816e, 0x8e6e8a6e,
+	0x8e6e816e, 0xb8ef1605,
+	0x806f816f, 0x8e6f866f,
+	0x806e6f6e, 0x806e746e,
+	0x826f8075, 0x866fff6f,
+	0x0000ffff, 0xc00b1c37,
+	0x00000050, 0xc00b1d37,
+	0x00000060, 0xc0031e77,
+	0x00000074, 0xbf8cc07f,
+	0x8f6e8b77, 0x866eff6e,
+	0x001f8000, 0xb96ef807,
+	0x866dff6d, 0x0000ffff,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e837a, 0xb96ee0c2,
+	0xbf800002, 0xb97a0002,
+	0xbf8a0000, 0xbe801f6c,
+	0xbf810000, 0x00000000,
 };
 
 static const uint32_t cwsr_trap_gfx10_hex[] = {
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
index 6880340c25af..f1d12e42f89a 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -244,6 +244,11 @@ L_NOT_HALTED:
         SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK
     s_cbranch_scc1  L_FETCH_2ND_TRAP
 
+    // Check TTMP1 bits 24 (HT) and 23:16(trapID): HT == 1 & trapID == 4
+    s_and_b32       ttmp2, s_save_pc_hi, (S_SAVE_PC_HI_TRAP_ID_MASK|S_SAVE_PC_HI_HT_MASK)
+    s_cmp_eq_u32    ttmp2, 0x1040000
+    s_cbranch_scc1  L_FETCH_2ND_TRAP
+
     // Check for maskable exceptions in trapsts.excp and trapsts.excp_hi.
     // Maskable exceptions only cause the wave to enter the trap handler if
     // their respective bit in mode.excp_en is set.
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [PATCH v2 13/24] drm/amdgpu: add sq host trap status check
  2023-11-03 13:11 ` [PATCH 13/24] drm/amdgpu: add sq host trap status check James Zhu
  2023-11-10 19:07   ` Yat Sin, David
@ 2023-11-20 16:16   ` James Zhu
  1 sibling, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-20 16:16 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Before firing a new host trap, check the host trap status.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 35 +++++++++++++++++++
 .../amd/include/asic_reg/gc/gc_9_0_offset.h   |  2 ++
 .../amd/include/asic_reg/gc/gc_9_0_sh_mask.h  |  5 +++
 3 files changed, 42 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 423611904eaf..89157130e476 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -1146,6 +1146,35 @@ void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
 	kgd_gfx_v9_unlock_srbm(adev, inst);
 }
 
+static uint32_t kgd_aldebaran_get_hosttrap_status(struct amdgpu_device *adev)
+{
+	uint32_t sq_hosttrap_status = 0x0;
+	int i, j;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+	for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
+		for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
+			amdgpu_gfx_select_se_sh(adev, i, j, 0xffffffff, 0);
+			sq_hosttrap_status = RREG32_SOC15(GC, 0, mmSQ_HOSTTRAP_STATUS);
+
+			if (sq_hosttrap_status & SQ_HOSTTRAP_STATUS__HTPENDING_OVERRIDE_MASK) {
+				WREG32_SOC15(GC, 0, mmSQ_HOSTTRAP_STATUS,
+					SQ_HOSTTRAP_STATUS__HTPENDING_OVERRIDE_MASK);
+				sq_hosttrap_status = 0x0;
+				continue;
+			}
+			if (sq_hosttrap_status)
+				goto out;
+		}
+	}
+
+out:
+	amdgpu_gfx_select_se_sh(adev, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return sq_hosttrap_status;
+}
+
 uint32_t kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,
 					    uint32_t vmid,
 					    uint32_t max_wave_slot,
@@ -1156,6 +1185,12 @@ uint32_t kgd_gfx_v9_trigger_pc_sample_trap(struct amdgpu_device *adev,
 {
 	if (method == KFD_IOCTL_PCS_METHOD_HOSTTRAP) {
 		uint32_t value = 0;
+		uint32_t sq_hosttrap_status = 0x0;
+
+		sq_hosttrap_status = kgd_aldebaran_get_hosttrap_status(adev);
+		/* skip when last host trap request is still pending to complete */
+		if (sq_hosttrap_status)
+			return 0;
 
 		value = REG_SET_FIELD(value, SQ_CMD, CMD, SQ_IND_CMD_CMD_TRAP);
 		value = REG_SET_FIELD(value, SQ_CMD, MODE, SQ_IND_CMD_MODE_SINGLE);
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h
index 12d451e5475b..5b17d9066452 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_offset.h
@@ -462,6 +462,8 @@
 #define mmSQ_IND_DATA_BASE_IDX                                                                         0
 #define mmSQ_CMD                                                                                       0x037b
 #define mmSQ_CMD_BASE_IDX                                                                              0
+#define mmSQ_HOSTTRAP_STATUS                                                                           0x0376
+#define mmSQ_HOSTTRAP_STATUS_BASE_IDX                                                                  0
 #define mmSQ_TIME_HI                                                                                   0x037c
 #define mmSQ_TIME_HI_BASE_IDX                                                                          0
 #define mmSQ_TIME_LO                                                                                   0x037d
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
index efc16ddf274a..3dfe4ab31421 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
@@ -2616,6 +2616,11 @@
 //SQ_CMD_TIMESTAMP
 #define SQ_CMD_TIMESTAMP__TIMESTAMP__SHIFT                                                                    0x0
 #define SQ_CMD_TIMESTAMP__TIMESTAMP_MASK                                                                      0x000000FFL
+//SQ_HOSTTRAP_STATUS
+#define SQ_HOSTTRAP_STATUS__HTPENDINGCOUNT__SHIFT                                                             0x0
+#define SQ_HOSTTRAP_STATUS__HTPENDING_OVERRIDE__SHIFT                                                         0x8
+#define SQ_HOSTTRAP_STATUS__HTPENDINGCOUNT_MASK                                                               0x000000FFL
+#define SQ_HOSTTRAP_STATUS__HTPENDING_OVERRIDE_MASK                                                           0x00000100L
 //SQ_IND_INDEX
 #define SQ_IND_INDEX__WAVE_ID__SHIFT                                                                          0x0
 #define SQ_IND_INDEX__SIMD_ID__SHIFT                                                                          0x4
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* Re: [PATCH 15/24] drm/amdkfd: trigger pc sampling trap for aldebaran
  2023-11-10 19:08   ` Yat Sin, David
@ 2023-11-20 16:19     ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-20 16:19 UTC (permalink / raw)
  To: Yat Sin, David, Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph


On 2023-11-10 14:08, Yat Sin, David wrote:
> [AMD Official Use Only - General]
>
> I would merge this with patch 14 of the series
[JZ] It is better to keep the two ASIC patches separate.
>
>> -----Original Message-----
>> From: Zhu, James <James.Zhu@amd.com>
>> Sent: Friday, November 3, 2023 9:12 AM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph
>> <Joseph.Greathouse@amd.com>; Yat Sin, David <David.YatSin@amd.com>; Zhu,
>> James <James.Zhu@amd.com>
>> Subject: [PATCH 15/24] drm/amdkfd: trigger pc sampling trap for aldebaran
>>
>> Implement trigger pc sampling trap for aldebaran.
>>
>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 11
>> +++++++++++
>>   1 file changed, 11 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
>> index aff08321e976..27eda75ceecb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
>> @@ -163,6 +163,16 @@ static uint32_t
>> kgd_gfx_aldebaran_set_address_watch(
>>        return watch_address_cntl;
>>   }
>>
>> +static uint32_t kgd_aldebaran_trigger_pc_sample_trap(struct amdgpu_device
>> *adev,
>> +                                         uint32_t vmid,
>> +                                         uint32_t *target_simd,
>> +                                         uint32_t *target_wave_slot,
>> +                                         enum kfd_ioctl_pc_sample_method
>> method) {
>> +     return kgd_gfx_v9_trigger_pc_sample_trap(adev, vmid, 8, 4,
>> +                                     target_simd, target_wave_slot,
>> method); }
>> +
>>   const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>>        .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>>        .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
>> @@ -191,4 +201,5 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>>        .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>>        .build_grace_period_packet_info =
>> kgd_gfx_v9_build_grace_period_packet_info,
>>        .program_trap_handler_settings =
>> kgd_gfx_v9_program_trap_handler_settings,
>> +     .trigger_pc_sample_trap = kgd_aldebaran_trigger_pc_sample_trap,
>>   };
>> --
>> 2.25.1

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 16/24] drm/amdkfd: use bit operation set debug trap
  2023-11-10 19:08   ` Yat Sin, David
@ 2023-11-20 16:21     ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-20 16:21 UTC (permalink / raw)
  To: Yat Sin, David, Zhu, James, amd-gfx; +Cc: Kuehling, Felix, Greathouse, Joseph


On 2023-11-10 14:08, Yat Sin, David wrote:
> [AMD Official Use Only - General]
>
>> -----Original Message-----
>> From: Zhu, James <James.Zhu@amd.com>
>> Sent: Friday, November 3, 2023 9:12 AM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph
>> <Joseph.Greathouse@amd.com>; Yat Sin, David <David.YatSin@amd.com>; Zhu,
>> James <James.Zhu@amd.com>
>> Subject: [PATCH 16/24] drm/amdkfd: use bit operation set debug trap
>>
>> The 1st-level TMA's 2nd byte is used for the trap type setting, so use bit
>> operations to change only the selected bit.
>>
>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 16 +++++++++++++---
>>   1 file changed, 13 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index fbf053001af9..a0b729c65a7c 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -1434,13 +1434,23 @@ bool kfd_process_xnack_mode(struct kfd_process
>> *p, bool supported)
>>        return true;
>>   }
>>
>> +/* bit offset in 1st-level TMA's 2nd byte which used for
>> +KFD_TRAP_TYPE_BIT */ enum KFD_TRAP_TYPE_BIT {
> Nit pick. New line after comment
[JZ] Something is wrong with your email client; there is a new line in the original patch.
>> +     KFD_TRAP_TYPE_DEBUG = 0,                /* bit 0 for debug trap */
>> +     KFD_TRAP_TYPE_HOST,
>> +     KFD_TRAP_TYPE_STOCHASTIC,
>> +};
>> +
>>   void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
>>                                     bool enabled)
>>   {
>>        if (qpd->cwsr_kaddr) {
>> -             uint64_t *tma =
>> -                     (uint64_t *)(qpd->cwsr_kaddr +
>> KFD_CWSR_TMA_OFFSET);
>> -             tma[2] = enabled;
>> +             volatile unsigned long *tma =
>> +                     (volatile unsigned long *)(qpd->cwsr_kaddr +
>> KFD_CWSR_TMA_OFFSET);
>> +             if (enabled)
>> +                     set_bit(KFD_TRAP_TYPE_DEBUG, &tma[2]);
>> +             else
>> +                     clear_bit(KFD_TRAP_TYPE_DEBUG, &tma[2]);
>>        }
>>   }
>>
>> --
>> 2.25.1

^ permalink raw reply	[flat|nested] 80+ messages in thread

* [PATCH v2 22/24] drm/amdkfd: add pc sampling release when process release
  2023-11-03 13:11 ` [PATCH 22/24] drm/amdkfd: add pc sampling release when process release James Zhu
  2023-11-10 19:08   ` Yat Sin, David
@ 2023-11-20 16:23   ` James Zhu
  1 sibling, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-20 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.kuehling, joseph.greathouse, jamesz

Add pc sampling release on process release; this forces all
active sessions of this process to stop.

Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 21 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c     |  3 +++
 3 files changed, 25 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index 66670cdb813a..748c548f7e7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -274,6 +274,27 @@ static int kfd_pc_sample_destroy(struct kfd_process_device *pdd, uint32_t trace_
 	return 0;
 }
 
+void kfd_pc_sample_release(struct kfd_process_device *pdd)
+{
+	struct pc_sampling_entry *pcs_entry;
+	struct idr *idp;
+	uint32_t id;
+
+	/* force to release all PC sampling task for this process */
+	idp = &pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr;
+	mutex_lock(&pdd->dev->pcs_data.mutex);
+	idr_for_each_entry(idp, pcs_entry, id) {
+		if (pcs_entry->pdd != pdd)
+			continue;
+		mutex_unlock(&pdd->dev->pcs_data.mutex);
+		if (pcs_entry->enabled)
+			kfd_pc_sample_stop(pdd, pcs_entry);
+		kfd_pc_sample_destroy(pdd, id, pcs_entry);
+		mutex_lock(&pdd->dev->pcs_data.mutex);
+	}
+	mutex_unlock(&pdd->dev->pcs_data.mutex);
+}
+
 int kfd_pc_sample(struct kfd_process_device *pdd,
 					struct kfd_ioctl_pc_sample_args __user *args)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
index cb93909e6bd3..4ea064fdaa98 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
@@ -30,6 +30,7 @@
 
 int kfd_pc_sample(struct kfd_process_device *pdd,
 					struct kfd_ioctl_pc_sample_args __user *args);
+void kfd_pc_sample_release(struct kfd_process_device *pdd);
 void kfd_pc_sample_handler(struct work_struct *work);
 
 #endif /* KFD_PC_SAMPLING_H_ */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index d22d804f180d..54f3db7eaae2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -43,6 +43,7 @@ struct mm_struct;
 #include "kfd_svm.h"
 #include "kfd_smi_events.h"
 #include "kfd_debug.h"
+#include "kfd_pc_sampling.h"
 
 /*
  * List of struct kfd_process (field kfd_process).
@@ -1020,6 +1021,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
 		pr_debug("Releasing pdd (topology id %d) for process (pasid 0x%x)\n",
 				pdd->dev->id, p->pasid);
 
+		kfd_pc_sample_release(pdd);
+
 		kfd_process_device_destroy_cwsr_dgpu(pdd);
 		kfd_process_device_destroy_ib_mem(pdd);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* Re: [PATCH 01/24] drm/amdkfd/kfd_ioctl: add pc sampling support
  2023-11-03 13:11 ` [PATCH 01/24] drm/amdkfd/kfd_ioctl: add pc sampling support James Zhu
@ 2023-11-22 21:14   ` Felix Kuehling
  2023-11-23 20:33     ` James Zhu
  2023-11-27 19:11   ` Alex Deucher
  1 sibling, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-22 21:14 UTC (permalink / raw)
  To: James Zhu, amd-gfx; +Cc: joseph.greathouse, jamesz

On 2023-11-03 09:11, James Zhu wrote:
> From: David Yat Sin <david.yatsin@amd.com>
>
> Add pc sampling support in kfd_ioctl.
>
> Co-developed-by: James Zhu <James.Zhu@amd.com>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> ---
>   include/uapi/linux/kfd_ioctl.h | 57 +++++++++++++++++++++++++++++++++-
>   1 file changed, 56 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index f0ed68974c54..5202e29c9560 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -1446,6 +1446,58 @@ struct kfd_ioctl_dbg_trap_args {
>   	};
>   };
>   
> +/**
> + * kfd_ioctl_pc_sample_op - PC Sampling ioctl operations
> + *
> + * @KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES: Query device PC Sampling capabilities
> + * @KFD_IOCTL_PCS_OP_CREATE:             Register this process with a per-device PC sampler instance
> + * @KFD_IOCTL_PCS_OP_DESTROY:            Unregister from a previously registered PC sampler instance
> + * @KFD_IOCTL_PCS_OP_START:              Process begins taking samples from a previously registered PC sampler instance
> + * @KFD_IOCTL_PCS_OP_STOP:               Process stops taking samples from a previously registered PC sampler instance
> + */
> +enum kfd_ioctl_pc_sample_op {
> +	KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES,
> +	KFD_IOCTL_PCS_OP_CREATE,
> +	KFD_IOCTL_PCS_OP_DESTROY,
> +	KFD_IOCTL_PCS_OP_START,
> +	KFD_IOCTL_PCS_OP_STOP,
> +};
> +
> +/* Values have to be a power of 2*/
> +#define KFD_IOCTL_PCS_FLAG_POWER_OF_2 0x00000001
> +
> +enum kfd_ioctl_pc_sample_method {
> +	KFD_IOCTL_PCS_METHOD_HOSTTRAP = 1,
> +	KFD_IOCTL_PCS_METHOD_STOCHASTIC,
> +};
> +
> +enum kfd_ioctl_pc_sample_type {
> +	KFD_IOCTL_PCS_TYPE_TIME_US,
> +	KFD_IOCTL_PCS_TYPE_CLOCK_CYCLES,
> +	KFD_IOCTL_PCS_TYPE_INSTRUCTIONS
> +};
> +
> +struct kfd_pc_sample_info {
> +	__u64 value;         /* [IN] if PCS_TYPE_INTERVAL_US: sample interval in us
> +	                      * if PCS_TYPE_CLOCK_CYCLES: sample interval in graphics core clk cycles
> +	                      * if PCS_TYPE_INSTRUCTIONS: sample interval in instructions issued by
> +	                      * graphics compute units

I'd call this "interval". That's still generic enough to be a sampling 
interval in a unit that depends on the PCS type. "value" is misleading, 
because it sounds like it may be an actual sample.


> +	                      */
> +	__u64 value_min;     /* [OUT] */
> +	__u64 value_max;     /* [OUT] */

interval_min/max.

Regards,
   Felix


> +	__u64 flags;         /* [OUT] indicate potential restrictions e.g FLAG_POWER_OF_2 */
> +	__u32 method;        /* [IN/OUT] kfd_ioctl_pc_sample_method */
> +	__u32 type;          /* [IN/OUT] kfd_ioctl_pc_sample_type */
> +};
> +
> +struct kfd_ioctl_pc_sample_args {
> +	__u64 sample_info_ptr;   /* array of kfd_pc_sample_info */
> +	__u32 num_sample_info;
> +	__u32 op;                /* kfd_ioctl_pc_sample_op */
> +	__u32 gpu_id;
> +	__u32 trace_id;
> +};
> +
>   #define AMDKFD_IOCTL_BASE 'K'
>   #define AMDKFD_IO(nr)			_IO(AMDKFD_IOCTL_BASE, nr)
>   #define AMDKFD_IOR(nr, type)		_IOR(AMDKFD_IOCTL_BASE, nr, type)
> @@ -1566,7 +1618,10 @@ struct kfd_ioctl_dbg_trap_args {
>   #define AMDKFD_IOC_DBG_TRAP			\
>   		AMDKFD_IOWR(0x26, struct kfd_ioctl_dbg_trap_args)
>   
> +#define AMDKFD_IOC_PC_SAMPLE		\
> +		AMDKFD_IOWR(0x27, struct kfd_ioctl_pc_sample_args)
> +
>   #define AMDKFD_COMMAND_START		0x01
> -#define AMDKFD_COMMAND_END		0x27
> +#define AMDKFD_COMMAND_END		0x28
>   
>   #endif

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 05/24] drm/amdkfd: enable pc sampling create
  2023-11-03 13:11 ` [PATCH 05/24] drm/amdkfd: enable pc sampling create James Zhu
@ 2023-11-22 21:51   ` Felix Kuehling
  2023-11-23 20:25     ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-22 21:51 UTC (permalink / raw)
  To: James Zhu, amd-gfx; +Cc: joseph.greathouse, jamesz


On 2023-11-03 09:11, James Zhu wrote:
> From: David Yat Sin <david.yatsin@amd.com>
>
> Enable pc sampling create.
>
> Co-developed-by: James Zhu <James.Zhu@amd.com>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 54 +++++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        | 10 ++++
>   2 files changed, 63 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> index 49fecbc7013e..f0d910ee730c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> @@ -97,7 +97,59 @@ static int kfd_pc_sample_stop(struct kfd_process_device *pdd)
>   static int kfd_pc_sample_create(struct kfd_process_device *pdd,
>   					struct kfd_ioctl_pc_sample_args __user *user_args)
>   {
> -	return -EINVAL;
> +	struct kfd_pc_sample_info *supported_format = NULL;
> +	struct kfd_pc_sample_info user_info;
> +	int ret;
> +	int i;
> +
> +	if (user_args->num_sample_info != 1)
> +		return -EINVAL;
> +
> +	ret = copy_from_user(&user_info, (void __user *) user_args->sample_info_ptr,
> +				sizeof(struct kfd_pc_sample_info));
> +	if (ret) {
> +		pr_debug("Failed to copy PC sampling info from user\n");
> +		return -EFAULT;
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(supported_formats); i++) {
> +		if (KFD_GC_VERSION(pdd->dev) == supported_formats[i].ip_version
> +			&& user_info.method == supported_formats[i].sample_info->method
> +			&& user_info.type == supported_formats[i].sample_info->type
> +			&& user_info.value <= supported_formats[i].sample_info->value_max
> +			&& user_info.value >= supported_formats[i].sample_info->value_min) {
> +			supported_format =
> +				(struct kfd_pc_sample_info *)supported_formats[i].sample_info;
> +			break;
> +		}
> +	}
> +
> +	if (!supported_format) {
> +		pr_debug("Sampling format is not supported!");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	mutex_lock(&pdd->dev->pcs_data.mutex);
> +	if (pdd->dev->pcs_data.hosttrap_entry.base.use_count &&
> +		memcmp(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
> +				&user_info, sizeof(user_info))) {

I think you can compare structures in C. This would be more readable:

	if (pdd->dev->pcs_data.hosttrap_entry.base.use_count &&
	    pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info != user_info) {
		...
	}


> +		ret = copy_to_user((void __user *) user_args->sample_info_ptr,
> +			&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
> +			sizeof(struct kfd_pc_sample_info));
> +		mutex_unlock(&pdd->dev->pcs_data.mutex);
> +		return ret ? ret : -EEXIST;

When copy_to_user fails, it returns the number of bytes not copied. 
That's not a useful return value here. This should be

		return ret ? -EFAULT : -EEXIST;

Also -EBUSY may be more appropriate than -EEXIST.


> +	}
> +
> +	/* TODO: add trace_id return */
> +
> +	if (!pdd->dev->pcs_data.hosttrap_entry.base.use_count)
> +		memcpy(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
> +				&user_info, sizeof(user_info));

I think you can assign structures in C. Just do

		pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info = user_info;

Regards,
   Felix


> +
> +	pdd->dev->pcs_data.hosttrap_entry.base.use_count++;
> +	mutex_unlock(&pdd->dev->pcs_data.mutex);
> +
> +	return 0;
>   }
>   
>   static int kfd_pc_sample_destroy(struct kfd_process_device *pdd, uint32_t trace_id)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 4a0b66189c67..81c925fb2952 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -256,9 +256,19 @@ struct kfd_vmid_info {
>   
>   struct kfd_dev;
>   
> +struct kfd_dev_pc_sampling_data {
> +	uint32_t use_count;         /* Num of PC sampling sessions */
> +	struct kfd_pc_sample_info pc_sample_info;
> +};
> +
> +struct kfd_dev_pcs_hosttrap {
> +	struct kfd_dev_pc_sampling_data base;
> +};
> +
>   /* Per device PC Sampling data */
>   struct kfd_dev_pc_sampling {
>   	struct mutex mutex;
> +	struct kfd_dev_pcs_hosttrap hosttrap_entry;
>   };
>   
>   struct kfd_node {

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 06/24] drm/amdkfd: add trace_id return
  2023-11-03 13:11 ` [PATCH 06/24] drm/amdkfd: add trace_id return James Zhu
@ 2023-11-22 21:56   ` Felix Kuehling
  2023-11-23 20:22     ` James Zhu
  2023-11-22 22:21   ` Felix Kuehling
  1 sibling, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-22 21:56 UTC (permalink / raw)
  To: James Zhu, amd-gfx; +Cc: joseph.greathouse, jamesz


On 2023-11-03 09:11, James Zhu wrote:
> Add trace_id return for new pc sampling creation per device,
> Use IDR to quickly locate pc_sampling_entry for reference.
>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c      |  2 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 20 +++++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  6 ++++++
>   3 files changed, 27 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 0e24e011f66b..bcaeedac8fe0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -536,10 +536,12 @@ static void kfd_smi_init(struct kfd_node *dev)
>   static void kfd_pc_sampling_init(struct kfd_node *dev)
>   {
>   	mutex_init(&dev->pcs_data.mutex);
> +	idr_init_base(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr, 1);
>   }
>   
>   static void kfd_pc_sampling_exit(struct kfd_node *dev)
>   {
> +	idr_destroy(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr);
>   	mutex_destroy(&dev->pcs_data.mutex);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> index f0d910ee730c..4c9fc48e1a6a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> @@ -99,6 +99,7 @@ static int kfd_pc_sample_create(struct kfd_process_device *pdd,
>   {
>   	struct kfd_pc_sample_info *supported_format = NULL;
>   	struct kfd_pc_sample_info user_info;
> +	struct pc_sampling_entry *pcs_entry;
>   	int ret;
>   	int i;
>   
> @@ -140,7 +141,19 @@ static int kfd_pc_sample_create(struct kfd_process_device *pdd,
>   		return ret ? ret : -EEXIST;
>   	}
>   
> -	/* TODO: add trace_id return */
> +	pcs_entry = kvzalloc(sizeof(*pcs_entry), GFP_KERNEL);

I don't see a reason to use kvzalloc here. You know the size of the 
structure, so kzalloc should be perfectly fine.


> +	if (!pcs_entry) {
> +		mutex_unlock(&pdd->dev->pcs_data.mutex);
> +		return -ENOMEM;
> +	}
> +
> +	i = idr_alloc_cyclic(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr,
> +				pcs_entry, 1, 0, GFP_KERNEL);
> +	if (i < 0) {
> +		mutex_unlock(&pdd->dev->pcs_data.mutex);
> +		kvfree(pcs_entry);

kfree


> +		return i;
> +	}
>   
>   	if (!pdd->dev->pcs_data.hosttrap_entry.base.use_count)
>   		memcpy(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
> @@ -149,6 +162,11 @@ static int kfd_pc_sample_create(struct kfd_process_device *pdd,
>   	pdd->dev->pcs_data.hosttrap_entry.base.use_count++;
>   	mutex_unlock(&pdd->dev->pcs_data.mutex);
>   
> +	pcs_entry->pdd = pdd;
> +	user_args->trace_id = (uint32_t)i;

I suspect this should be done inside the lock. You don't want someone 
looking up the pcs_entry before it has been initialized.

Regards,
   Felix


> +
> +	pr_debug("alloc pcs_entry = %p, trace_id = 0x%x on gpu 0x%x", pcs_entry, i, pdd->dev->id);
> +
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 81c925fb2952..642558026d16 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -258,6 +258,7 @@ struct kfd_dev;
>   
>   struct kfd_dev_pc_sampling_data {
>   	uint32_t use_count;         /* Num of PC sampling sessions */
> +	struct idr pc_sampling_idr;
>   	struct kfd_pc_sample_info pc_sample_info;
>   };
>   
> @@ -743,6 +744,11 @@ enum kfd_pdd_bound {
>    */
>   #define SDMA_ACTIVITY_DIVISOR  100
>   
> +struct pc_sampling_entry {
> +	bool enabled;
> +	struct kfd_process_device *pdd;
> +};
> +
>   /* Data that is per-process-per device. */
>   struct kfd_process_device {
>   	/* The device that owns this data. */

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 07/24] drm/amdkfd: check pcs_enrty valid
  2023-11-03 13:11 ` [PATCH 07/24] drm/amdkfd: check pcs_enrty valid James Zhu
  2023-11-10 19:09   ` Yat Sin, David
  2023-11-20 15:55   ` [PATCH v2 " James Zhu
@ 2023-11-22 22:15   ` Felix Kuehling
  2023-11-23 20:18     ` James Zhu
  2 siblings, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-22 22:15 UTC (permalink / raw)
  To: James Zhu, amd-gfx; +Cc: joseph.greathouse, jamesz


On 2023-11-03 09:11, James Zhu wrote:
> Check that pcs_entry is valid for the pc sampling ioctl.
>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 30 ++++++++++++++++++--
>   1 file changed, 27 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> index 4c9fc48e1a6a..36366c8847de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> @@ -179,6 +179,21 @@ static int kfd_pc_sample_destroy(struct kfd_process_device *pdd, uint32_t trace_
>   int kfd_pc_sample(struct kfd_process_device *pdd,
>   					struct kfd_ioctl_pc_sample_args __user *args)
>   {
> +	struct pc_sampling_entry *pcs_entry;
> +
> +	if (args->op != KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES &&
> +		args->op != KFD_IOCTL_PCS_OP_CREATE) {
> +
> +		mutex_lock(&pdd->dev->pcs_data.mutex);
> +		pcs_entry = idr_find(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr,
> +				args->trace_id);
> +		mutex_unlock(&pdd->dev->pcs_data.mutex);

You need to keep holding the lock while the pcs_entry is still used. 
That includes any of the kfd_pc_sample_<op> functions below. Otherwise 
someone could free it concurrently. It would also simplify the ..._<op> 
functions, if they didn't have to worry about the locking themselves.

Regards,
   Felix


> +
> +		if (!pcs_entry ||
> +			pcs_entry->pdd != pdd)
> +			return -EINVAL;
> +	}
> +
>   	switch (args->op) {
>   	case KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES:
>   		return kfd_pc_sample_query_cap(pdd, args);
> @@ -187,13 +202,22 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
>   		return kfd_pc_sample_create(pdd, args);
>   
>   	case KFD_IOCTL_PCS_OP_DESTROY:
> -		return kfd_pc_sample_destroy(pdd, args->trace_id);
> +		if (pcs_entry->enabled)
> +			return -EBUSY;
> +		else
> +			return kfd_pc_sample_destroy(pdd, args->trace_id);
>   
>   	case KFD_IOCTL_PCS_OP_START:
> -		return kfd_pc_sample_start(pdd);
> +		if (pcs_entry->enabled)
> +			return -EALREADY;
> +		else
> +			return kfd_pc_sample_start(pdd);
>   
>   	case KFD_IOCTL_PCS_OP_STOP:
> -		return kfd_pc_sample_stop(pdd);
> +		if (!pcs_entry->enabled)
> +			return -EALREADY;
> +		else
> +			return kfd_pc_sample_stop(pdd);
>   	}
>   
>   	return -EINVAL;

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 06/24] drm/amdkfd: add trace_id return
  2023-11-03 13:11 ` [PATCH 06/24] drm/amdkfd: add trace_id return James Zhu
  2023-11-22 21:56   ` Felix Kuehling
@ 2023-11-22 22:21   ` Felix Kuehling
  2023-11-23 22:14     ` Zhu, James
  1 sibling, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-22 22:21 UTC (permalink / raw)
  To: James Zhu, amd-gfx; +Cc: joseph.greathouse, jamesz


On 2023-11-03 09:11, James Zhu wrote:
> Add trace_id return for new pc sampling creation per device,
> Use IDR to quickly locate pc_sampling_entry for reference.
>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c      |  2 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 20 +++++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  6 ++++++
>   3 files changed, 27 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 0e24e011f66b..bcaeedac8fe0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -536,10 +536,12 @@ static void kfd_smi_init(struct kfd_node *dev)
>   static void kfd_pc_sampling_init(struct kfd_node *dev)
>   {
>   	mutex_init(&dev->pcs_data.mutex);
> +	idr_init_base(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr, 1);
>   }
>   
>   static void kfd_pc_sampling_exit(struct kfd_node *dev)
>   {
> +	idr_destroy(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr);
>   	mutex_destroy(&dev->pcs_data.mutex);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> index f0d910ee730c..4c9fc48e1a6a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> @@ -99,6 +99,7 @@ static int kfd_pc_sample_create(struct kfd_process_device *pdd,
>   {
>   	struct kfd_pc_sample_info *supported_format = NULL;
>   	struct kfd_pc_sample_info user_info;
> +	struct pc_sampling_entry *pcs_entry;
>   	int ret;
>   	int i;
>   
> @@ -140,7 +141,19 @@ static int kfd_pc_sample_create(struct kfd_process_device *pdd,
>   		return ret ? ret : -EEXIST;
>   	}
>   
> -	/* TODO: add trace_id return */
> +	pcs_entry = kvzalloc(sizeof(*pcs_entry), GFP_KERNEL);
> +	if (!pcs_entry) {
> +		mutex_unlock(&pdd->dev->pcs_data.mutex);
> +		return -ENOMEM;
> +	}
> +
> +	i = idr_alloc_cyclic(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr,
> +				pcs_entry, 1, 0, GFP_KERNEL);
> +	if (i < 0) {
> +		mutex_unlock(&pdd->dev->pcs_data.mutex);
> +		kvfree(pcs_entry);
> +		return i;
> +	}
>   
>   	if (!pdd->dev->pcs_data.hosttrap_entry.base.use_count)
>   		memcpy(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
> @@ -149,6 +162,11 @@ static int kfd_pc_sample_create(struct kfd_process_device *pdd,
>   	pdd->dev->pcs_data.hosttrap_entry.base.use_count++;
>   	mutex_unlock(&pdd->dev->pcs_data.mutex);
>   
> +	pcs_entry->pdd = pdd;

One more thought: You have a bunch of pcs_entries pointing to pdd. What 
mechanism guarantees that the pcs_entries are destroyed before the pdd 
on process termination? I think kfd_pc_sampling_exit doesn't run during 
process termination, because it deals with per-device data structures, 
not per-process data structures.

Should the IDR be in the PDD rather than the device? In that case you 
wouldn't even need the pdd pointer in struct pc_sampling_entry.

I see you have a patch much later in the series "drm/amdkfd: add pc 
sampling release when process release". I'd prefer this squashed here 
and in the patches that add the start/stop functions.

Regards,
   Felix


> +	user_args->trace_id = (uint32_t)i;
> +
> +	pr_debug("alloc pcs_entry = %p, trace_id = 0x%x on gpu 0x%x", pcs_entry, i, pdd->dev->id);
> +
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 81c925fb2952..642558026d16 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -258,6 +258,7 @@ struct kfd_dev;
>   
>   struct kfd_dev_pc_sampling_data {
>   	uint32_t use_count;         /* Num of PC sampling sessions */
> +	struct idr pc_sampling_idr;
>   	struct kfd_pc_sample_info pc_sample_info;
>   };
>   
> @@ -743,6 +744,11 @@ enum kfd_pdd_bound {
>    */
>   #define SDMA_ACTIVITY_DIVISOR  100
>   
> +struct pc_sampling_entry {
> +	bool enabled;
> +	struct kfd_process_device *pdd;
> +};
> +
>   /* Data that is per-process-per device. */
>   struct kfd_process_device {
>   	/* The device that owns this data. */

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 18/24] drm/amdkfd: enable pc sampling start
  2023-11-03 13:11 ` [PATCH 18/24] drm/amdkfd: enable pc sampling start James Zhu
@ 2023-11-22 22:27   ` Felix Kuehling
  2023-11-23 20:01     ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-22 22:27 UTC (permalink / raw)
  To: James Zhu, amd-gfx; +Cc: joseph.greathouse, jamesz


On 2023-11-03 09:11, James Zhu wrote:
> Enable pc sampling start.
>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 26 +++++++++++++++++---
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  2 ++
>   2 files changed, 25 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> index 60b29b245db5..33d003ca0093 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> @@ -83,9 +83,29 @@ static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
>   	return 0;
>   }
>   
> -static int kfd_pc_sample_start(struct kfd_process_device *pdd)
> +static int kfd_pc_sample_start(struct kfd_process_device *pdd,
> +					struct pc_sampling_entry *pcs_entry)
>   {
> -	return -EINVAL;
> +	bool pc_sampling_start = false;
> +
> +	pcs_entry->enabled = true;
> +	mutex_lock(&pdd->dev->pcs_data.mutex);
> +	if (!pdd->dev->pcs_data.hosttrap_entry.base.active_count)
> +		pc_sampling_start = true;
> +	pdd->dev->pcs_data.hosttrap_entry.base.active_count++;
> +	mutex_unlock(&pdd->dev->pcs_data.mutex);
> +
> +	while (pc_sampling_start) {
> +		if (READ_ONCE(pdd->dev->pcs_data.hosttrap_entry.base.stop_enable)) {
> +			usleep_range(1000, 2000);

I don't understand why you need this synchronization through 
stop_enable. Why can't you do both the start and stop while holding the 
mutex? It's just setting a flag in the TMA, so it's not a time-consuming 
operation, and I don't see any potential for deadlocks.

Regards,
   Felix


> +		} else {
> +			kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
> +				pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, true);
> +			break;
> +		}
> +	}
> +
> +	return 0;
>   }
>   
>   static int kfd_pc_sample_stop(struct kfd_process_device *pdd)
> @@ -225,7 +245,7 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
>   		if (pcs_entry->enabled)
>   			return -EALREADY;
>   		else
> -			return kfd_pc_sample_start(pdd);
> +			return kfd_pc_sample_start(pdd, pcs_entry);
>   
>   	case KFD_IOCTL_PCS_OP_STOP:
>   		if (!pcs_entry->enabled)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 6670534f47b8..613910e0d440 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -258,6 +258,8 @@ struct kfd_dev;
>   
>   struct kfd_dev_pc_sampling_data {
>   	uint32_t use_count;         /* Num of PC sampling sessions */
> +	uint32_t active_count;      /* Num of active sessions */
> +	bool stop_enable;           /* pc sampling stop in process */
>   	struct idr pc_sampling_idr;
>   	struct kfd_pc_sample_info pc_sample_info;
>   };

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 20/24] drm/amdkfd: enable pc sampling work to trigger trap
  2023-11-03 13:11 ` [PATCH 20/24] drm/amdkfd: enable pc sampling work to trigger trap James Zhu
@ 2023-11-22 22:31   ` Felix Kuehling
  2023-11-23 18:27     ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-22 22:31 UTC (permalink / raw)
  To: James Zhu, amd-gfx; +Cc: joseph.greathouse, jamesz


On 2023-11-03 09:11, James Zhu wrote:
> Enable a delay work to trigger pc sampling trap.
>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c      |  3 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 39 ++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |  1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  1 +
>   4 files changed, 44 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index bcaeedac8fe0..fb21902e433a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -35,6 +35,7 @@
>   #include "kfd_migrate.h"
>   #include "amdgpu.h"
>   #include "amdgpu_xcp.h"
> +#include "kfd_pc_sampling.h"
>   
>   #define MQD_SIZE_ALIGNED 768
>   
> @@ -537,6 +538,8 @@ static void kfd_pc_sampling_init(struct kfd_node *dev)
>   {
>   	mutex_init(&dev->pcs_data.mutex);
>   	idr_init_base(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr, 1);
> +	INIT_WORK(&dev->pcs_data.hosttrap_entry.base.pc_sampling_work,
> +		kfd_pc_sample_handler);
>   }
>   
>   static void kfd_pc_sampling_exit(struct kfd_node *dev)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> index 2c4ac5b4cc4b..e8f0559b618e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> @@ -38,6 +38,43 @@ struct supported_pc_sample_info supported_formats[] = {
>   	{ IP_VERSION(9, 4, 2), &sample_info_hosttrap_9_0_0 },
>   };
>   
> +void kfd_pc_sample_handler(struct work_struct *work)
> +{
> +	struct amdgpu_device *adev;
> +	struct kfd_node *node;
> +	uint32_t timeout = 0;
> +
> +	node = container_of(work, struct kfd_node,
> +					pcs_data.hosttrap_entry.base.pc_sampling_work);
> +
> +	mutex_lock(&node->pcs_data.mutex);
> +	if (node->pcs_data.hosttrap_entry.base.active_count &&
> +		node->pcs_data.hosttrap_entry.base.pc_sample_info.value &&
> +		node->kfd2kgd->trigger_pc_sample_trap) {
> +		switch (node->pcs_data.hosttrap_entry.base.pc_sample_info.type) {
> +		case KFD_IOCTL_PCS_TYPE_TIME_US:
> +			timeout = (uint32_t)node->pcs_data.hosttrap_entry.base.pc_sample_info.value;
> +			break;
> +		default:
> +			pr_debug("PC Sampling type %d not supported.",
> +					node->pcs_data.hosttrap_entry.base.pc_sample_info.type);
> +		}
> +	}
> +	mutex_unlock(&node->pcs_data.mutex);
> +	if (!timeout)
> +		return;
> +
> +	adev = node->adev;
> +	while (!READ_ONCE(node->pcs_data.hosttrap_entry.base.stop_enable)) {

This worker basically runs indefinitely (controlled by user mode).


> +		node->kfd2kgd->trigger_pc_sample_trap(adev, node->vm_info.last_vmid_kfd,
> +				&node->pcs_data.hosttrap_entry.base.target_simd,
> +				&node->pcs_data.hosttrap_entry.base.target_wave_slot,
> +				node->pcs_data.hosttrap_entry.base.pc_sample_info.method);
> +		pr_debug_ratelimited("triggered a host trap.");
> +		usleep_range(timeout, timeout + 10);

This will cause the interval to drift. Instead, you should calculate 
the wait time at the end of every iteration based on the current time 
and the interval.


> +	}
> +}
> +
>   static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
>   					struct kfd_ioctl_pc_sample_args __user *user_args)
>   {
> @@ -101,6 +138,7 @@ static int kfd_pc_sample_start(struct kfd_process_device *pdd,
>   		} else {
>   			kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>   				pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, true);
> +			schedule_work(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);

Scheduling a worker that runs indefinitely on the system workqueue is 
probably a bad idea. It could block other work items indefinitely. I 
think you are misusing the work queue API here. What you really want is 
probably to create a kernel thread.

Regards,
   Felix


>   			break;
>   		}
>   	}
> @@ -123,6 +161,7 @@ static int kfd_pc_sample_stop(struct kfd_process_device *pdd,
>   	mutex_unlock(&pdd->dev->pcs_data.mutex);
>   
>   	if (pc_sampling_stop) {
> +		cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
>   		kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>   			pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
> index 4eeded4ea5b6..cb93909e6bd3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
> @@ -30,5 +30,6 @@
>   
>   int kfd_pc_sample(struct kfd_process_device *pdd,
>   					struct kfd_ioctl_pc_sample_args __user *args);
> +void kfd_pc_sample_handler(struct work_struct *work);
>   
>   #endif /* KFD_PC_SAMPLING_H_ */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index badaa4d68cc4..b7062033fda4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -263,6 +263,7 @@ struct kfd_dev_pc_sampling_data {
>   	uint32_t target_wave_slot;  /* target wave slot for trap */
>   	bool stop_enable;           /* pc sampling stop in process */
>   	struct idr pc_sampling_idr;
> +	struct work_struct pc_sampling_work;
>   	struct kfd_pc_sample_info pc_sample_info;
>   };
>   

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 21/24] drm/amdkfd: add queue remapping
  2023-11-03 13:11 ` [PATCH 21/24] drm/amdkfd: add queue remapping James Zhu
@ 2023-11-22 22:35   ` Felix Kuehling
  2023-11-23 16:25     ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-22 22:35 UTC (permalink / raw)
  To: James Zhu, amd-gfx; +Cc: joseph.greathouse, jamesz


On 2023-11-03 09:11, James Zhu wrote:
> Add queue remapping to force the waves in any running
> processes to complete a CWSR trap.

Please add an explanation why this is needed.


>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 11 +++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  5 +++++
>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c          |  3 +++
>   3 files changed, 19 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index c0e71543389a..a3f57be63f4f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -3155,6 +3155,17 @@ int debug_refresh_runlist(struct device_queue_manager *dqm)
>   	return debug_map_and_unlock(dqm);
>   }
>   
> +void remap_queue(struct device_queue_manager *dqm,
> +				enum kfd_unmap_queues_filter filter,
> +				uint32_t filter_param,
> +				uint32_t grace_period)

Not sure if you need the filter and grace period parameters in this 
function. What's the point of exposing that to callers who just want to 
trigger a CWSR? You could also change the function name to reflect the 
purpose of the function, rather than the implementation.

Regards,
   Felix


> +{
> +	dqm_lock(dqm);
> +	if (!dqm->dev->kfd->shared_resources.enable_mes)
> +		execute_queues_cpsch(dqm, filter, filter_param, grace_period);
> +	dqm_unlock(dqm);
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
>   static void seq_reg_dump(struct seq_file *m,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index cf7e182588f8..f8aae3747a36 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -303,6 +303,11 @@ int debug_lock_and_unmap(struct device_queue_manager *dqm);
>   int debug_map_and_unlock(struct device_queue_manager *dqm);
>   int debug_refresh_runlist(struct device_queue_manager *dqm);
>   
> +void remap_queue(struct device_queue_manager *dqm,
> +				enum kfd_unmap_queues_filter filter,
> +				uint32_t filter_param,
> +				uint32_t grace_period);
> +
>   static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
>   {
>   	return (pdd->lds_base >> 16) & 0xFF;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> index e8f0559b618e..66670cdb813a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> @@ -24,6 +24,7 @@
>   #include "kfd_priv.h"
>   #include "amdgpu_amdkfd.h"
>   #include "kfd_pc_sampling.h"
> +#include "kfd_device_queue_manager.h"
>   
>   struct supported_pc_sample_info {
>   	uint32_t ip_version;
> @@ -164,6 +165,8 @@ static int kfd_pc_sample_stop(struct kfd_process_device *pdd,
>   		cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
>   		kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>   			pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
> +		remap_queue(pdd->dev->dqm,
> +			KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
>   
>   		mutex_lock(&pdd->dev->pcs_data.mutex);
>   		pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 23/24] drm/amdkfd: add pc sampling capability check
  2023-11-03 13:11 ` [PATCH 23/24] drm/amdkfd: add pc sampling capability check James Zhu
@ 2023-11-22 22:40   ` Felix Kuehling
  2023-11-23 16:06     ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-22 22:40 UTC (permalink / raw)
  To: James Zhu, amd-gfx; +Cc: joseph.greathouse, jamesz

On 2023-11-03 09:11, James Zhu wrote:
> From: David Yat Sin <david.yatsin@amd.com>
>
> Add pc sampling capability check.

This should be squashed into patch 2. Or if you want to keep it 
separate, put this patch before patch 2 and define AMDKFD_IOC_PC_SAMPLE 
with KFD_IOC_FLAG_PERFMON from the beginning.

Regards,
   Felix


>
> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 10 +++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    | 13 +++++++++++++
>   2 files changed, 22 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index b00390e451bf..5e47e374d824 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -3259,7 +3259,7 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>   			kfd_ioctl_set_debug_trap, 0),
>   
>   	AMDKFD_IOCTL_DEF(AMDKFD_IOC_PC_SAMPLE,
> -			kfd_ioctl_pc_sample, 0),
> +			kfd_ioctl_pc_sample, KFD_IOC_FLAG_PERFMON),
>   };
>   
>   #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
> @@ -3336,6 +3336,14 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
>   		}
>   	}
>   
> +	/* PC Sampling Monitor */
> +	if (unlikely(ioctl->flags & KFD_IOC_FLAG_PERFMON)) {
> +		if (!capable(CAP_PERFMON) && !capable(CAP_SYS_ADMIN)) {
> +			retcode = -EACCES;
> +			goto err_i1;
> +		}
> +	}
> +
>   	if (cmd & (IOC_IN | IOC_OUT)) {
>   		if (asize <= sizeof(stack_kdata)) {
>   			kdata = stack_kdata;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index b7062033fda4..236d3de85153 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -144,6 +144,19 @@ enum kfd_ioctl_flags {
>   	 * we also allow ioctls with SYS_ADMIN capability.
>   	 */
>   	KFD_IOC_FLAG_CHECKPOINT_RESTORE = BIT(0),
> +
> +	/*
> +	 * @KFD_IOC_FLAG_PERFMON:
> +	 * Performance monitoring feature, GPU performance monitoring can allow users
> +	 * to gather some information about other processes. PC sampling can allow
> +	 * users to infer information about wavefronts from other processes that are
> +	 * running on the same CUs, such as which execution units they are using. As
> +	 * such, this type of performance monitoring should be protected and only
> +	 * available to users with sufficient capabilities: either CAP_PERFMON, or,
> +	 * for backwards compatibility, CAP_SYS_ADMIN.
> +	 */
> +
> +	KFD_IOC_FLAG_PERFMON = BIT(1),
>   };
>   /*
>    * Kernel module parameter to specify maximum number of supported queues per

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 23/24] drm/amdkfd: add pc sampling capability check
  2023-11-22 22:40   ` Felix Kuehling
@ 2023-11-23 16:06     ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-23 16:06 UTC (permalink / raw)
  To: Felix Kuehling, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-22 17:40, Felix Kuehling wrote:
> On 2023-11-03 09:11, James Zhu wrote:
>> From: David Yat Sin <david.yatsin@amd.com>
>>
>> Add pc sampling capability check.
>
> This should be squashed into patch 2. Or if you want to keep it 
> separate, put this patch before patch 2 and define 
> AMDKFD_IOC_PC_SAMPLE with KFD_IOC_FLAG_PERFMON from the beginning.
[JZ] will do , thanks!
>
> Regards,
>   Felix
>
>
>>
>> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 10 +++++++++-
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    | 13 +++++++++++++
>>   2 files changed, 22 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index b00390e451bf..5e47e374d824 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -3259,7 +3259,7 @@ static const struct amdkfd_ioctl_desc 
>> amdkfd_ioctls[] = {
>>               kfd_ioctl_set_debug_trap, 0),
>>         AMDKFD_IOCTL_DEF(AMDKFD_IOC_PC_SAMPLE,
>> -            kfd_ioctl_pc_sample, 0),
>> +            kfd_ioctl_pc_sample, KFD_IOC_FLAG_PERFMON),
>>   };
>>     #define AMDKFD_CORE_IOCTL_COUNT    ARRAY_SIZE(amdkfd_ioctls)
>> @@ -3336,6 +3336,14 @@ static long kfd_ioctl(struct file *filep, 
>> unsigned int cmd, unsigned long arg)
>>           }
>>       }
>>   +    /* PC Sampling Monitor */
>> +    if (unlikely(ioctl->flags & KFD_IOC_FLAG_PERFMON)) {
>> +        if (!capable(CAP_PERFMON) && !capable(CAP_SYS_ADMIN)) {
>> +            retcode = -EACCES;
>> +            goto err_i1;
>> +        }
>> +    }
>> +
>>       if (cmd & (IOC_IN | IOC_OUT)) {
>>           if (asize <= sizeof(stack_kdata)) {
>>               kdata = stack_kdata;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index b7062033fda4..236d3de85153 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -144,6 +144,19 @@ enum kfd_ioctl_flags {
>>        * we also allow ioctls with SYS_ADMIN capability.
>>        */
>>       KFD_IOC_FLAG_CHECKPOINT_RESTORE = BIT(0),
>> +
>> +    /*
>> +     * @KFD_IOC_FLAG_PERFMON:
>> +     * Performance monitoring feature, GPU performance monitoring 
>> can allow users
>> +     * to gather some information about other processes. PC sampling 
>> can allow
>> +     * users to infer information about wavefronts from other 
>> processes that are
>> +     * running on the same CUs, such as which execution units they 
>> are using. As
>> +     * such, this type of performance monitoring should be protected 
>> and only
>> +     * available to users with sufficient capabilities: either 
>> CAP_PERFMON, or,
>> +     * for backwards compatibility, CAP_SYS_ADMIN.
>> +     */
>> +
>> +    KFD_IOC_FLAG_PERFMON = BIT(1),
>>   };
>>   /*
>>    * Kernel module parameter to specify maximum number of supported 
>> queues per

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 21/24] drm/amdkfd: add queue remapping
  2023-11-22 22:35   ` Felix Kuehling
@ 2023-11-23 16:25     ` James Zhu
  2023-11-23 19:02       ` Felix Kuehling
  0 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-23 16:25 UTC (permalink / raw)
  To: Felix Kuehling, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-22 17:35, Felix Kuehling wrote:
>
> On 2023-11-03 09:11, James Zhu wrote:
>> Add queue remapping to force the waves in any running
>> processes to complete a CWSR trap.
>
> Please add an explanation why this is needed.

[JZ] Even though the profiling-enabled bits are turned off, the CWSR trap 
handlers for some kernels of this process may still be running. This 
will force the waves in any running processes to complete a CWSR trap, 
and make sure pc sampling is completely stopped for this process. I will 
add it later.

>
>
>>
>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 11 +++++++++++
>>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  5 +++++
>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c          |  3 +++
>>   3 files changed, 19 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index c0e71543389a..a3f57be63f4f 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -3155,6 +3155,17 @@ int debug_refresh_runlist(struct 
>> device_queue_manager *dqm)
>>       return debug_map_and_unlock(dqm);
>>   }
>>   +void remap_queue(struct device_queue_manager *dqm,
>> +                enum kfd_unmap_queues_filter filter,
>> +                uint32_t filter_param,
>> +                uint32_t grace_period)
>
> Not sure if you need the filter and grace period parameters in this 
> function. What's the point of exposing that to callers who just want 
> to trigger a CWSR? You could also change the function name to reflect 
> the purpose of the function, rather than the implementation.
[JZ] I just wanted to create a general function in case it is used by 
others. I am fine with no longer passing filter_param/grace_period.
>
> Regards,
>   Felix
>
>
>> +{
>> +    dqm_lock(dqm);
>> +    if (!dqm->dev->kfd->shared_resources.enable_mes)
>> +        execute_queues_cpsch(dqm, filter, filter_param, grace_period);
>> +    dqm_unlock(dqm);
>> +}
>> +
>>   #if defined(CONFIG_DEBUG_FS)
>>     static void seq_reg_dump(struct seq_file *m,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> index cf7e182588f8..f8aae3747a36 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> @@ -303,6 +303,11 @@ int debug_lock_and_unmap(struct 
>> device_queue_manager *dqm);
>>   int debug_map_and_unlock(struct device_queue_manager *dqm);
>>   int debug_refresh_runlist(struct device_queue_manager *dqm);
>>   +void remap_queue(struct device_queue_manager *dqm,
>> +                enum kfd_unmap_queues_filter filter,
>> +                uint32_t filter_param,
>> +                uint32_t grace_period);
>> +
>>   static inline unsigned int get_sh_mem_bases_32(struct 
>> kfd_process_device *pdd)
>>   {
>>       return (pdd->lds_base >> 16) & 0xFF;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> index e8f0559b618e..66670cdb813a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> @@ -24,6 +24,7 @@
>>   #include "kfd_priv.h"
>>   #include "amdgpu_amdkfd.h"
>>   #include "kfd_pc_sampling.h"
>> +#include "kfd_device_queue_manager.h"
>>     struct supported_pc_sample_info {
>>       uint32_t ip_version;
>> @@ -164,6 +165,8 @@ static int kfd_pc_sample_stop(struct 
>> kfd_process_device *pdd,
>> cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
>> kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
>> +        remap_queue(pdd->dev->dqm,
>> +            KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, 
>> USE_DEFAULT_GRACE_PERIOD);
>>             mutex_lock(&pdd->dev->pcs_data.mutex);
>> pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 20/24] drm/amdkfd: enable pc sampling work to trigger trap
  2023-11-22 22:31   ` Felix Kuehling
@ 2023-11-23 18:27     ` James Zhu
  2023-11-23 19:08       ` Felix Kuehling
  0 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-23 18:27 UTC (permalink / raw)
  To: Felix Kuehling, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-22 17:31, Felix Kuehling wrote:
>
> On 2023-11-03 09:11, James Zhu wrote:
>> Enable a delay work to trigger pc sampling trap.
>>
>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_device.c      |  3 ++
>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 39 ++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |  1 +
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  1 +
>>   4 files changed, 44 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index bcaeedac8fe0..fb21902e433a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -35,6 +35,7 @@
>>   #include "kfd_migrate.h"
>>   #include "amdgpu.h"
>>   #include "amdgpu_xcp.h"
>> +#include "kfd_pc_sampling.h"
>>     #define MQD_SIZE_ALIGNED 768
>>   @@ -537,6 +538,8 @@ static void kfd_pc_sampling_init(struct 
>> kfd_node *dev)
>>   {
>>       mutex_init(&dev->pcs_data.mutex);
>> idr_init_base(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr, 1);
>> + INIT_WORK(&dev->pcs_data.hosttrap_entry.base.pc_sampling_work,
>> +        kfd_pc_sample_handler);
>>   }
>>     static void kfd_pc_sampling_exit(struct kfd_node *dev)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> index 2c4ac5b4cc4b..e8f0559b618e 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> @@ -38,6 +38,43 @@ struct supported_pc_sample_info 
>> supported_formats[] = {
>>       { IP_VERSION(9, 4, 2), &sample_info_hosttrap_9_0_0 },
>>   };
>>   +void kfd_pc_sample_handler(struct work_struct *work)
>> +{
>> +    struct amdgpu_device *adev;
>> +    struct kfd_node *node;
>> +    uint32_t timeout = 0;
>> +
>> +    node = container_of(work, struct kfd_node,
>> + pcs_data.hosttrap_entry.base.pc_sampling_work);
>> +
>> +    mutex_lock(&node->pcs_data.mutex);
>> +    if (node->pcs_data.hosttrap_entry.base.active_count &&
>> + node->pcs_data.hosttrap_entry.base.pc_sample_info.value &&
>> +        node->kfd2kgd->trigger_pc_sample_trap) {
>> +        switch 
>> (node->pcs_data.hosttrap_entry.base.pc_sample_info.type) {
>> +        case KFD_IOCTL_PCS_TYPE_TIME_US:
>> +            timeout = 
>> (uint32_t)node->pcs_data.hosttrap_entry.base.pc_sample_info.value;
>> +            break;
>> +        default:
>> +            pr_debug("PC Sampling type %d not supported.",
>> + node->pcs_data.hosttrap_entry.base.pc_sample_info.type);
>> +        }
>> +    }
>> +    mutex_unlock(&node->pcs_data.mutex);
>> +    if (!timeout)
>> +        return;
>> +
>> +    adev = node->adev;
>> +    while 
>> (!READ_ONCE(node->pcs_data.hosttrap_entry.base.stop_enable)) {
>
> This worker basically runs indefinitely (controlled by user mode).
>
>> + node->kfd2kgd->trigger_pc_sample_trap(adev, 
>> node->vm_info.last_vmid_kfd,
>> + &node->pcs_data.hosttrap_entry.base.target_simd,
>> + &node->pcs_data.hosttrap_entry.base.target_wave_slot,
>> + node->pcs_data.hosttrap_entry.base.pc_sample_info.method);
>> +        pr_debug_ratelimited("triggered a host trap.");
>> +        usleep_range(timeout, timeout + 10);
>
> This will cause drift of the interval. Instead what you should do, is 
> calculate the wait time at the end of every iteration based on the 
> current time and the interval.
[JZ] I am wondering what degree of accuracy is required for the interval. 
There is a HW timestamp with each pc sampling data packet.
>
>
>> +    }
>> +}
>> +
>>   static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
>>                       struct kfd_ioctl_pc_sample_args __user *user_args)
>>   {
>> @@ -101,6 +138,7 @@ static int kfd_pc_sample_start(struct 
>> kfd_process_device *pdd,
>>           } else {
>> kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, true);
>> + 
>> schedule_work(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
>
> Scheduling a worker that runs indefinitely on the system workqueue is 
> probably a bad idea. It could block other work items indefinitely. I 
> think you are misusing the work queue API here. What you really want 
> is probably to create a kernel thread.
[JZ] Yes, you are right. How about using alloc_workqueue to create a 
queue instead of using the system queue? Is alloc_workqueue more 
efficient than kernel thread creation?
>
> Regards,
>   Felix
>
>
>>               break;
>>           }
>>       }
>> @@ -123,6 +161,7 @@ static int kfd_pc_sample_stop(struct 
>> kfd_process_device *pdd,
>>       mutex_unlock(&pdd->dev->pcs_data.mutex);
>>         if (pc_sampling_stop) {
>> + 
>> cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
>> kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>> index 4eeded4ea5b6..cb93909e6bd3 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>> @@ -30,5 +30,6 @@
>>     int kfd_pc_sample(struct kfd_process_device *pdd,
>>                       struct kfd_ioctl_pc_sample_args __user *args);
>> +void kfd_pc_sample_handler(struct work_struct *work);
>>     #endif /* KFD_PC_SAMPLING_H_ */
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index badaa4d68cc4..b7062033fda4 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -263,6 +263,7 @@ struct kfd_dev_pc_sampling_data {
>>       uint32_t target_wave_slot;  /* target wave slot for trap */
>>       bool stop_enable;           /* pc sampling stop in process */
>>       struct idr pc_sampling_idr;
>> +    struct work_struct pc_sampling_work;
>>       struct kfd_pc_sample_info pc_sample_info;
>>   };

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 21/24] drm/amdkfd: add queue remapping
  2023-11-23 16:25     ` James Zhu
@ 2023-11-23 19:02       ` Felix Kuehling
  2023-11-23 19:49         ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-23 19:02 UTC (permalink / raw)
  To: James Zhu, James Zhu, amd-gfx; +Cc: joseph.greathouse

On 2023-11-23 11:25, James Zhu wrote:
>
> On 2023-11-22 17:35, Felix Kuehling wrote:
>>
>> On 2023-11-03 09:11, James Zhu wrote:
>>> Add queue remapping to force the waves in any running
>>> processes to complete a CWSR trap.
>>
>> Please add an explanation why this is needed.
>
> [JZ] Even though the profiling-enabled bits are turned off, the CWSR 
> trap handlers for some kernels in this process may still be running. 
> This will force the waves in any running processes to complete a CWSR 
> trap, and make sure PC sampling is completely stopped for this 
> process. I will add it later.

It may be confusing to talk specifically about "CWSR trap handler". 
There is only one trap handler that is triggered by different events: 
CWSR, host trap, s_trap instructions, exceptions, etc. When a new trap 
triggers, it serializes with any currently running trap handler in that 
wavefront. So it seems that you're using CWSR as a way to ensure that 
any host trap has completed: CWSR will wait for previous traps to finish 
before trapping again for CWSR, the HWS firmware waits for CWSR 
completion and the driver waits for HWS to finish CWSR with a fence on a 
HIQ QUERY_STATUS packet. Is that correct?

Regards,
   Felix


>
>>
>>
>>>
>>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 11 
>>> +++++++++++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  5 +++++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c          |  3 +++
>>>   3 files changed, 19 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index c0e71543389a..a3f57be63f4f 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -3155,6 +3155,17 @@ int debug_refresh_runlist(struct 
>>> device_queue_manager *dqm)
>>>       return debug_map_and_unlock(dqm);
>>>   }
>>>   +void remap_queue(struct device_queue_manager *dqm,
>>> +                enum kfd_unmap_queues_filter filter,
>>> +                uint32_t filter_param,
>>> +                uint32_t grace_period)
>>
>> Not sure if you need the filter and grace period parameters in this 
>> function. What's the point of exposing that to callers who just want 
>> to trigger a CWSR? You could also change the function name to reflect 
>> the purpose of the function, rather than the implementation.
> [JZ] I just wanted to create a general function in case it is used by 
> others. I am fine with no longer passing filter_param/grace_period.
>>
>> Regards,
>>   Felix
>>
>>
>>> +{
>>> +    dqm_lock(dqm);
>>> +    if (!dqm->dev->kfd->shared_resources.enable_mes)
>>> +        execute_queues_cpsch(dqm, filter, filter_param, grace_period);
>>> +    dqm_unlock(dqm);
>>> +}
>>> +
>>>   #if defined(CONFIG_DEBUG_FS)
>>>     static void seq_reg_dump(struct seq_file *m,
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>> index cf7e182588f8..f8aae3747a36 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>> @@ -303,6 +303,11 @@ int debug_lock_and_unmap(struct 
>>> device_queue_manager *dqm);
>>>   int debug_map_and_unlock(struct device_queue_manager *dqm);
>>>   int debug_refresh_runlist(struct device_queue_manager *dqm);
>>>   +void remap_queue(struct device_queue_manager *dqm,
>>> +                enum kfd_unmap_queues_filter filter,
>>> +                uint32_t filter_param,
>>> +                uint32_t grace_period);
>>> +
>>>   static inline unsigned int get_sh_mem_bases_32(struct 
>>> kfd_process_device *pdd)
>>>   {
>>>       return (pdd->lds_base >> 16) & 0xFF;
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>> index e8f0559b618e..66670cdb813a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>> @@ -24,6 +24,7 @@
>>>   #include "kfd_priv.h"
>>>   #include "amdgpu_amdkfd.h"
>>>   #include "kfd_pc_sampling.h"
>>> +#include "kfd_device_queue_manager.h"
>>>     struct supported_pc_sample_info {
>>>       uint32_t ip_version;
>>> @@ -164,6 +165,8 @@ static int kfd_pc_sample_stop(struct 
>>> kfd_process_device *pdd,
>>> cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work); 
>>>
>>> kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>>> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
>>> +        remap_queue(pdd->dev->dqm,
>>> +            KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, 
>>> USE_DEFAULT_GRACE_PERIOD);
>>>             mutex_lock(&pdd->dev->pcs_data.mutex);
>>> pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 20/24] drm/amdkfd: enable pc sampling work to trigger trap
  2023-11-23 18:27     ` James Zhu
@ 2023-11-23 19:08       ` Felix Kuehling
  2023-11-23 19:52         ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-23 19:08 UTC (permalink / raw)
  To: James Zhu, James Zhu, amd-gfx; +Cc: joseph.greathouse

On 2023-11-23 13:27, James Zhu wrote:
>
> On 2023-11-22 17:31, Felix Kuehling wrote:
>>
>> On 2023-11-03 09:11, James Zhu wrote:
>>> Enable a delay work to trigger pc sampling trap.
>>>
>>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/kfd_device.c      |  3 ++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 39 
>>> ++++++++++++++++++++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |  1 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  1 +
>>>   4 files changed, 44 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> index bcaeedac8fe0..fb21902e433a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> @@ -35,6 +35,7 @@
>>>   #include "kfd_migrate.h"
>>>   #include "amdgpu.h"
>>>   #include "amdgpu_xcp.h"
>>> +#include "kfd_pc_sampling.h"
>>>     #define MQD_SIZE_ALIGNED 768
>>>   @@ -537,6 +538,8 @@ static void kfd_pc_sampling_init(struct 
>>> kfd_node *dev)
>>>   {
>>>       mutex_init(&dev->pcs_data.mutex);
>>> idr_init_base(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr, 1);
>>> + INIT_WORK(&dev->pcs_data.hosttrap_entry.base.pc_sampling_work,
>>> +        kfd_pc_sample_handler);
>>>   }
>>>     static void kfd_pc_sampling_exit(struct kfd_node *dev)
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>> index 2c4ac5b4cc4b..e8f0559b618e 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>> @@ -38,6 +38,43 @@ struct supported_pc_sample_info 
>>> supported_formats[] = {
>>>       { IP_VERSION(9, 4, 2), &sample_info_hosttrap_9_0_0 },
>>>   };
>>>   +void kfd_pc_sample_handler(struct work_struct *work)
>>> +{
>>> +    struct amdgpu_device *adev;
>>> +    struct kfd_node *node;
>>> +    uint32_t timeout = 0;
>>> +
>>> +    node = container_of(work, struct kfd_node,
>>> + pcs_data.hosttrap_entry.base.pc_sampling_work);
>>> +
>>> +    mutex_lock(&node->pcs_data.mutex);
>>> +    if (node->pcs_data.hosttrap_entry.base.active_count &&
>>> + node->pcs_data.hosttrap_entry.base.pc_sample_info.value &&
>>> +        node->kfd2kgd->trigger_pc_sample_trap) {
>>> +        switch 
>>> (node->pcs_data.hosttrap_entry.base.pc_sample_info.type) {
>>> +        case KFD_IOCTL_PCS_TYPE_TIME_US:
>>> +            timeout = 
>>> (uint32_t)node->pcs_data.hosttrap_entry.base.pc_sample_info.value;
>>> +            break;
>>> +        default:
>>> +            pr_debug("PC Sampling type %d not supported.",
>>> + node->pcs_data.hosttrap_entry.base.pc_sample_info.type);
>>> +        }
>>> +    }
>>> +    mutex_unlock(&node->pcs_data.mutex);
>>> +    if (!timeout)
>>> +        return;
>>> +
>>> +    adev = node->adev;
>>> +    while 
>>> (!READ_ONCE(node->pcs_data.hosttrap_entry.base.stop_enable)) {
>>
>> This worker basically runs indefinitely (controlled by user mode).
>>
>>> + node->kfd2kgd->trigger_pc_sample_trap(adev, 
>>> node->vm_info.last_vmid_kfd,
>>> + &node->pcs_data.hosttrap_entry.base.target_simd,
>>> + &node->pcs_data.hosttrap_entry.base.target_wave_slot,
>>> + node->pcs_data.hosttrap_entry.base.pc_sample_info.method);
>>> +        pr_debug_ratelimited("triggered a host trap.");
>>> +        usleep_range(timeout, timeout + 10);
>>
>> This will cause drift of the interval. Instead what you should do, is 
>> calculate the wait time at the end of every iteration based on the 
>> current time and the interval.
> [JZ] I am wondering what degree of accuracy is required for the 
> interval, since there is a HW timestamp with each PC sampling data packet.
>>
>>
>>> +    }
>>> +}
>>> +
>>>   static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
>>>                       struct kfd_ioctl_pc_sample_args __user 
>>> *user_args)
>>>   {
>>> @@ -101,6 +138,7 @@ static int kfd_pc_sample_start(struct 
>>> kfd_process_device *pdd,
>>>           } else {
>>> kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>>> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, true);
>>> + 
>>> schedule_work(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
>>
>> Scheduling a worker that runs indefinitely on the system workqueue is 
>> probably a bad idea. It could block other work items indefinitely. I 
>> think you are misusing the work queue API here. What you really want 
>> is probably to create a kernel thread.
> [JZ] Yes, you are right. How about using alloc_workqueue to create a 
> queue instead of using the system queue? Is alloc_workqueue more 
> efficient than creating a kernel thread?

A work queue can create many kernel threads to handle the execution of 
work items. You really only need a single kernel thread per GPU for 
time-based PC sampling. IMO the work queue just adds a bunch of 
overhead. Using a work queue for something that runs indefinitely feels 
like an abuse of the API. I don't have much experience with creating 
kernel threads directly. See include/linux/kthread.h. If you want to 
look for an example, it seems drivers/gpu/drm/scheduler uses the kthread 
API.

Regards,
   Felix


>>
>> Regards,
>>   Felix
>>
>>
>>>               break;
>>>           }
>>>       }
>>> @@ -123,6 +161,7 @@ static int kfd_pc_sample_stop(struct 
>>> kfd_process_device *pdd,
>>>       mutex_unlock(&pdd->dev->pcs_data.mutex);
>>>         if (pc_sampling_stop) {
>>> + 
>>> cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
>>> kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>>> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
>>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>>> index 4eeded4ea5b6..cb93909e6bd3 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>>> @@ -30,5 +30,6 @@
>>>     int kfd_pc_sample(struct kfd_process_device *pdd,
>>>                       struct kfd_ioctl_pc_sample_args __user *args);
>>> +void kfd_pc_sample_handler(struct work_struct *work);
>>>     #endif /* KFD_PC_SAMPLING_H_ */
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index badaa4d68cc4..b7062033fda4 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -263,6 +263,7 @@ struct kfd_dev_pc_sampling_data {
>>>       uint32_t target_wave_slot;  /* target wave slot for trap */
>>>       bool stop_enable;           /* pc sampling stop in process */
>>>       struct idr pc_sampling_idr;
>>> +    struct work_struct pc_sampling_work;
>>>       struct kfd_pc_sample_info pc_sample_info;
>>>   };

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 21/24] drm/amdkfd: add queue remapping
  2023-11-23 19:02       ` Felix Kuehling
@ 2023-11-23 19:49         ` James Zhu
  2023-11-23 22:41           ` Greathouse, Joseph
  0 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-23 19:49 UTC (permalink / raw)
  To: Felix Kuehling, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-23 14:02, Felix Kuehling wrote:
> On 2023-11-23 11:25, James Zhu wrote:
>>
>> On 2023-11-22 17:35, Felix Kuehling wrote:
>>>
>>> On 2023-11-03 09:11, James Zhu wrote:
>>>> Add queue remapping to force the waves in any running
>>>> processes to complete a CWSR trap.
>>>
>>> Please add an explanation why this is needed.
>>
>> [JZ] Even though the profiling-enabled bits are turned off, the CWSR 
>> trap handlers for some kernels in this process may still be running. 
>> This will force the waves in any running processes to complete a CWSR 
>> trap, and make sure PC sampling is completely stopped for this 
>> process. I will add it later.
>
> It may be confusing to talk specifically about "CWSR trap handler". 
> There is only one trap handler that is triggered by different events: 
> CWSR, host trap, s_trap instructions, exceptions, etc. When a new trap 
> triggers, it serializes with any currently running trap handler in 
> that wavefront. So it seems that you're using CWSR as a way to ensure 
> that any host trap has completed: CWSR will wait for previous traps to 
> finish before trapping again for CWSR, the HWS firmware waits for CWSR 
> completion and the driver waits for HWS to finish CWSR with a fence on 
> a HIQ QUERY_STATUS packet. Is that correct?
[JZ] I think your explanation is more detailed. We need Joseph to confirm.
>
> Regards,
>   Felix
>
>
>>
>>>
>>>
>>>>
>>>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 11 
>>>> +++++++++++
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  5 +++++
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c          |  3 +++
>>>>   3 files changed, 19 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> index c0e71543389a..a3f57be63f4f 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> @@ -3155,6 +3155,17 @@ int debug_refresh_runlist(struct 
>>>> device_queue_manager *dqm)
>>>>       return debug_map_and_unlock(dqm);
>>>>   }
>>>>   +void remap_queue(struct device_queue_manager *dqm,
>>>> +                enum kfd_unmap_queues_filter filter,
>>>> +                uint32_t filter_param,
>>>> +                uint32_t grace_period)
>>>
>>> Not sure if you need the filter and grace period parameters in this 
>>> function. What's the point of exposing that to callers who just want 
>>> to trigger a CWSR? You could also change the function name to 
>>> reflect the purpose of the function, rather than the implementation.
>> [JZ] I just wanted to create a general function in case it is used by 
>> others. I am fine with no longer passing filter_param/grace_period.
>>>
>>> Regards,
>>>   Felix
>>>
>>>
>>>> +{
>>>> +    dqm_lock(dqm);
>>>> +    if (!dqm->dev->kfd->shared_resources.enable_mes)
>>>> +        execute_queues_cpsch(dqm, filter, filter_param, 
>>>> grace_period);
>>>> +    dqm_unlock(dqm);
>>>> +}
>>>> +
>>>>   #if defined(CONFIG_DEBUG_FS)
>>>>     static void seq_reg_dump(struct seq_file *m,
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>> index cf7e182588f8..f8aae3747a36 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>> @@ -303,6 +303,11 @@ int debug_lock_and_unmap(struct 
>>>> device_queue_manager *dqm);
>>>>   int debug_map_and_unlock(struct device_queue_manager *dqm);
>>>>   int debug_refresh_runlist(struct device_queue_manager *dqm);
>>>>   +void remap_queue(struct device_queue_manager *dqm,
>>>> +                enum kfd_unmap_queues_filter filter,
>>>> +                uint32_t filter_param,
>>>> +                uint32_t grace_period);
>>>> +
>>>>   static inline unsigned int get_sh_mem_bases_32(struct 
>>>> kfd_process_device *pdd)
>>>>   {
>>>>       return (pdd->lds_base >> 16) & 0xFF;
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>> index e8f0559b618e..66670cdb813a 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>> @@ -24,6 +24,7 @@
>>>>   #include "kfd_priv.h"
>>>>   #include "amdgpu_amdkfd.h"
>>>>   #include "kfd_pc_sampling.h"
>>>> +#include "kfd_device_queue_manager.h"
>>>>     struct supported_pc_sample_info {
>>>>       uint32_t ip_version;
>>>> @@ -164,6 +165,8 @@ static int kfd_pc_sample_stop(struct 
>>>> kfd_process_device *pdd,
>>>> cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work); 
>>>>
>>>> kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>>>> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
>>>> +        remap_queue(pdd->dev->dqm,
>>>> +            KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, 
>>>> USE_DEFAULT_GRACE_PERIOD);
>>>>             mutex_lock(&pdd->dev->pcs_data.mutex);
>>>> pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 20/24] drm/amdkfd: enable pc sampling work to trigger trap
  2023-11-23 19:08       ` Felix Kuehling
@ 2023-11-23 19:52         ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-23 19:52 UTC (permalink / raw)
  To: Felix Kuehling, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-23 14:08, Felix Kuehling wrote:
> On 2023-11-23 13:27, James Zhu wrote:
>>
>> On 2023-11-22 17:31, Felix Kuehling wrote:
>>>
>>> On 2023-11-03 09:11, James Zhu wrote:
>>>> Enable a delay work to trigger pc sampling trap.
>>>>
>>>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_device.c      |  3 ++
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 39 
>>>> ++++++++++++++++++++
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |  1 +
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  1 +
>>>>   4 files changed, 44 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>> index bcaeedac8fe0..fb21902e433a 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>> @@ -35,6 +35,7 @@
>>>>   #include "kfd_migrate.h"
>>>>   #include "amdgpu.h"
>>>>   #include "amdgpu_xcp.h"
>>>> +#include "kfd_pc_sampling.h"
>>>>     #define MQD_SIZE_ALIGNED 768
>>>>   @@ -537,6 +538,8 @@ static void kfd_pc_sampling_init(struct 
>>>> kfd_node *dev)
>>>>   {
>>>>       mutex_init(&dev->pcs_data.mutex);
>>>> idr_init_base(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr, 1);
>>>> + INIT_WORK(&dev->pcs_data.hosttrap_entry.base.pc_sampling_work,
>>>> +        kfd_pc_sample_handler);
>>>>   }
>>>>     static void kfd_pc_sampling_exit(struct kfd_node *dev)
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>> index 2c4ac5b4cc4b..e8f0559b618e 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>> @@ -38,6 +38,43 @@ struct supported_pc_sample_info 
>>>> supported_formats[] = {
>>>>       { IP_VERSION(9, 4, 2), &sample_info_hosttrap_9_0_0 },
>>>>   };
>>>>   +void kfd_pc_sample_handler(struct work_struct *work)
>>>> +{
>>>> +    struct amdgpu_device *adev;
>>>> +    struct kfd_node *node;
>>>> +    uint32_t timeout = 0;
>>>> +
>>>> +    node = container_of(work, struct kfd_node,
>>>> + pcs_data.hosttrap_entry.base.pc_sampling_work);
>>>> +
>>>> +    mutex_lock(&node->pcs_data.mutex);
>>>> +    if (node->pcs_data.hosttrap_entry.base.active_count &&
>>>> + node->pcs_data.hosttrap_entry.base.pc_sample_info.value &&
>>>> +        node->kfd2kgd->trigger_pc_sample_trap) {
>>>> +        switch 
>>>> (node->pcs_data.hosttrap_entry.base.pc_sample_info.type) {
>>>> +        case KFD_IOCTL_PCS_TYPE_TIME_US:
>>>> +            timeout = 
>>>> (uint32_t)node->pcs_data.hosttrap_entry.base.pc_sample_info.value;
>>>> +            break;
>>>> +        default:
>>>> +            pr_debug("PC Sampling type %d not supported.",
>>>> + node->pcs_data.hosttrap_entry.base.pc_sample_info.type);
>>>> +        }
>>>> +    }
>>>> +    mutex_unlock(&node->pcs_data.mutex);
>>>> +    if (!timeout)
>>>> +        return;
>>>> +
>>>> +    adev = node->adev;
>>>> +    while 
>>>> (!READ_ONCE(node->pcs_data.hosttrap_entry.base.stop_enable)) {
>>>
>>> This worker basically runs indefinitely (controlled by user mode).
>>>
>>>> + node->kfd2kgd->trigger_pc_sample_trap(adev, 
>>>> node->vm_info.last_vmid_kfd,
>>>> + &node->pcs_data.hosttrap_entry.base.target_simd,
>>>> + &node->pcs_data.hosttrap_entry.base.target_wave_slot,
>>>> + node->pcs_data.hosttrap_entry.base.pc_sample_info.method);
>>>> +        pr_debug_ratelimited("triggered a host trap.");
>>>> +        usleep_range(timeout, timeout + 10);
>>>
>>> This will cause drift of the interval. Instead what you should do, 
>>> is calculate the wait time at the end of every iteration based on 
>>> the current time and the interval.
>> [JZ] I am wondering what degree of accuracy is required for the 
>> interval, since there is a HW timestamp with each PC sampling data packet.
>>>
>>>
>>>> +    }
>>>> +}
>>>> +
>>>>   static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
>>>>                       struct kfd_ioctl_pc_sample_args __user 
>>>> *user_args)
>>>>   {
>>>> @@ -101,6 +138,7 @@ static int kfd_pc_sample_start(struct 
>>>> kfd_process_device *pdd,
>>>>           } else {
>>>> kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>>>> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, true);
>>>> + 
>>>> schedule_work(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
>>>
>>> Scheduling a worker that runs indefinitely on the system workqueue 
>>> is probably a bad idea. It could block other work items 
>>> indefinitely. I think you are misusing the work queue API here. What 
>>> you really want is probably to create a kernel thread.
>> [JZ] Yes, you are right. How about using alloc_workqueue to create a 
>> queue instead of using the system queue? Is alloc_workqueue more 
>> efficient than creating a kernel thread?
>
> A work queue can create many kernel threads to handle the execution of 
> work items. You really only need a single kernel thread per GPU for 
> time-based PC sampling. IMO the work queue just adds a bunch of 
> overhead. Using a work queue for something that runs indefinitely 
> feels like an abuse of the API. I don't have much experience with 
> creating kernel threads directly. See include/linux/kthread.h. If you 
> want to look for an example, it seems drivers/gpu/drm/scheduler uses 
> the kthread API.
[JZ] Then let me switch to a kthread.
>
> Regards,
>   Felix
>
>
>>>
>>> Regards,
>>>   Felix
>>>
>>>
>>>>               break;
>>>>           }
>>>>       }
>>>> @@ -123,6 +161,7 @@ static int kfd_pc_sample_stop(struct 
>>>> kfd_process_device *pdd,
>>>>       mutex_unlock(&pdd->dev->pcs_data.mutex);
>>>>         if (pc_sampling_stop) {
>>>> + 
>>>> cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
>>>> kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>>>> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
>>>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>>>> index 4eeded4ea5b6..cb93909e6bd3 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
>>>> @@ -30,5 +30,6 @@
>>>>     int kfd_pc_sample(struct kfd_process_device *pdd,
>>>>                       struct kfd_ioctl_pc_sample_args __user *args);
>>>> +void kfd_pc_sample_handler(struct work_struct *work);
>>>>     #endif /* KFD_PC_SAMPLING_H_ */
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> index badaa4d68cc4..b7062033fda4 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> @@ -263,6 +263,7 @@ struct kfd_dev_pc_sampling_data {
>>>>       uint32_t target_wave_slot;  /* target wave slot for trap */
>>>>       bool stop_enable;           /* pc sampling stop in process */
>>>>       struct idr pc_sampling_idr;
>>>> +    struct work_struct pc_sampling_work;
>>>>       struct kfd_pc_sample_info pc_sample_info;
>>>>   };

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 18/24] drm/amdkfd: enable pc sampling start
  2023-11-22 22:27   ` Felix Kuehling
@ 2023-11-23 20:01     ` James Zhu
  2023-11-23 20:21       ` Felix Kuehling
  0 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-23 20:01 UTC (permalink / raw)
  To: Felix Kuehling, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-22 17:27, Felix Kuehling wrote:
>
> On 2023-11-03 09:11, James Zhu wrote:
>> Enable pc sampling start.
>>
>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 26 +++++++++++++++++---
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  2 ++
>>   2 files changed, 25 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> index 60b29b245db5..33d003ca0093 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> @@ -83,9 +83,29 @@ static int kfd_pc_sample_query_cap(struct 
>> kfd_process_device *pdd,
>>       return 0;
>>   }
>>   -static int kfd_pc_sample_start(struct kfd_process_device *pdd)
>> +static int kfd_pc_sample_start(struct kfd_process_device *pdd,
>> +                    struct pc_sampling_entry *pcs_entry)
>>   {
>> -    return -EINVAL;
>> +    bool pc_sampling_start = false;
>> +
>> +    pcs_entry->enabled = true;
>> +    mutex_lock(&pdd->dev->pcs_data.mutex);
>> +    if (!pdd->dev->pcs_data.hosttrap_entry.base.active_count)
>> +        pc_sampling_start = true;
>> + pdd->dev->pcs_data.hosttrap_entry.base.active_count++;
>> +    mutex_unlock(&pdd->dev->pcs_data.mutex);
>> +
>> +    while (pc_sampling_start) {
>> +        if 
>> (READ_ONCE(pdd->dev->pcs_data.hosttrap_entry.base.stop_enable)) {
>> +            usleep_range(1000, 2000);
>
> I don't understand why you need this synchronization through 
> stop_enable. Why can't you do both the start and stop while holding 
> the mutex? It's just setting a flag in the TMA, so it's not a 
> time-consuming operation, and I don't see any potential for deadlocks.
[JZ] Stop does more than just set a flag in the TMA: it also needs to wait 
for the current PC sampling to stop completely and reset some initial settings.
>
> Regards,
>   Felix
>
>
>> +        } else {
>> + kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>> + pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, true);
>> +            break;
>> +        }
>> +    }
>> +
>> +    return 0;
>>   }
>>     static int kfd_pc_sample_stop(struct kfd_process_device *pdd)
>> @@ -225,7 +245,7 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
>>           if (pcs_entry->enabled)
>>               return -EALREADY;
>>           else
>> -            return kfd_pc_sample_start(pdd);
>> +            return kfd_pc_sample_start(pdd, pcs_entry);
>>         case KFD_IOCTL_PCS_OP_STOP:
>>           if (!pcs_entry->enabled)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 6670534f47b8..613910e0d440 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -258,6 +258,8 @@ struct kfd_dev;
>>     struct kfd_dev_pc_sampling_data {
>>       uint32_t use_count;         /* Num of PC sampling sessions */
>> +    uint32_t active_count;      /* Num of active sessions */
>> +    bool stop_enable;           /* pc sampling stop in process */
>>       struct idr pc_sampling_idr;
>>       struct kfd_pc_sample_info pc_sample_info;
>>   };

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 07/24] drm/amdkfd: check pcs_enrty valid
  2023-11-22 22:15   ` [PATCH " Felix Kuehling
@ 2023-11-23 20:18     ` James Zhu
  2023-11-23 20:32       ` Felix Kuehling
  0 siblings, 1 reply; 80+ messages in thread
From: James Zhu @ 2023-11-23 20:18 UTC (permalink / raw)
  To: Felix Kuehling, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-22 17:15, Felix Kuehling wrote:
>
> On 2023-11-03 09:11, James Zhu wrote:
>> Check pcs_enrty valid for pc sampling ioctl.
>>
>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 30 ++++++++++++++++++--
>>   1 file changed, 27 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> index 4c9fc48e1a6a..36366c8847de 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> @@ -179,6 +179,21 @@ static int kfd_pc_sample_destroy(struct 
>> kfd_process_device *pdd, uint32_t trace_
>>   int kfd_pc_sample(struct kfd_process_device *pdd,
>>                       struct kfd_ioctl_pc_sample_args __user *args)
>>   {
>> +    struct pc_sampling_entry *pcs_entry;
>> +
>> +    if (args->op != KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES &&
>> +        args->op != KFD_IOCTL_PCS_OP_CREATE) {
>> +
>> +        mutex_lock(&pdd->dev->pcs_data.mutex);
>> +        pcs_entry = 
>> idr_find(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr,
>> +                args->trace_id);
>> +        mutex_unlock(&pdd->dev->pcs_data.mutex);
>
> You need to keep holding the lock while the pcs_entry is still used. 
> That includes any of the kfd_pc_sample_<op> functions below. Otherwise 
> someone could free it concurrently. It would also simplify the 
> ..._<op> functions, if they didn't have to worry about the locking 
> themselves.
[JZ] pcs_entry is only used by this PC sampling process, which is 
protected by kfd_process->mutex here.
>
> Regards,
>   Felix
>
>
>> +
>> +        if (!pcs_entry ||
>> +            pcs_entry->pdd != pdd)
>> +            return -EINVAL;
>> +    }
>> +
>>       switch (args->op) {
>>       case KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES:
>>           return kfd_pc_sample_query_cap(pdd, args);
>> @@ -187,13 +202,22 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
>>           return kfd_pc_sample_create(pdd, args);
>>         case KFD_IOCTL_PCS_OP_DESTROY:
>> -        return kfd_pc_sample_destroy(pdd, args->trace_id);
>> +        if (pcs_entry->enabled)
>> +            return -EBUSY;
>> +        else
>> +            return kfd_pc_sample_destroy(pdd, args->trace_id);
>>         case KFD_IOCTL_PCS_OP_START:
>> -        return kfd_pc_sample_start(pdd);
>> +        if (pcs_entry->enabled)
>> +            return -EALREADY;
>> +        else
>> +            return kfd_pc_sample_start(pdd);
>>         case KFD_IOCTL_PCS_OP_STOP:
>> -        return kfd_pc_sample_stop(pdd);
>> +        if (!pcs_entry->enabled)
>> +            return -EALREADY;
>> +        else
>> +            return kfd_pc_sample_stop(pdd);
>>       }
>>         return -EINVAL;

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 18/24] drm/amdkfd: enable pc sampling start
  2023-11-23 20:01     ` James Zhu
@ 2023-11-23 20:21       ` Felix Kuehling
  2023-11-23 22:00         ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-23 20:21 UTC (permalink / raw)
  To: James Zhu, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-23 15:01, James Zhu wrote:
>
> On 2023-11-22 17:27, Felix Kuehling wrote:
>>
>> On 2023-11-03 09:11, James Zhu wrote:
>>> Enable pc sampling start.
>>>
>>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 26 
>>> +++++++++++++++++---
>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  2 ++
>>>   2 files changed, 25 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>> index 60b29b245db5..33d003ca0093 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>> @@ -83,9 +83,29 @@ static int kfd_pc_sample_query_cap(struct 
>>> kfd_process_device *pdd,
>>>       return 0;
>>>   }
>>>   -static int kfd_pc_sample_start(struct kfd_process_device *pdd)
>>> +static int kfd_pc_sample_start(struct kfd_process_device *pdd,
>>> +                    struct pc_sampling_entry *pcs_entry)
>>>   {
>>> -    return -EINVAL;
>>> +    bool pc_sampling_start = false;
>>> +
>>> +    pcs_entry->enabled = true;
>>> +    mutex_lock(&pdd->dev->pcs_data.mutex);
>>> +    if (!pdd->dev->pcs_data.hosttrap_entry.base.active_count)
>>> +        pc_sampling_start = true;
>>> + pdd->dev->pcs_data.hosttrap_entry.base.active_count++;
>>> +    mutex_unlock(&pdd->dev->pcs_data.mutex);
>>> +
>>> +    while (pc_sampling_start) {
>>> +        if 
>>> (READ_ONCE(pdd->dev->pcs_data.hosttrap_entry.base.stop_enable)) {
>>> +            usleep_range(1000, 2000);
>>
>> I don't understand why you need this synchronization through 
>> stop_enable. Why can't you do both the start and stop while holding 
>> the mutex? It's just setting a flag in the TMA, so it's not a 
>> time-consuming operation, and I don't see any potential for deadlocks.
> [JZ] Stop does more than just set the TMA flag: it needs to wait for the 
> current pc sampling to stop completely and reset some initial settings.

I think that's being obfuscated by how you split up this patch series. 
Maybe if you squash the queue remapping patch into this one, it would be 
more obvious what's really happening when you stop sampling and would 
make it easier to review the synchronization and locking strategy.

Regards,
   Felix


>>
>> Regards,
>>   Felix
>>
>>
>>> +        } else {
>>> + kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>>> + pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, true);
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    return 0;
>>>   }
>>>     static int kfd_pc_sample_stop(struct kfd_process_device *pdd)
>>> @@ -225,7 +245,7 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
>>>           if (pcs_entry->enabled)
>>>               return -EALREADY;
>>>           else
>>> -            return kfd_pc_sample_start(pdd);
>>> +            return kfd_pc_sample_start(pdd, pcs_entry);
>>>         case KFD_IOCTL_PCS_OP_STOP:
>>>           if (!pcs_entry->enabled)
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 6670534f47b8..613910e0d440 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -258,6 +258,8 @@ struct kfd_dev;
>>>     struct kfd_dev_pc_sampling_data {
>>>       uint32_t use_count;         /* Num of PC sampling sessions */
>>> +    uint32_t active_count;      /* Num of active sessions */
>>> +    bool stop_enable;           /* pc sampling stop in process */
>>>       struct idr pc_sampling_idr;
>>>       struct kfd_pc_sample_info pc_sample_info;
>>>   };

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 06/24] drm/amdkfd: add trace_id return
  2023-11-22 21:56   ` Felix Kuehling
@ 2023-11-23 20:22     ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-23 20:22 UTC (permalink / raw)
  To: Felix Kuehling, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-22 16:56, Felix Kuehling wrote:
>
> On 2023-11-03 09:11, James Zhu wrote:
>> Add trace_id return for new pc sampling creation per device,
>> Use IDR to quickly locate pc_sampling_entry for reference.
>>
>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_device.c      |  2 ++
>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 20 +++++++++++++++++++-
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  6 ++++++
>>   3 files changed, 27 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index 0e24e011f66b..bcaeedac8fe0 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -536,10 +536,12 @@ static void kfd_smi_init(struct kfd_node *dev)
>>   static void kfd_pc_sampling_init(struct kfd_node *dev)
>>   {
>>       mutex_init(&dev->pcs_data.mutex);
>> + idr_init_base(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr, 1);
>>   }
>>     static void kfd_pc_sampling_exit(struct kfd_node *dev)
>>   {
>> + idr_destroy(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr);
>>       mutex_destroy(&dev->pcs_data.mutex);
>>   }
>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> index f0d910ee730c..4c9fc48e1a6a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> @@ -99,6 +99,7 @@ static int kfd_pc_sample_create(struct 
>> kfd_process_device *pdd,
>>   {
>>       struct kfd_pc_sample_info *supported_format = NULL;
>>       struct kfd_pc_sample_info user_info;
>> +    struct pc_sampling_entry *pcs_entry;
>>       int ret;
>>       int i;
>>   @@ -140,7 +141,19 @@ static int kfd_pc_sample_create(struct 
>> kfd_process_device *pdd,
>>           return ret ? ret : -EEXIST;
>>       }
>>   -    /* TODO: add trace_id return */
>> +    pcs_entry = kvzalloc(sizeof(*pcs_entry), GFP_KERNEL);
>
> I don't see a reason to use kvzalloc here. You know the size of the 
> structure, so kzalloc should be perfectly fine.
[JZ] Sure, will change to kzalloc
>
>
>> +    if (!pcs_entry) {
>> +        mutex_unlock(&pdd->dev->pcs_data.mutex);
>> +        return -ENOMEM;
>> +    }
>> +
>> +    i = 
>> idr_alloc_cyclic(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr,
>> +                pcs_entry, 1, 0, GFP_KERNEL);
>> +    if (i < 0) {
>> +        mutex_unlock(&pdd->dev->pcs_data.mutex);
>> +        kvfree(pcs_entry);
>
> kfree
>
>
>> +        return i;
>> +    }
>>         if (!pdd->dev->pcs_data.hosttrap_entry.base.use_count)
>> memcpy(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
>> @@ -149,6 +162,11 @@ static int kfd_pc_sample_create(struct 
>> kfd_process_device *pdd,
>>       pdd->dev->pcs_data.hosttrap_entry.base.use_count++;
>>       mutex_unlock(&pdd->dev->pcs_data.mutex);
>>   +    pcs_entry->pdd = pdd;
>> +    user_args->trace_id = (uint32_t)i;
>
> I suspect this should be done inside the lock. You don't want someone 
> looking up the pcs_entry before it has been initialized.
[JZ] pcs_entry is only used by this pc sampling process, and it is 
protected by kfd_process->mutex.
>
> Regards,
>   Felix
>
>
>> +
>> +    pr_debug("alloc pcs_entry = %p, trace_id = 0x%x on gpu 0x%x", 
>> pcs_entry, i, pdd->dev->id);
>> +
>>       return 0;
>>   }
>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 81c925fb2952..642558026d16 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -258,6 +258,7 @@ struct kfd_dev;
>>     struct kfd_dev_pc_sampling_data {
>>       uint32_t use_count;         /* Num of PC sampling sessions */
>> +    struct idr pc_sampling_idr;
>>       struct kfd_pc_sample_info pc_sample_info;
>>   };
>>   @@ -743,6 +744,11 @@ enum kfd_pdd_bound {
>>    */
>>   #define SDMA_ACTIVITY_DIVISOR  100
>>   +struct pc_sampling_entry {
>> +    bool enabled;
>> +    struct kfd_process_device *pdd;
>> +};
>> +
>>   /* Data that is per-process-per device. */
>>   struct kfd_process_device {
>>       /* The device that owns this data. */

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 05/24] drm/amdkfd: enable pc sampling create
  2023-11-22 21:51   ` Felix Kuehling
@ 2023-11-23 20:25     ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-23 20:25 UTC (permalink / raw)
  To: Felix Kuehling, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-22 16:51, Felix Kuehling wrote:
>
> On 2023-11-03 09:11, James Zhu wrote:
>> From: David Yat Sin <david.yatsin@amd.com>
>>
>> Enable pc sampling create.
>>
>> Co-developed-by: James Zhu <James.Zhu@amd.com>
>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 54 +++++++++++++++++++-
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        | 10 ++++
>>   2 files changed, 63 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> index 49fecbc7013e..f0d910ee730c 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>> @@ -97,7 +97,59 @@ static int kfd_pc_sample_stop(struct 
>> kfd_process_device *pdd)
>>   static int kfd_pc_sample_create(struct kfd_process_device *pdd,
>>                       struct kfd_ioctl_pc_sample_args __user *user_args)
>>   {
>> -    return -EINVAL;
>> +    struct kfd_pc_sample_info *supported_format = NULL;
>> +    struct kfd_pc_sample_info user_info;
>> +    int ret;
>> +    int i;
>> +
>> +    if (user_args->num_sample_info != 1)
>> +        return -EINVAL;
>> +
>> +    ret = copy_from_user(&user_info, (void __user *) 
>> user_args->sample_info_ptr,
>> +                sizeof(struct kfd_pc_sample_info));
>> +    if (ret) {
>> +        pr_debug("Failed to copy PC sampling info from user\n");
>> +        return -EFAULT;
>> +    }
>> +
>> +    for (i = 0; i < ARRAY_SIZE(supported_formats); i++) {
>> +        if (KFD_GC_VERSION(pdd->dev) == supported_formats[i].ip_version
>> +            && user_info.method == 
>> supported_formats[i].sample_info->method
>> +            && user_info.type == supported_formats[i].sample_info->type
>> +            && user_info.value <= 
>> supported_formats[i].sample_info->value_max
>> +            && user_info.value >= 
>> supported_formats[i].sample_info->value_min) {
>> +            supported_format =
>> +                (struct kfd_pc_sample_info 
>> *)supported_formats[i].sample_info;
>> +            break;
>> +        }
>> +    }
>> +
>> +    if (!supported_format) {
>> +        pr_debug("Sampling format is not supported!");
>> +        return -EOPNOTSUPP;
>> +    }
>> +
>> +    mutex_lock(&pdd->dev->pcs_data.mutex);
>> +    if (pdd->dev->pcs_data.hosttrap_entry.base.use_count &&
>> + memcmp(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
>> +                &user_info, sizeof(user_info))) {
>
> I think you can compare structures in C. This would be more readable:
>
>     if (pdd->dev->pcs_data.hosttrap_entry.base.use_count &&
> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info != user_info) {
>         ...
>     }
> [JZ] Sure
>
>> +        ret = copy_to_user((void __user *) user_args->sample_info_ptr,
>> + &pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
>> +            sizeof(struct kfd_pc_sample_info));
>> +        mutex_unlock(&pdd->dev->pcs_data.mutex);
>> +        return ret ? ret : -EEXIST;
>
> When copy_to_user fails, it returns the number of bytes not copied. 
> That's not a useful return value here. This should be
>
>         return ret ? -EFAULT : -EEXIST;
>
> Also -EBUSY may be more appropriate than -EEXIST.
[JZ] Sure
>
>
>> +    }
>> +
>> +    /* TODO: add trace_id return */
>> +
>> +    if (!pdd->dev->pcs_data.hosttrap_entry.base.use_count)
>> + memcpy(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
>> +                &user_info, sizeof(user_info));
>
> I think you can assign structures in C. Just do
>
> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info = user_info;
> [JZ] Sure
> Regards,
>   Felix
>
>
>> +
>> +    pdd->dev->pcs_data.hosttrap_entry.base.use_count++;
>> +    mutex_unlock(&pdd->dev->pcs_data.mutex);
>> +
>> +    return 0;
>>   }
>>     static int kfd_pc_sample_destroy(struct kfd_process_device *pdd, 
>> uint32_t trace_id)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 4a0b66189c67..81c925fb2952 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -256,9 +256,19 @@ struct kfd_vmid_info {
>>     struct kfd_dev;
>>   +struct kfd_dev_pc_sampling_data {
>> +    uint32_t use_count;         /* Num of PC sampling sessions */
>> +    struct kfd_pc_sample_info pc_sample_info;
>> +};
>> +
>> +struct kfd_dev_pcs_hosttrap {
>> +    struct kfd_dev_pc_sampling_data base;
>> +};
>> +
>>   /* Per device PC Sampling data */
>>   struct kfd_dev_pc_sampling {
>>       struct mutex mutex;
>> +    struct kfd_dev_pcs_hosttrap hosttrap_entry;
>>   };
>>     struct kfd_node {

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 07/24] drm/amdkfd: check pcs_enrty valid
  2023-11-23 20:18     ` James Zhu
@ 2023-11-23 20:32       ` Felix Kuehling
  2023-11-23 22:06         ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-23 20:32 UTC (permalink / raw)
  To: James Zhu, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-23 15:18, James Zhu wrote:
>
> On 2023-11-22 17:15, Felix Kuehling wrote:
>>
>> On 2023-11-03 09:11, James Zhu wrote:
>>> Check pcs_enrty valid for pc sampling ioctl.
>>>
>>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 30 
>>> ++++++++++++++++++--
>>>   1 file changed, 27 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>> index 4c9fc48e1a6a..36366c8847de 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>> @@ -179,6 +179,21 @@ static int kfd_pc_sample_destroy(struct 
>>> kfd_process_device *pdd, uint32_t trace_
>>>   int kfd_pc_sample(struct kfd_process_device *pdd,
>>>                       struct kfd_ioctl_pc_sample_args __user *args)
>>>   {
>>> +    struct pc_sampling_entry *pcs_entry;
>>> +
>>> +    if (args->op != KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES &&
>>> +        args->op != KFD_IOCTL_PCS_OP_CREATE) {
>>> +
>>> +        mutex_lock(&pdd->dev->pcs_data.mutex);
>>> +        pcs_entry = 
>>> idr_find(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr,
>>> +                args->trace_id);
>>> +        mutex_unlock(&pdd->dev->pcs_data.mutex);
>>
>> You need to keep holding the lock while the pcs_entry is still used. 
>> That includes any of the kfd_pc_sample_<op> functions below. 
>> Otherwise someone could free it concurrently. It would also simplify 
>> the ..._<op> functions, if they didn't have to worry about the 
>> locking themselves.
> [JZ] pcs_entry is only for this pc sampling process, which has 
> kfd_process->mutex protected here.

OK. That's not obvious. I'm also wary about depending too much on the 
big process lock. We will need to make that locking more granular soon, 
because it is causing performance issues with multi-threaded processes.

Regards,
   Felix


>>
>> Regards,
>>   Felix
>>
>>
>>> +
>>> +        if (!pcs_entry ||
>>> +            pcs_entry->pdd != pdd)
>>> +            return -EINVAL;
>>> +    }
>>> +
>>>       switch (args->op) {
>>>       case KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES:
>>>           return kfd_pc_sample_query_cap(pdd, args);
>>> @@ -187,13 +202,22 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
>>>           return kfd_pc_sample_create(pdd, args);
>>>         case KFD_IOCTL_PCS_OP_DESTROY:
>>> -        return kfd_pc_sample_destroy(pdd, args->trace_id);
>>> +        if (pcs_entry->enabled)
>>> +            return -EBUSY;
>>> +        else
>>> +            return kfd_pc_sample_destroy(pdd, args->trace_id);
>>>         case KFD_IOCTL_PCS_OP_START:
>>> -        return kfd_pc_sample_start(pdd);
>>> +        if (pcs_entry->enabled)
>>> +            return -EALREADY;
>>> +        else
>>> +            return kfd_pc_sample_start(pdd);
>>>         case KFD_IOCTL_PCS_OP_STOP:
>>> -        return kfd_pc_sample_stop(pdd);
>>> +        if (!pcs_entry->enabled)
>>> +            return -EALREADY;
>>> +        else
>>> +            return kfd_pc_sample_stop(pdd);
>>>       }
>>>         return -EINVAL;

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 01/24] drm/amdkfd/kfd_ioctl: add pc sampling support
  2023-11-22 21:14   ` Felix Kuehling
@ 2023-11-23 20:33     ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-23 20:33 UTC (permalink / raw)
  To: Felix Kuehling, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-22 16:14, Felix Kuehling wrote:
> On 2023-11-03 09:11, James Zhu wrote:
>> From: David Yat Sin <david.yatsin@amd.com>
>>
>> Add pc sampling support in kfd_ioctl.
>>
>> Co-developed-by: James Zhu <James.Zhu@amd.com>
>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
>> ---
>>   include/uapi/linux/kfd_ioctl.h | 57 +++++++++++++++++++++++++++++++++-
>>   1 file changed, 56 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/uapi/linux/kfd_ioctl.h 
>> b/include/uapi/linux/kfd_ioctl.h
>> index f0ed68974c54..5202e29c9560 100644
>> --- a/include/uapi/linux/kfd_ioctl.h
>> +++ b/include/uapi/linux/kfd_ioctl.h
>> @@ -1446,6 +1446,58 @@ struct kfd_ioctl_dbg_trap_args {
>>       };
>>   };
>>   +/**
>> + * kfd_ioctl_pc_sample_op - PC Sampling ioctl operations
>> + *
>> + * @KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES: Query device PC Sampling 
>> capabilities
>> + * @KFD_IOCTL_PCS_OP_CREATE:             Register this process with 
>> a per-device PC sampler instance
>> + * @KFD_IOCTL_PCS_OP_DESTROY:            Unregister from a 
>> previously registered PC sampler instance
>> + * @KFD_IOCTL_PCS_OP_START:              Process begins taking 
>> samples from a previously registered PC sampler instance
>> + * @KFD_IOCTL_PCS_OP_STOP:               Process stops taking 
>> samples from a previously registered PC sampler instance
>> + */
>> +enum kfd_ioctl_pc_sample_op {
>> +    KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES,
>> +    KFD_IOCTL_PCS_OP_CREATE,
>> +    KFD_IOCTL_PCS_OP_DESTROY,
>> +    KFD_IOCTL_PCS_OP_START,
>> +    KFD_IOCTL_PCS_OP_STOP,
>> +};
>> +
>> +/* Values have to be a power of 2*/
>> +#define KFD_IOCTL_PCS_FLAG_POWER_OF_2 0x00000001
>> +
>> +enum kfd_ioctl_pc_sample_method {
>> +    KFD_IOCTL_PCS_METHOD_HOSTTRAP = 1,
>> +    KFD_IOCTL_PCS_METHOD_STOCHASTIC,
>> +};
>> +
>> +enum kfd_ioctl_pc_sample_type {
>> +    KFD_IOCTL_PCS_TYPE_TIME_US,
>> +    KFD_IOCTL_PCS_TYPE_CLOCK_CYCLES,
>> +    KFD_IOCTL_PCS_TYPE_INSTRUCTIONS
>> +};
>> +
>> +struct kfd_pc_sample_info {
>> +    __u64 value;         /* [IN] if PCS_TYPE_INTERVAL_US: sample 
>> interval in us
>> +                          * if PCS_TYPE_CLOCK_CYCLES: sample 
>> interval in graphics core clk cycles
>> +                          * if PCS_TYPE_INSTRUCTIONS: sample 
>> interval in instructions issued by
>> +                          * graphics compute units
>
> I'd call this "interval". That's still generic enough to be a sampling 
> interval in a unit that depends on the PCS type. "value" is 
> misleading, because it sounds like it may be an actual sample.
[JZ] I am fine with this interface name change.
>
>
>> +                          */
>> +    __u64 value_min;     /* [OUT] */
>> +    __u64 value_max;     /* [OUT] */
>
> interval_min/max.
>
> Regards,
>   Felix
>
>
>> +    __u64 flags;         /* [OUT] indicate potential restrictions 
>> e.g FLAG_POWER_OF_2 */
>> +    __u32 method;        /* [IN/OUT] kfd_ioctl_pc_sample_method */
>> +    __u32 type;          /* [IN/OUT] kfd_ioctl_pc_sample_type */
>> +};
>> +
>> +struct kfd_ioctl_pc_sample_args {
>> +    __u64 sample_info_ptr;   /* array of kfd_pc_sample_info */
>> +    __u32 num_sample_info;
>> +    __u32 op;                /* kfd_ioctl_pc_sample_op */
>> +    __u32 gpu_id;
>> +    __u32 trace_id;
>> +};
>> +
>>   #define AMDKFD_IOCTL_BASE 'K'
>>   #define AMDKFD_IO(nr)            _IO(AMDKFD_IOCTL_BASE, nr)
>>   #define AMDKFD_IOR(nr, type)        _IOR(AMDKFD_IOCTL_BASE, nr, type)
>> @@ -1566,7 +1618,10 @@ struct kfd_ioctl_dbg_trap_args {
>>   #define AMDKFD_IOC_DBG_TRAP            \
>>           AMDKFD_IOWR(0x26, struct kfd_ioctl_dbg_trap_args)
>>   +#define AMDKFD_IOC_PC_SAMPLE        \
>> +        AMDKFD_IOWR(0x27, struct kfd_ioctl_pc_sample_args)
>> +
>>   #define AMDKFD_COMMAND_START        0x01
>> -#define AMDKFD_COMMAND_END        0x27
>> +#define AMDKFD_COMMAND_END        0x28
>>     #endif

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 18/24] drm/amdkfd: enable pc sampling start
  2023-11-23 20:21       ` Felix Kuehling
@ 2023-11-23 22:00         ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-23 22:00 UTC (permalink / raw)
  To: Felix Kuehling, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-23 15:21, Felix Kuehling wrote:
>
> On 2023-11-23 15:01, James Zhu wrote:
>>
>> On 2023-11-22 17:27, Felix Kuehling wrote:
>>>
>>> On 2023-11-03 09:11, James Zhu wrote:
>>>> Enable pc sampling start.
>>>>
>>>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 26 
>>>> +++++++++++++++++---
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  2 ++
>>>>   2 files changed, 25 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>> index 60b29b245db5..33d003ca0093 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>> @@ -83,9 +83,29 @@ static int kfd_pc_sample_query_cap(struct 
>>>> kfd_process_device *pdd,
>>>>       return 0;
>>>>   }
>>>>   -static int kfd_pc_sample_start(struct kfd_process_device *pdd)
>>>> +static int kfd_pc_sample_start(struct kfd_process_device *pdd,
>>>> +                    struct pc_sampling_entry *pcs_entry)
>>>>   {
>>>> -    return -EINVAL;
>>>> +    bool pc_sampling_start = false;
>>>> +
>>>> +    pcs_entry->enabled = true;
>>>> +    mutex_lock(&pdd->dev->pcs_data.mutex);
>>>> +    if (!pdd->dev->pcs_data.hosttrap_entry.base.active_count)
>>>> +        pc_sampling_start = true;
>>>> + pdd->dev->pcs_data.hosttrap_entry.base.active_count++;
>>>> +    mutex_unlock(&pdd->dev->pcs_data.mutex);
>>>> +
>>>> +    while (pc_sampling_start) {
>>>> +        if 
>>>> (READ_ONCE(pdd->dev->pcs_data.hosttrap_entry.base.stop_enable)) {
>>>> +            usleep_range(1000, 2000);
>>>
>>> I don't understand why you need this synchronization through 
>>> stop_enable. Why can't you do both the start and stop while holding 
>>> the mutex? It's just setting a flag in the TMA, so it's not a 
>>> time-consuming operation, and I don't see any potential for deadlocks.
>> [JZ] Stop does more than just set the TMA flag: it needs to wait for the 
>> current pc sampling to stop completely and reset some initial settings.
>
> I think that's being obfuscated by how you split up this patch series. 
> Maybe if you squash the queue remapping patch into this one, it would 
> be more obvious what's really happening when you stop sampling and 
> would make it easier to review the synchronization and locking strategy.
[JZ] Sure
>
> Regards,
>   Felix
>
>
>>>
>>> Regards,
>>>   Felix
>>>
>>>
>>>> +        } else {
>>>> + kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>>>> + pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, true);
>>>> +            break;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return 0;
>>>>   }
>>>>     static int kfd_pc_sample_stop(struct kfd_process_device *pdd)
>>>> @@ -225,7 +245,7 @@ int kfd_pc_sample(struct kfd_process_device *pdd,
>>>>           if (pcs_entry->enabled)
>>>>               return -EALREADY;
>>>>           else
>>>> -            return kfd_pc_sample_start(pdd);
>>>> +            return kfd_pc_sample_start(pdd, pcs_entry);
>>>>         case KFD_IOCTL_PCS_OP_STOP:
>>>>           if (!pcs_entry->enabled)
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> index 6670534f47b8..613910e0d440 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> @@ -258,6 +258,8 @@ struct kfd_dev;
>>>>     struct kfd_dev_pc_sampling_data {
>>>>       uint32_t use_count;         /* Num of PC sampling sessions */
>>>> +    uint32_t active_count;      /* Num of active sessions */
>>>> +    bool stop_enable;           /* pc sampling stop in process */
>>>>       struct idr pc_sampling_idr;
>>>>       struct kfd_pc_sample_info pc_sample_info;
>>>>   };

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 07/24] drm/amdkfd: check pcs_enrty valid
  2023-11-23 20:32       ` Felix Kuehling
@ 2023-11-23 22:06         ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-23 22:06 UTC (permalink / raw)
  To: Felix Kuehling, James Zhu, amd-gfx; +Cc: joseph.greathouse


On 2023-11-23 15:32, Felix Kuehling wrote:
>
> On 2023-11-23 15:18, James Zhu wrote:
>>
>> On 2023-11-22 17:15, Felix Kuehling wrote:
>>>
>>> On 2023-11-03 09:11, James Zhu wrote:
>>>> Check pcs_enrty valid for pc sampling ioctl.
>>>>
>>>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 30 
>>>> ++++++++++++++++++--
>>>>   1 file changed, 27 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>> index 4c9fc48e1a6a..36366c8847de 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>> @@ -179,6 +179,21 @@ static int kfd_pc_sample_destroy(struct 
>>>> kfd_process_device *pdd, uint32_t trace_
>>>>   int kfd_pc_sample(struct kfd_process_device *pdd,
>>>>                       struct kfd_ioctl_pc_sample_args __user *args)
>>>>   {
>>>> +    struct pc_sampling_entry *pcs_entry;
>>>> +
>>>> +    if (args->op != KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES &&
>>>> +        args->op != KFD_IOCTL_PCS_OP_CREATE) {
>>>> +
>>>> +        mutex_lock(&pdd->dev->pcs_data.mutex);
>>>> +        pcs_entry = 
>>>> idr_find(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr,
>>>> +                args->trace_id);
>>>> +        mutex_unlock(&pdd->dev->pcs_data.mutex);
>>>
>>> You need to keep holding the lock while the pcs_entry is still used. 
>>> That includes any of the kfd_pc_sample_<op> functions below. 
>>> Otherwise someone could free it concurrently. It would also simplify 
>>> the ..._<op> functions, if they didn't have to worry about the 
>>> locking themselves.
>> [JZ] pcs_entry is only for this pc sampling process, which has 
>> kfd_process->mutex protected here.
>
> OK. That's not obvious. I'm also wary about depending too much on the 
> big process lock. We will need to make that locking more granular 
> soon, because it is causing performance issues with multi-threaded 
> processes.
[JZ] Let me add some comments on pcs_entry.
>
> Regards,
>   Felix
>
>
>>>
>>> Regards,
>>>   Felix
>>>
>>>
>>>> +
>>>> +        if (!pcs_entry ||
>>>> +            pcs_entry->pdd != pdd)
>>>> +            return -EINVAL;
>>>> +    }
>>>> +
>>>>       switch (args->op) {
>>>>       case KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES:
>>>>           return kfd_pc_sample_query_cap(pdd, args);
>>>> @@ -187,13 +202,22 @@ int kfd_pc_sample(struct kfd_process_device 
>>>> *pdd,
>>>>           return kfd_pc_sample_create(pdd, args);
>>>>         case KFD_IOCTL_PCS_OP_DESTROY:
>>>> -        return kfd_pc_sample_destroy(pdd, args->trace_id);
>>>> +        if (pcs_entry->enabled)
>>>> +            return -EBUSY;
>>>> +        else
>>>> +            return kfd_pc_sample_destroy(pdd, args->trace_id);
>>>>         case KFD_IOCTL_PCS_OP_START:
>>>> -        return kfd_pc_sample_start(pdd);
>>>> +        if (pcs_entry->enabled)
>>>> +            return -EALREADY;
>>>> +        else
>>>> +            return kfd_pc_sample_start(pdd);
>>>>         case KFD_IOCTL_PCS_OP_STOP:
>>>> -        return kfd_pc_sample_stop(pdd);
>>>> +        if (!pcs_entry->enabled)
>>>> +            return -EALREADY;
>>>> +        else
>>>> +            return kfd_pc_sample_stop(pdd);
>>>>       }
>>>>         return -EINVAL;

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 06/24] drm/amdkfd: add trace_id return
  2023-11-22 22:21   ` Felix Kuehling
@ 2023-11-23 22:14     ` Zhu, James
  0 siblings, 0 replies; 80+ messages in thread
From: Zhu, James @ 2023-11-23 22:14 UTC (permalink / raw)
  To: Kuehling, Felix, Zhu, James, amd-gfx, Yat Sin, David; +Cc: Greathouse, Joseph

[-- Attachment #1: Type: text/plain, Size: 4619 bytes --]

[AMD Official Use Only - General]



On 2023-11-22 17:21, Felix Kuehling wrote:

On 2023-11-03 09:11, James Zhu wrote:
Add trace_id return for new pc sampling creation per device,
Use IDR to quickly locate pc_sampling_entry for reference.

Signed-off-by: James Zhu <James.Zhu@amd.com><mailto:James.Zhu@amd.com>
---
  drivers/gpu/drm/amd/amdkfd/kfd_device.c      |  2 ++
  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 20 +++++++++++++++++++-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  6 ++++++
  3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0e24e011f66b..bcaeedac8fe0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -536,10 +536,12 @@ static void kfd_smi_init(struct kfd_node *dev)
  static void kfd_pc_sampling_init(struct kfd_node *dev)
  {
      mutex_init(&dev->pcs_data.mutex);
+    idr_init_base(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr, 1);
  }
    static void kfd_pc_sampling_exit(struct kfd_node *dev)
  {
+    idr_destroy(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr);
      mutex_destroy(&dev->pcs_data.mutex);
  }
  diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index f0d910ee730c..4c9fc48e1a6a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -99,6 +99,7 @@ static int kfd_pc_sample_create(struct kfd_process_device *pdd,
  {
      struct kfd_pc_sample_info *supported_format = NULL;
      struct kfd_pc_sample_info user_info;
+    struct pc_sampling_entry *pcs_entry;
      int ret;
      int i;
  @@ -140,7 +141,19 @@ static int kfd_pc_sample_create(struct kfd_process_device *pdd,
          return ret ? ret : -EEXIST;
      }
  -    /* TODO: add trace_id return */
+    pcs_entry = kvzalloc(sizeof(*pcs_entry), GFP_KERNEL);
+    if (!pcs_entry) {
+        mutex_unlock(&pdd->dev->pcs_data.mutex);
+        return -ENOMEM;
+    }
+
+    i = idr_alloc_cyclic(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_idr,
+                pcs_entry, 1, 0, GFP_KERNEL);
+    if (i < 0) {
+        mutex_unlock(&pdd->dev->pcs_data.mutex);
+        kvfree(pcs_entry);
+        return i;
+    }
        if (!pdd->dev->pcs_data.hosttrap_entry.base.use_count)
          memcpy(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info,
@@ -149,6 +162,11 @@ static int kfd_pc_sample_create(struct kfd_process_device *pdd,
      pdd->dev->pcs_data.hosttrap_entry.base.use_count++;
      mutex_unlock(&pdd->dev->pcs_data.mutex);
  +    pcs_entry->pdd = pdd;

One more thought: You have a bunch of pcs_entries pointing to pdd. What mechanism guarantees that the pcs_entries are destroyed before the pdd on process termination? I think kfd_pc_sampling_exit doesn't run during process termination, because it deals with per-device data structures, not per-process data structures.

Should the IDR be in the PDD rather than the device? In that case you wouldn't even need the pdd pointer in struct pcs_entry.
[JZ] The IDR here is mainly for generating a trace_id for this device. I am not sure whether ROCr/ROCprofiler are fine with this change, which would mean the same process has the same trace_id value on different nodes. @Yat Sin, David<mailto:David.YatSin@amd.com>, would you mind giving your comments here?

I see you have a patch much later in the series "drm/amdkfd: add pc sampling release when process release". I'd prefer this squashed here and in the patches that add the start/stop functions.

Regards,
  Felix


+    user_args->trace_id = (uint32_t)i;
+
+    pr_debug("alloc pcs_entry = %p, trace_id = 0x%x on gpu 0x%x", pcs_entry, i, pdd->dev->id);
+
      return 0;
  }
  diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 81c925fb2952..642558026d16 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -258,6 +258,7 @@ struct kfd_dev;
    struct kfd_dev_pc_sampling_data {
      uint32_t use_count;         /* Num of PC sampling sessions */
+    struct idr pc_sampling_idr;
      struct kfd_pc_sample_info pc_sample_info;
  };
  @@ -743,6 +744,11 @@ enum kfd_pdd_bound {
   */
  #define SDMA_ACTIVITY_DIVISOR  100
  +struct pc_sampling_entry {
+    bool enabled;
+    struct kfd_process_device *pdd;
+};
+
  /* Data that is per-process-per device. */
  struct kfd_process_device {
      /* The device that owns this data. */

[-- Attachment #2: Type: text/html, Size: 7257 bytes --]

^ permalink raw reply related	[flat|nested] 80+ messages in thread

* RE: [PATCH 21/24] drm/amdkfd: add queue remapping
  2023-11-23 19:49         ` James Zhu
@ 2023-11-23 22:41           ` Greathouse, Joseph
  2023-11-23 23:01             ` Felix Kuehling
  0 siblings, 1 reply; 80+ messages in thread
From: Greathouse, Joseph @ 2023-11-23 22:41 UTC (permalink / raw)
  To: Zhu, James, Kuehling, Felix, amd-gfx

[Public]

> -----Original Message-----
> From: Zhu, James <James.Zhu@amd.com>
> Sent: Thursday, November 23, 2023 1:49 PM
>
> On 2023-11-23 14:02, Felix Kuehling wrote:
> > On 2023-11-23 11:25, James Zhu wrote:
> >>
> >> On 2023-11-22 17:35, Felix Kuehling wrote:
> >>>
> >>> On 2023-11-03 09:11, James Zhu wrote:
> >>>> Add queue remapping to force the waves in any running
> >>>> processes to complete a CWSR trap.
> >>>
> >>> Please add an explanation why this is needed.
> >>
> >> [JZ] Even though the profiling-enabled bit is turned off, the CWSR
> >> trap handlers for some kernels of this process may still be in the
> >> running stage. This will
> >>
> >> force the waves in any running processes to complete a CWSR trap, and
> >> make sure pc sampling is completely stopped for this process.   I
> >> will add it later.
> >
> > It may be confusing to talk specifically about "CWSR trap handler".
> > There is only one trap handler that is triggered by different events:
> > CWSR, host trap, s_trap instructions, exceptions, etc. When a new trap
> > triggers, it serializes with any currently running trap handler in
> > that wavefront. So it seems that you're using CWSR as a way to ensure
> > that any host trap has completed: CWSR will wait for previous traps to
> > finish before trapping again for CWSR, the HWS firmware waits for CWSR
> > completion and the driver waits for HWS to finish CWSR with a fence on
> > a HIQ QUERY_STATUS packet. Is that correct?
> [JZ] I think your explanation is more detailed. Need Joseph to confirm.

Felix, your summary is correct. The reason we are trying to perform a queue unmap/map cycle as part of the PC sampling stop is to prevent the following:

1. A PC sampling request arrives to Wave X, sending it to 1st-level trap handler
2. User thread asks KFD to stop sampling for this process, which leads to kfd_pc_sample_stop()
3. kfd_pc_sample_stop() decrements the sampling refcount. If this is the last process to stop sampling, it stops any further sampling traps from being generated
4. kfd_pc_sample_stop() sets this process's TMA flag to false so waves in the 1st-level trap handler know sampling is disabled
    4.1. Wave X may still be in the 1st-level handler and may not yet have checked the TMA flag. If so, it will exit the 1st-level handler when it sees the flag is false
    4.2. Wave X may have already passed the 1st-level TMA flag check and entered the 2nd-level trap handler to do the PC sample
5. kfd_pc_sample_stop() returns, eventually causing ioctl to return, back to user-space
6. Because the stop ioctl has returned, user-land deallocates the user-space buffer that the 2nd-level trap handler uses to output sample data
7. Wave X that was in the 2nd-level handler tries to finish its sample output and writes to the now-freed location, causing a use-after-free

Note that Step 3 does not always stop further traps from arriving -- if another process still wants to do sampling, the driver or HW might still send traps to every wave on the device after Step 3.
As such, to avoid going into the 2nd-level handler for non-sampled processes, all 1st-level handlers must check their TMA flag to see if they should allow the sample to flow to the 2nd-level handler.

By removing the queue from the HW after Step 4, we can be sure that any existing waves from this process that entered the PC sampling 2nd-level handler before Step 4 are done.
Any waves that were still in the 1st-level handler at Step 4.1 will be filtered by the TMA flag being set to false. CWSR will wait until they exit.
Any waves that were already in the 2nd-level handler (4.2) must complete before the CWSR save will complete and allow this queue removal request to complete.
Any waves that enter the 1st-level trap handler after Step 4 won't go into the PC sampling logic in the 2nd-level handler because the TMA flag is set to false. CWSR will wait until they exit.

When we then put the queue back on the hardware, any further traps that might show up (e.g. because another process is sampling) will get filtered by the TMA flag.

So once the queue removal (and thus CWSR save cycle) has completed, we can be sure that no other traps to this process will try to use its PC sample data buffer, so it's safe to return to user-space and let them potentially free that buffer.

I don't know how to summarize this nicely in a comment, but hopefully y'all can figure that out. :)

Thanks,
-Joe

> >
> > Regards,
> >   Felix
> >
> >
> >>
> >>>
> >>>
> >>>>
> >>>> Signed-off-by: James Zhu <James.Zhu@amd.com>
> >>>> ---
> >>>>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 11
> >>>> +++++++++++
> >>>>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  5 +++++
> >>>>   drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c          |  3 +++
> >>>>   3 files changed, 19 insertions(+)
> >>>>
> >>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>>> index c0e71543389a..a3f57be63f4f 100644
> >>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>>> @@ -3155,6 +3155,17 @@ int debug_refresh_runlist(struct
> >>>> device_queue_manager *dqm)
> >>>>       return debug_map_and_unlock(dqm);
> >>>>   }
> >>>>   +void remap_queue(struct device_queue_manager *dqm,
> >>>> +                enum kfd_unmap_queues_filter filter,
> >>>> +                uint32_t filter_param,
> >>>> +                uint32_t grace_period)
> >>>
> >>> Not sure if you need the filter and grace period parameters in this
> >>> function. What's the point of exposing that to callers who just want
> >>> to trigger a CWSR? You could also change the function name to
> >>> reflect the purpose of the function, rather than the implementation.
> >> [JZ] Just want to create a general function in case that used by
> >> others. I am fine to remove passing filter_param/grace_period
> >>>
> >>> Regards,
> >>>   Felix
> >>>
> >>>
> >>>> +{
> >>>> +    dqm_lock(dqm);
> >>>> +    if (!dqm->dev->kfd->shared_resources.enable_mes)
> >>>> +        execute_queues_cpsch(dqm, filter, filter_param,
> >>>> grace_period);
> >>>> +    dqm_unlock(dqm);
> >>>> +}
> >>>> +
> >>>>   #if defined(CONFIG_DEBUG_FS)
> >>>>     static void seq_reg_dump(struct seq_file *m,
> >>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> >>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> >>>> index cf7e182588f8..f8aae3747a36 100644
> >>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> >>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> >>>> @@ -303,6 +303,11 @@ int debug_lock_and_unmap(struct
> >>>> device_queue_manager *dqm);
> >>>>   int debug_map_and_unlock(struct device_queue_manager *dqm);
> >>>>   int debug_refresh_runlist(struct device_queue_manager *dqm);
> >>>>   +void remap_queue(struct device_queue_manager *dqm,
> >>>> +                enum kfd_unmap_queues_filter filter,
> >>>> +                uint32_t filter_param,
> >>>> +                uint32_t grace_period);
> >>>> +
> >>>>   static inline unsigned int get_sh_mem_bases_32(struct
> >>>> kfd_process_device *pdd)
> >>>>   {
> >>>>       return (pdd->lds_base >> 16) & 0xFF;
> >>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> >>>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> >>>> index e8f0559b618e..66670cdb813a 100644
> >>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> >>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
> >>>> @@ -24,6 +24,7 @@
> >>>>   #include "kfd_priv.h"
> >>>>   #include "amdgpu_amdkfd.h"
> >>>>   #include "kfd_pc_sampling.h"
> >>>> +#include "kfd_device_queue_manager.h"
> >>>>     struct supported_pc_sample_info {
> >>>>       uint32_t ip_version;
> >>>> @@ -164,6 +165,8 @@ static int kfd_pc_sample_stop(struct
> >>>> kfd_process_device *pdd,
> >>>> cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
> >>>>
> >>>> kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
> >>>> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
> >>>> +        remap_queue(pdd->dev->dqm,
> >>>> +            KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
> >>>> USE_DEFAULT_GRACE_PERIOD);
> >>>>             mutex_lock(&pdd->dev->pcs_data.mutex);
> >>>> pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 21/24] drm/amdkfd: add queue remapping
  2023-11-23 22:41           ` Greathouse, Joseph
@ 2023-11-23 23:01             ` Felix Kuehling
  2023-11-23 23:16               ` James Zhu
  0 siblings, 1 reply; 80+ messages in thread
From: Felix Kuehling @ 2023-11-23 23:01 UTC (permalink / raw)
  To: Greathouse, Joseph, Zhu, James, amd-gfx

On 2023-11-23 17:41, Greathouse, Joseph wrote:
> [Public]
>
>> -----Original Message-----
>> From: Zhu, James <James.Zhu@amd.com>
>> Sent: Thursday, November 23, 2023 1:49 PM
>>
>> On 2023-11-23 14:02, Felix Kuehling wrote:
>>> On 2023-11-23 11:25, James Zhu wrote:
>>>> On 2023-11-22 17:35, Felix Kuehling wrote:
>>>>> On 2023-11-03 09:11, James Zhu wrote:
>>>>>> Add queue remapping to force the waves in any running
>>>>>> processes to complete a CWSR trap.
>>>>> Please add an explanation why this is needed.
>>>> [JZ] Even though the profiling-enabled bits is turned off, the CWSR
>>>> trap handlers for some kernels with this process may still in running
>>>> stage, this will
>>>>
>>>> force the waves in any running processes to complete a CWSR trap, and
>>>> make sure pc sampling is completely stopped with this process.   I
>>>> will add it later.
>>> It may be confusing to talk specifically about "CWSR trap handler".
>>> There is only one trap handler that is triggered by different events:
>>> CWSR, host trap, s_trap instructions, exceptions, etc. When a new trap
>>> triggers, it serializes with any currently running trap handler in
>>> that wavefront. So it seems that you're using CWSR as a way to ensure
>>> that any host trap has completed: CWSR will wait for previous traps to
>>> finish before trapping again for CWSR, the HWS firmware waits for CWSR
>>> completion and the driver waits for HWS to finish CWSR with a fence on
>>> a HIQ QUERY_STATUS packet. Is that correct?
>> [JZ] I think your explanation is more detail. Need Joseph to confirm.
> Felix, your summary is correct. The reason we are trying to perform a queue unmap/map cycle as part of the PC sampling stop is to prevent the following:
>
> 1. A PC sampling request arrives to Wave X, sending it to 1st-level trap handler
> 2. User thread asks KFD to stop sampling for this process, which leads to kfd_pc_sample_stop()
> 3. kfd_pc_sample_stop() decrements the sampling refcount. If this is the last process to stop sampling, it stops any further sampling traps from being generated
> 4. kfd_pc_sample_stop() sets this process's TMA flag to false so waves in the 1st-level trap handler know sampling is disabled
>      4.1. Wave X may be in 1st-level handler and not yet checked the TMA flag. If so, it will exit the 1st-level handler when it sees flag is false
>      4.2. Wave X may have already passed the 1st-level TMA flag check and entered the 2nd-level trap handler to do the PC sample
> 5. kfd_pc_sample_stop() returns, eventually causing ioctl to return, back to user-space
> 6. Because the stop ioctl has returned, user-land deallocates user-space buffer the 2nd level trap handler uses to output sample data
> 7. Wave X that was in the 2nd-level handler tries to finish its sample output and writes to the now-freed location, causing a use-after-free
>
> Note that Step 3 does not always stop further traps from arriving -- if another process still wants to do sampling, the driver or HW might still send traps to every wave on the device after Step 3.
> As such, to avoid going into the 2nd-level handler for non-sampled processes, all 1st-level handlers must check their TMA flag to see if they should allow the sample to flow to the 2nd-level handler.
>
> By removing the queue from the HW after Step 4, we can be sure that any existing waves from this process that entered the PC sampling 2nd-level handler before Step 4 are done.
> Any waves that were still in the 1st-level handler at Step 4.1 will be filtered by the TMA flag being set to false. CWSR will wait until they exit.
> Any waves that were already in the 2nd-level handler (4.2) must complete before the CWSR save will complete and allow this queue removal request to complete.
> Any waves that enter the 1st-level trap handler after Step 4 won't go into the PC sampling logic in the 2nd-level handler because the TMA flag is set to false. CWSR will wait until they exit.
>
> When we then put the queue back on the hardware, any further traps that might show up (e.g. because another process is sampling) will get filtered by the TMA flag.
>
> So once the queue removal (and thus CWSR save cycle) has completed, we can be sure that no other traps to this process will try to use its PC sample data buffer, so it's safe to return to user-space and let them potentially free that buffer.
>
> I don't know how to summarize this nicely in a comment, but hopefully y'all can figure that out. :)

My best summary: We need to ensure that any waves executing the PC 
sampling part of the trap handler are done before kfd_pc_sample_stop 
returns, and that no new waves enter that part of the trap handler 
afterwards. This avoids race conditions that could lead to 
use-after-free. Unmapping and remapping the queues either waits for the 
waves to drain, or preempts them with CWSR, which itself executes a trap 
and waits for previous traps to finish.

Regards,
   Felix


>
> Thanks,
> -Joe
>
>>> Regards,
>>>    Felix
>>>
>>>
>>>>>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>>>>>> ---
>>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 11
>>>>>> +++++++++++
>>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  5 +++++
>>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c          |  3 +++
>>>>>>    3 files changed, 19 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>> index c0e71543389a..a3f57be63f4f 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>> @@ -3155,6 +3155,17 @@ int debug_refresh_runlist(struct
>>>>>> device_queue_manager *dqm)
>>>>>>        return debug_map_and_unlock(dqm);
>>>>>>    }
>>>>>>    +void remap_queue(struct device_queue_manager *dqm,
>>>>>> +                enum kfd_unmap_queues_filter filter,
>>>>>> +                uint32_t filter_param,
>>>>>> +                uint32_t grace_period)
>>>>> Not sure if you need the filter and grace period parameters in this
>>>>> function. What's the point of exposing that to callers who just want
>>>>> to trigger a CWSR? You could also change the function name to
>>>>> reflect the purpose of the function, rather than the implementation.
>>>> [JZ] Just want to create a general function in case that used by
>>>> others. I am fine to remove passing filter_param/grace_period
>>>>> Regards,
>>>>>    Felix
>>>>>
>>>>>
>>>>>> +{
>>>>>> +    dqm_lock(dqm);
>>>>>> +    if (!dqm->dev->kfd->shared_resources.enable_mes)
>>>>>> +        execute_queues_cpsch(dqm, filter, filter_param,
>>>>>> grace_period);
>>>>>> +    dqm_unlock(dqm);
>>>>>> +}
>>>>>> +
>>>>>>    #if defined(CONFIG_DEBUG_FS)
>>>>>>      static void seq_reg_dump(struct seq_file *m,
>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>>> index cf7e182588f8..f8aae3747a36 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>>> @@ -303,6 +303,11 @@ int debug_lock_and_unmap(struct
>>>>>> device_queue_manager *dqm);
>>>>>>    int debug_map_and_unlock(struct device_queue_manager *dqm);
>>>>>>    int debug_refresh_runlist(struct device_queue_manager *dqm);
>>>>>>    +void remap_queue(struct device_queue_manager *dqm,
>>>>>> +                enum kfd_unmap_queues_filter filter,
>>>>>> +                uint32_t filter_param,
>>>>>> +                uint32_t grace_period);
>>>>>> +
>>>>>>    static inline unsigned int get_sh_mem_bases_32(struct
>>>>>> kfd_process_device *pdd)
>>>>>>    {
>>>>>>        return (pdd->lds_base >> 16) & 0xFF;
>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>>>> index e8f0559b618e..66670cdb813a 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>>>> @@ -24,6 +24,7 @@
>>>>>>    #include "kfd_priv.h"
>>>>>>    #include "amdgpu_amdkfd.h"
>>>>>>    #include "kfd_pc_sampling.h"
>>>>>> +#include "kfd_device_queue_manager.h"
>>>>>>      struct supported_pc_sample_info {
>>>>>>        uint32_t ip_version;
>>>>>> @@ -164,6 +165,8 @@ static int kfd_pc_sample_stop(struct
>>>>>> kfd_process_device *pdd,
>>>>>> cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
>>>>>>
>>>>>> kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>>>>>> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
>>>>>> +        remap_queue(pdd->dev->dqm,
>>>>>> +            KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
>>>>>> USE_DEFAULT_GRACE_PERIOD);
>>>>>>              mutex_lock(&pdd->dev->pcs_data.mutex);
>>>>>> pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 21/24] drm/amdkfd: add queue remapping
  2023-11-23 23:01             ` Felix Kuehling
@ 2023-11-23 23:16               ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-23 23:16 UTC (permalink / raw)
  To: Felix Kuehling, Greathouse, Joseph, Zhu, James, amd-gfx


On 2023-11-23 18:01, Felix Kuehling wrote:
> On 2023-11-23 17:41, Greathouse, Joseph wrote:
>> [Public]
>>
>>> -----Original Message-----
>>> From: Zhu, James <James.Zhu@amd.com>
>>> Sent: Thursday, November 23, 2023 1:49 PM
>>>
>>> On 2023-11-23 14:02, Felix Kuehling wrote:
>>>> On 2023-11-23 11:25, James Zhu wrote:
>>>>> On 2023-11-22 17:35, Felix Kuehling wrote:
>>>>>> On 2023-11-03 09:11, James Zhu wrote:
>>>>>>> Add queue remapping to force the waves in any running
>>>>>>> processes to complete a CWSR trap.
>>>>>> Please add an explanation why this is needed.
>>>>> [JZ] Even though the profiling-enabled bits is turned off, the CWSR
>>>>> trap handlers for some kernels with this process may still in running
>>>>> stage, this will
>>>>>
>>>>> force the waves in any running processes to complete a CWSR trap, and
>>>>> make sure pc sampling is completely stopped with this process.   I
>>>>> will add it later.
>>>> It may be confusing to talk specifically about "CWSR trap handler".
>>>> There is only one trap handler that is triggered by different events:
>>>> CWSR, host trap, s_trap instructions, exceptions, etc. When a new trap
>>>> triggers, it serializes with any currently running trap handler in
>>>> that wavefront. So it seems that you're using CWSR as a way to ensure
>>>> that any host trap has completed: CWSR will wait for previous traps to
>>>> finish before trapping again for CWSR, the HWS firmware waits for CWSR
>>>> completion and the driver waits for HWS to finish CWSR with a fence on
>>>> a HIQ QUERY_STATUS packet. Is that correct?
>>> [JZ] I think your explanation is more detail. Need Joseph to confirm.
>> Felix, your summary is correct. The reason we are trying to perform a 
>> queue unmap/map cycle as part of the PC sampling stop is to prevent 
>> the following:
>>
>> 1. A PC sampling request arrives to Wave X, sending it to 1st-level 
>> trap handler
>> 2. User thread asks KFD to stop sampling for this process, which 
>> leads to kfd_pc_sample_stop()
>> 3. kfd_pc_sample_stop() decrements the sampling refcount. If this is 
>> the last process to stop sampling, it stops any further sampling 
>> traps from being generated
>> 4. kfd_pc_sample_stop() sets this process's TMA flag to false so 
>> waves in the 1st-level trap handler know sampling is disabled
>>      4.1. Wave X may be in 1st-level handler and not yet checked the 
>> TMA flag. If so, it will exit the 1st-level handler when it sees flag 
>> is false
>>      4.2. Wave X may have already passed the 1st-level TMA flag check 
>> and entered the 2nd-level trap handler to do the PC sample
>> 5. kfd_pc_sample_stop() returns, eventually causing ioctl to return, 
>> back to user-space
>> 6. Because the stop ioctl has returned, user-land deallocates 
>> user-space buffer the 2nd level trap handler uses to output sample data
>> 7. Wave X that was in the 2nd-level handler tries to finish its 
>> sample output and writes to the now-freed location, causing a 
>> use-after-free
>>
>> Note that Step 3 does not always stop further traps from arriving -- 
>> if another process still wants to do sampling, the driver or HW might 
>> still send traps to every wave on the device after Step 3.
>> As such, to avoid going into the 2nd-level handler for non-sampled 
>> processes, all 1st-level handlers must check their TMA flag to see if 
>> they should allow the sample to flow to the 2nd-level handler.
>>
>> By removing the queue from the HW after Step 4, we can be sure that 
>> any existing waves from this process that entered the PC sampling 
>> 2nd-level handler before Step 4 are done.
>> Any waves that were still in the 1st-level handler at Step 4.1 will 
>> be filtered by the TMA flag being set to false. CWSR will wait until 
>> they exit.
>> Any waves that were already in the 2nd-level handler (4.2) must 
>> complete before the CWSR save will complete and allow this queue 
>> removal request to complete.
>> Any waves that enter the 1st-level trap handler after Step 4 won't go 
>> into the PC sampling logic in the 2nd-level handler because the TMA 
>> flag is set to false. CWSR will wait until they exit.
>>
>> When we then put the queue back on the hardware, any further traps 
>> that might show up (e.g. because another process is sampling) will 
>> get filtered by the TMA flag.
>>
>> So once the queue removal (and thus CWSR save cycle) has completed, 
>> we can be sure that no other traps to this process will try to use 
>> its PC sample data buffer, so it's safe to return to user-space and 
>> let them potentially free that buffer.
>>
>> I don't know how to summarize this nicely in a comment, but hopefully 
>> y'all can figure that out. :)
>
> My best summary: We need to ensure that any waves executing the PC 
> sampling part of the trap handler are done before kfd_pc_sample_stop 
> returns, and that no new waves enter that part of the trap handler 
> afterwards. This avoids race conditions that could lead to 
> use-after-free. Unmapping and remapping the queues either waits for 
> the waves to drain, or preempts them with CWSR, which itself executes 
> a trap and waits for previous traps to finish.

> [JZ]  Thanks all!

> Regards,
>   Felix
>
>
>>
>> Thanks,
>> -Joe
>>
>>>> Regards,
>>>>    Felix
>>>>
>>>>
>>>>>>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>>>>>>> ---
>>>>>>> drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 11
>>>>>>> +++++++++++
>>>>>>> drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  5 +++++
>>>>>>> drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c |  3 +++
>>>>>>>    3 files changed, 19 insertions(+)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>>> index c0e71543389a..a3f57be63f4f 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>>> @@ -3155,6 +3155,17 @@ int debug_refresh_runlist(struct
>>>>>>> device_queue_manager *dqm)
>>>>>>>        return debug_map_and_unlock(dqm);
>>>>>>>    }
>>>>>>>    +void remap_queue(struct device_queue_manager *dqm,
>>>>>>> +                enum kfd_unmap_queues_filter filter,
>>>>>>> +                uint32_t filter_param,
>>>>>>> +                uint32_t grace_period)
>>>>>> Not sure if you need the filter and grace period parameters in this
>>>>>> function. What's the point of exposing that to callers who just want
>>>>>> to trigger a CWSR? You could also change the function name to
>>>>>> reflect the purpose of the function, rather than the implementation.
>>>>> [JZ] Just want to create a general function in case that used by
>>>>> others. I am fine to remove passing filter_param/grace_period
>>>>>> Regards,
>>>>>>    Felix
>>>>>>
>>>>>>
>>>>>>> +{
>>>>>>> +    dqm_lock(dqm);
>>>>>>> +    if (!dqm->dev->kfd->shared_resources.enable_mes)
>>>>>>> +        execute_queues_cpsch(dqm, filter, filter_param,
>>>>>>> grace_period);
>>>>>>> +    dqm_unlock(dqm);
>>>>>>> +}
>>>>>>> +
>>>>>>>    #if defined(CONFIG_DEBUG_FS)
>>>>>>>      static void seq_reg_dump(struct seq_file *m,
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>>>> index cf7e182588f8..f8aae3747a36 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>>>> @@ -303,6 +303,11 @@ int debug_lock_and_unmap(struct
>>>>>>> device_queue_manager *dqm);
>>>>>>>    int debug_map_and_unlock(struct device_queue_manager *dqm);
>>>>>>>    int debug_refresh_runlist(struct device_queue_manager *dqm);
>>>>>>>    +void remap_queue(struct device_queue_manager *dqm,
>>>>>>> +                enum kfd_unmap_queues_filter filter,
>>>>>>> +                uint32_t filter_param,
>>>>>>> +                uint32_t grace_period);
>>>>>>> +
>>>>>>>    static inline unsigned int get_sh_mem_bases_32(struct
>>>>>>> kfd_process_device *pdd)
>>>>>>>    {
>>>>>>>        return (pdd->lds_base >> 16) & 0xFF;
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>>>>> index e8f0559b618e..66670cdb813a 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
>>>>>>> @@ -24,6 +24,7 @@
>>>>>>>    #include "kfd_priv.h"
>>>>>>>    #include "amdgpu_amdkfd.h"
>>>>>>>    #include "kfd_pc_sampling.h"
>>>>>>> +#include "kfd_device_queue_manager.h"
>>>>>>>      struct supported_pc_sample_info {
>>>>>>>        uint32_t ip_version;
>>>>>>> @@ -164,6 +165,8 @@ static int kfd_pc_sample_stop(struct
>>>>>>> kfd_process_device *pdd,
>>>>>>> cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work); 
>>>>>>>
>>>>>>>
>>>>>>> kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
>>>>>>> pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, 
>>>>>>> false);
>>>>>>> +        remap_queue(pdd->dev->dqm,
>>>>>>> +            KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
>>>>>>> USE_DEFAULT_GRACE_PERIOD);
>>>>>>> mutex_lock(&pdd->dev->pcs_data.mutex);
>>>>>>> pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 01/24] drm/amdkfd/kfd_ioctl: add pc sampling support
  2023-11-03 13:11 ` [PATCH 01/24] drm/amdkfd/kfd_ioctl: add pc sampling support James Zhu
  2023-11-22 21:14   ` Felix Kuehling
@ 2023-11-27 19:11   ` Alex Deucher
  2023-11-27 19:14     ` James Zhu
  1 sibling, 1 reply; 80+ messages in thread
From: Alex Deucher @ 2023-11-27 19:11 UTC (permalink / raw)
  To: James Zhu; +Cc: Felix.kuehling, jamesz, joseph.greathouse, amd-gfx

On Fri, Nov 3, 2023 at 9:22 AM James Zhu <James.Zhu@amd.com> wrote:
>
> From: David Yat Sin <david.yatsin@amd.com>
>
> Add pc sampling support in kfd_ioctl.
>
> Co-developed-by: James Zhu <James.Zhu@amd.com>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> Signed-off-by: David Yat Sin <david.yatsin@amd.com>

For any new IOCTL interfaces, please provide a link to the user mode
code branch which uses it in the patch description.

Thanks,

Alex

> ---
>  include/uapi/linux/kfd_ioctl.h | 57 +++++++++++++++++++++++++++++++++-
>  1 file changed, 56 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index f0ed68974c54..5202e29c9560 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -1446,6 +1446,58 @@ struct kfd_ioctl_dbg_trap_args {
>         };
>  };
>
> +/**
> + * kfd_ioctl_pc_sample_op - PC Sampling ioctl operations
> + *
> + * @KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES: Query device PC Sampling capabilities
> + * @KFD_IOCTL_PCS_OP_CREATE:             Register this process with a per-device PC sampler instance
> + * @KFD_IOCTL_PCS_OP_DESTROY:            Unregister from a previously registered PC sampler instance
> + * @KFD_IOCTL_PCS_OP_START:              Process begins taking samples from a previously registered PC sampler instance
> + * @KFD_IOCTL_PCS_OP_STOP:               Process stops taking samples from a previously registered PC sampler instance
> + */
> +enum kfd_ioctl_pc_sample_op {
> +       KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES,
> +       KFD_IOCTL_PCS_OP_CREATE,
> +       KFD_IOCTL_PCS_OP_DESTROY,
> +       KFD_IOCTL_PCS_OP_START,
> +       KFD_IOCTL_PCS_OP_STOP,
> +};
> +
> +/* Values have to be a power of 2 */
> +#define KFD_IOCTL_PCS_FLAG_POWER_OF_2 0x00000001
> +
> +enum kfd_ioctl_pc_sample_method {
> +       KFD_IOCTL_PCS_METHOD_HOSTTRAP = 1,
> +       KFD_IOCTL_PCS_METHOD_STOCHASTIC,
> +};
> +
> +enum kfd_ioctl_pc_sample_type {
> +       KFD_IOCTL_PCS_TYPE_TIME_US,
> +       KFD_IOCTL_PCS_TYPE_CLOCK_CYCLES,
> +       KFD_IOCTL_PCS_TYPE_INSTRUCTIONS
> +};
> +
> +struct kfd_pc_sample_info {
> +       __u64 value;         /* [IN] if PCS_TYPE_TIME_US: sample interval in us
> +                             * if PCS_TYPE_CLOCK_CYCLES: sample interval in graphics core clk cycles
> +                             * if PCS_TYPE_INSTRUCTIONS: sample interval in instructions issued by
> +                             * graphics compute units
> +                             */
> +       __u64 value_min;     /* [OUT] */
> +       __u64 value_max;     /* [OUT] */
> +       __u64 flags;         /* [OUT] indicate potential restrictions e.g. FLAG_POWER_OF_2 */
> +       __u32 method;        /* [IN/OUT] kfd_ioctl_pc_sample_method */
> +       __u32 type;          /* [IN/OUT] kfd_ioctl_pc_sample_type */
> +};
> +
> +struct kfd_ioctl_pc_sample_args {
> +       __u64 sample_info_ptr;   /* array of kfd_pc_sample_info */
> +       __u32 num_sample_info;
> +       __u32 op;                /* kfd_ioctl_pc_sample_op */
> +       __u32 gpu_id;
> +       __u32 trace_id;
> +};
> +
>  #define AMDKFD_IOCTL_BASE 'K'
>  #define AMDKFD_IO(nr)                  _IO(AMDKFD_IOCTL_BASE, nr)
>  #define AMDKFD_IOR(nr, type)           _IOR(AMDKFD_IOCTL_BASE, nr, type)
> @@ -1566,7 +1618,10 @@ struct kfd_ioctl_dbg_trap_args {
>  #define AMDKFD_IOC_DBG_TRAP                    \
>                 AMDKFD_IOWR(0x26, struct kfd_ioctl_dbg_trap_args)
>
> +#define AMDKFD_IOC_PC_SAMPLE           \
> +               AMDKFD_IOWR(0x27, struct kfd_ioctl_pc_sample_args)
> +
>  #define AMDKFD_COMMAND_START           0x01
> -#define AMDKFD_COMMAND_END             0x27
> +#define AMDKFD_COMMAND_END             0x28
>
>  #endif
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH 01/24] drm/amdkfd/kfd_ioctl: add pc sampling support
  2023-11-27 19:11   ` Alex Deucher
@ 2023-11-27 19:14     ` James Zhu
  0 siblings, 0 replies; 80+ messages in thread
From: James Zhu @ 2023-11-27 19:14 UTC (permalink / raw)
  To: Alex Deucher, James Zhu; +Cc: Felix.kuehling, joseph.greathouse, amd-gfx

[-- Attachment #1: Type: text/plain, Size: 3926 bytes --]


On 2023-11-27 14:11, Alex Deucher wrote:
> On Fri, Nov 3, 2023 at 9:22 AM James Zhu<James.Zhu@amd.com>  wrote:
>> From: David Yat Sin<david.yatsin@amd.com>
>>
>> Add pc sampling support in kfd_ioctl.
>>
>> Co-developed-by: James Zhu<James.Zhu@amd.com>
>> Signed-off-by: James Zhu<James.Zhu@amd.com>
>> Signed-off-by: David Yat Sin<david.yatsin@amd.com>
> For any new IOCTL interfaces, please provide a link to the user mode
> code branch which uses it in the patch description.
[JZ] will add, Thanks!
> Thanks,
>
> Alex
>
>> ---
>>   include/uapi/linux/kfd_ioctl.h | 57 +++++++++++++++++++++++++++++++++-
>>   1 file changed, 56 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
>> index f0ed68974c54..5202e29c9560 100644
>> --- a/include/uapi/linux/kfd_ioctl.h
>> +++ b/include/uapi/linux/kfd_ioctl.h
>> @@ -1446,6 +1446,58 @@ struct kfd_ioctl_dbg_trap_args {
>>          };
>>   };
>>
>> +/**
>> + * kfd_ioctl_pc_sample_op - PC Sampling ioctl operations
>> + *
>> + * @KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES: Query device PC Sampling capabilities
>> + * @KFD_IOCTL_PCS_OP_CREATE:             Register this process with a per-device PC sampler instance
>> + * @KFD_IOCTL_PCS_OP_DESTROY:            Unregister from a previously registered PC sampler instance
>> + * @KFD_IOCTL_PCS_OP_START:              Process begins taking samples from a previously registered PC sampler instance
>> + * @KFD_IOCTL_PCS_OP_STOP:               Process stops taking samples from a previously registered PC sampler instance
>> + */
>> +enum kfd_ioctl_pc_sample_op {
>> +       KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES,
>> +       KFD_IOCTL_PCS_OP_CREATE,
>> +       KFD_IOCTL_PCS_OP_DESTROY,
>> +       KFD_IOCTL_PCS_OP_START,
>> +       KFD_IOCTL_PCS_OP_STOP,
>> +};
>> +
>> +/* Values have to be a power of 2 */
>> +#define KFD_IOCTL_PCS_FLAG_POWER_OF_2 0x00000001
>> +
>> +enum kfd_ioctl_pc_sample_method {
>> +       KFD_IOCTL_PCS_METHOD_HOSTTRAP = 1,
>> +       KFD_IOCTL_PCS_METHOD_STOCHASTIC,
>> +};
>> +
>> +enum kfd_ioctl_pc_sample_type {
>> +       KFD_IOCTL_PCS_TYPE_TIME_US,
>> +       KFD_IOCTL_PCS_TYPE_CLOCK_CYCLES,
>> +       KFD_IOCTL_PCS_TYPE_INSTRUCTIONS
>> +};
>> +
>> +struct kfd_pc_sample_info {
>> +       __u64 value;         /* [IN] if PCS_TYPE_TIME_US: sample interval in us
>> +                             * if PCS_TYPE_CLOCK_CYCLES: sample interval in graphics core clk cycles
>> +                             * if PCS_TYPE_INSTRUCTIONS: sample interval in instructions issued by
>> +                             * graphics compute units
>> +                             */
>> +       __u64 value_min;     /* [OUT] */
>> +       __u64 value_max;     /* [OUT] */
>> +       __u64 flags;         /* [OUT] indicate potential restrictions e.g. FLAG_POWER_OF_2 */
>> +       __u32 method;        /* [IN/OUT] kfd_ioctl_pc_sample_method */
>> +       __u32 type;          /* [IN/OUT] kfd_ioctl_pc_sample_type */
>> +};
>> +
>> +struct kfd_ioctl_pc_sample_args {
>> +       __u64 sample_info_ptr;   /* array of kfd_pc_sample_info */
>> +       __u32 num_sample_info;
>> +       __u32 op;                /* kfd_ioctl_pc_sample_op */
>> +       __u32 gpu_id;
>> +       __u32 trace_id;
>> +};
>> +
>>   #define AMDKFD_IOCTL_BASE 'K'
>>   #define AMDKFD_IO(nr)                  _IO(AMDKFD_IOCTL_BASE, nr)
>>   #define AMDKFD_IOR(nr, type)           _IOR(AMDKFD_IOCTL_BASE, nr, type)
>> @@ -1566,7 +1618,10 @@ struct kfd_ioctl_dbg_trap_args {
>>   #define AMDKFD_IOC_DBG_TRAP                    \
>>                  AMDKFD_IOWR(0x26, struct kfd_ioctl_dbg_trap_args)
>>
>> +#define AMDKFD_IOC_PC_SAMPLE           \
>> +               AMDKFD_IOWR(0x27, struct kfd_ioctl_pc_sample_args)
>> +
>>   #define AMDKFD_COMMAND_START           0x01
>> -#define AMDKFD_COMMAND_END             0x27
>> +#define AMDKFD_COMMAND_END             0x28
>>
>>   #endif
>> --
>> 2.25.1
>>

[-- Attachment #2: Type: text/html, Size: 5011 bytes --]

^ permalink raw reply	[flat|nested] 80+ messages in thread

end of thread, other threads:[~2023-11-27 19:14 UTC | newest]

Thread overview: 80+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-03 13:11 [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu
2023-11-03 13:11 ` [PATCH 01/24] drm/amdkfd/kfd_ioctl: add pc sampling support James Zhu
2023-11-22 21:14   ` Felix Kuehling
2023-11-23 20:33     ` James Zhu
2023-11-27 19:11   ` Alex Deucher
2023-11-27 19:14     ` James Zhu
2023-11-03 13:11 ` [PATCH 02/24] drm/amdkfd: " James Zhu
2023-11-03 13:11 ` [PATCH 03/24] drm/amdkfd: enable pc sampling query James Zhu
2023-11-10 19:04   ` Yat Sin, David
2023-11-20 15:34   ` [PATCH v2 " James Zhu
2023-11-03 13:11 ` [PATCH 04/24] drm/amdkfd: add pc sampling mutex James Zhu
2023-11-03 13:11 ` [PATCH 05/24] drm/amdkfd: enable pc sampling create James Zhu
2023-11-22 21:51   ` Felix Kuehling
2023-11-23 20:25     ` James Zhu
2023-11-03 13:11 ` [PATCH 06/24] drm/amdkfd: add trace_id return James Zhu
2023-11-22 21:56   ` Felix Kuehling
2023-11-23 20:22     ` James Zhu
2023-11-22 22:21   ` Felix Kuehling
2023-11-23 22:14     ` Zhu, James
2023-11-03 13:11 ` [PATCH 07/24] drm/amdkfd: check pcs_enrty valid James Zhu
2023-11-10 19:09   ` Yat Sin, David
2023-11-20 15:55   ` [PATCH v2 " James Zhu
2023-11-22 22:15   ` [PATCH " Felix Kuehling
2023-11-23 20:18     ` James Zhu
2023-11-23 20:32       ` Felix Kuehling
2023-11-23 22:06         ` James Zhu
2023-11-03 13:11 ` [PATCH 08/24] drm/amdkfd: enable pc sampling destroy James Zhu
2023-11-03 13:11 ` [PATCH 09/24] drm/amdkfd: add interface to trigger pc sampling trap James Zhu
2023-11-03 13:11 ` [PATCH 10/24] drm/amdkfd: trigger pc sampling trap for gfx v9 James Zhu
2023-11-10 19:08   ` Yat Sin, David
2023-11-20 16:05   ` [PATCH v2 " James Zhu
2023-11-03 13:11 ` [PATCH 11/24] drm/amdkfd/gfx9: enable host trap James Zhu
2023-11-03 13:11 ` [PATCH 12/24] drm/amdgpu: use trapID 4 for " James Zhu
2023-11-20 16:08   ` [PATCH v2 " James Zhu
2023-11-03 13:11 ` [PATCH 13/24] drm/amdgpu: add sq host trap status check James Zhu
2023-11-10 19:07   ` Yat Sin, David
2023-11-20 16:16   ` [PATCH v2 " James Zhu
2023-11-03 13:11 ` [PATCH 14/24] drm/amdkfd: trigger pc sampling trap for arcturus James Zhu
2023-11-03 13:11 ` [PATCH 15/24] drm/amdkfd: trigger pc sampling trap for aldebaran James Zhu
2023-11-10 19:08   ` Yat Sin, David
2023-11-20 16:19     ` James Zhu
2023-11-03 13:11 ` [PATCH 16/24] drm/amdkfd: use bit operation set debug trap James Zhu
2023-11-10 19:08   ` Yat Sin, David
2023-11-20 16:21     ` James Zhu
2023-11-03 13:11 ` [PATCH 17/24] drm/amdkfd: add setting trap pc sampling flag James Zhu
2023-11-10 19:07   ` Yat Sin, David
2023-11-03 13:11 ` [PATCH 18/24] drm/amdkfd: enable pc sampling start James Zhu
2023-11-22 22:27   ` Felix Kuehling
2023-11-23 20:01     ` James Zhu
2023-11-23 20:21       ` Felix Kuehling
2023-11-23 22:00         ` James Zhu
2023-11-03 13:11 ` [PATCH 19/24] drm/amdkfd: enable pc sampling stop James Zhu
2023-11-10 19:07   ` Yat Sin, David
2023-11-13 15:19     ` James Zhu
2023-11-13 17:04       ` Yat Sin, David
2023-11-13 17:15         ` James Zhu
2023-11-03 13:11 ` [PATCH 20/24] drm/amdkfd: enable pc sampling work to trigger trap James Zhu
2023-11-22 22:31   ` Felix Kuehling
2023-11-23 18:27     ` James Zhu
2023-11-23 19:08       ` Felix Kuehling
2023-11-23 19:52         ` James Zhu
2023-11-03 13:11 ` [PATCH 21/24] drm/amdkfd: add queue remapping James Zhu
2023-11-22 22:35   ` Felix Kuehling
2023-11-23 16:25     ` James Zhu
2023-11-23 19:02       ` Felix Kuehling
2023-11-23 19:49         ` James Zhu
2023-11-23 22:41           ` Greathouse, Joseph
2023-11-23 23:01             ` Felix Kuehling
2023-11-23 23:16               ` James Zhu
2023-11-03 13:11 ` [PATCH 22/24] drm/amdkfd: add pc sampling release when process release James Zhu
2023-11-10 19:08   ` Yat Sin, David
2023-11-13 15:12     ` James Zhu
2023-11-13 15:19       ` Yat Sin, David
2023-11-13 15:30         ` James Zhu
2023-11-20 16:23   ` [PATCH v2 " James Zhu
2023-11-03 13:11 ` [PATCH 23/24] drm/amdkfd: add pc sampling capability check James Zhu
2023-11-22 22:40   ` Felix Kuehling
2023-11-23 16:06     ` James Zhu
2023-11-03 13:11 ` [PATCH 24/24] drm/amdkfd: bump kfd ioctl minor version for pc sampling availability James Zhu
2023-11-16 14:51 ` [PATCH 00/24] Support Host Trap Sampling for MI200 James Zhu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.