All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid
@ 2022-07-22  7:33 Victor Zhao
  2022-07-22  7:34 ` [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level Victor Zhao
                   ` (4 more replies)
  0 siblings, 5 replies; 20+ messages in thread
From: Victor Zhao @ 2022-07-22  7:33 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, emily.deng, Victor Zhao, Christian.Koenig

To meet the requirement for multi container usecase which needs
a quicker reset and not causing VRAM lost, adding the Mode2
reset handler for sienna_cichlid. Adding a AMDGPU_SKIP_MODE2_RESET
flag so driver can fallback to default reset method when mode2
reset failed and retry.

- add mode2 reset handler for sienna_cichlid
- introduce AMDGPU_SKIP_MODE2_RESET flag
- let mode2 reset fallback to default reset method if failed

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/Makefile           |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |   7 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c     |  13 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h     |   1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |   1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c         |   1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c         |   1 +
 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c   | 297 ++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h   |  32 ++
 .../pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h  |   4 +-
 drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  |   3 +-
 .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   |  54 ++++
 15 files changed, 414 insertions(+), 5 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index c7d0cd15b5ef..7030ac2d7d2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -75,7 +75,7 @@ amdgpu-y += \
 	vi.o mxgpu_vi.o nbio_v6_1.o soc15.o emu_soc.o mxgpu_ai.o nbio_v7_0.o vega10_reg_init.o \
 	vega20_reg_init.o nbio_v7_4.o nbio_v2_3.o nv.o arct_reg_init.o mxgpu_nv.o \
 	nbio_v7_2.o hdp_v4_0.o hdp_v5_0.o aldebaran_reg_init.o aldebaran.o soc21.o \
-	nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o lsdma_v6_0.o
+	sienna_cichlid.o nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o lsdma_v6_0.o
 
 # add DF block
 amdgpu-y += \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 5e53a5293935..091415a4abf0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -135,6 +135,7 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
 	reset_context.method = AMD_RESET_METHOD_NONE;
 	reset_context.reset_req_dev = adev;
 	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+	clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b79ee4ffb879..5498fda8617f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5146,6 +5146,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 	reset_context->job = job;
 	reset_context->hive = hive;
+
 	/*
 	 * Build list of devices to reset.
 	 * In case we are in XGMI hive mode, resort the device list
@@ -5265,8 +5266,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			amdgpu_ras_resume(adev);
 	} else {
 		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
-		if (r && r == -EAGAIN)
+		if (r && r == -EAGAIN) {
+			set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags);
+			adev->asic_reset_res = 0;
 			goto retry;
+		}
 	}
 
 skip_hw_reset:
@@ -5694,6 +5698,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
 	reset_context.reset_req_dev = adev;
 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
+	set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 	adev->no_hw_access = true;
 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 10fdd12cf853..9844d99075e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -71,6 +71,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		reset_context.method = AMD_RESET_METHOD_NONE;
 		reset_context.reset_req_dev = adev;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
 		if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4f321375015c..259ec860890a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1949,6 +1949,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 		reset_context.method = AMD_RESET_METHOD_NONE;
 		reset_context.reset_req_dev = adev;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 32c86a0b145c..831fb222139c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -23,6 +23,7 @@
 
 #include "amdgpu_reset.h"
 #include "aldebaran.h"
+#include "sienna_cichlid.h"
 
 int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl,
 			     struct amdgpu_reset_handler *handler)
@@ -40,6 +41,9 @@ int amdgpu_reset_init(struct amdgpu_device *adev)
 	case IP_VERSION(13, 0, 2):
 		ret = aldebaran_reset_init(adev);
 		break;
+	case IP_VERSION(11, 0, 7):
+		ret = sienna_cichlid_reset_init(adev);
+		break;
 	default:
 		break;
 	}
@@ -55,6 +59,9 @@ int amdgpu_reset_fini(struct amdgpu_device *adev)
 	case IP_VERSION(13, 0, 2):
 		ret = aldebaran_reset_fini(adev);
 		break;
+	case IP_VERSION(11, 0, 7):
+		ret = sienna_cichlid_reset_fini(adev);
+		break;
 	default:
 		break;
 	}
@@ -67,6 +74,9 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
 {
 	struct amdgpu_reset_handler *reset_handler = NULL;
 
+	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
+		return -ENOSYS;
+
 	if (adev->reset_cntl && adev->reset_cntl->get_reset_handler)
 		reset_handler = adev->reset_cntl->get_reset_handler(
 			adev->reset_cntl, reset_context);
@@ -83,6 +93,9 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
 	int ret;
 	struct amdgpu_reset_handler *reset_handler = NULL;
 
+	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
+		return -ENOSYS;
+
 	if (adev->reset_cntl)
 		reset_handler = adev->reset_cntl->get_reset_handler(
 			adev->reset_cntl, reset_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 9e55a5d7a825..cc4b2eeb24cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -30,6 +30,7 @@ enum AMDGPU_RESET_FLAGS {
 
 	AMDGPU_NEED_FULL_RESET = 0,
 	AMDGPU_SKIP_HW_RESET = 1,
+	AMDGPU_SKIP_MODE2_RESET = 2,
 };
 
 struct amdgpu_reset_context {
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 12906ba74462..a2f04b249132 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -290,6 +290,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 		reset_context.method = AMD_RESET_METHOD_NONE;
 		reset_context.reset_req_dev = adev;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index e07757eea7ad..a977f0027928 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -317,6 +317,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 		reset_context.method = AMD_RESET_METHOD_NONE;
 		reset_context.reset_req_dev = adev;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 288c414babdf..fd14fa9b9cd7 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
 		reset_context.method = AMD_RESET_METHOD_NONE;
 		reset_context.reset_req_dev = adev;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
new file mode 100644
index 000000000000..0512960bed23
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "sienna_cichlid.h"
+#include "amdgpu_reset.h"
+#include "amdgpu_amdkfd.h"
+#include "amdgpu_dpm.h"
+#include "amdgpu_job.h"
+#include "amdgpu_ring.h"
+#include "amdgpu_ras.h"
+#include "amdgpu_psp.h"
+#include "amdgpu_xgmi.h"
+
+static struct amdgpu_reset_handler *
+sienna_cichlid_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
+			    struct amdgpu_reset_context *reset_context)
+{
+	struct amdgpu_reset_handler *handler;
+	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
+
+	if (reset_context->method != AMD_RESET_METHOD_NONE) {
+		list_for_each_entry(handler, &reset_ctl->reset_handlers,
+				     handler_list) {
+			if (handler->reset_method == reset_context->method)
+				return handler;
+		}
+	} else {
+		list_for_each_entry(handler, &reset_ctl->reset_handlers,
+				     handler_list) {
+			if (handler->reset_method == AMD_RESET_METHOD_MODE2 &&
+			    adev->pm.fw_version >= 0x3a5500 &&
+			    !amdgpu_sriov_vf(adev)) {
+				reset_context->method = AMD_RESET_METHOD_MODE2;
+				return handler;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+static int sienna_cichlid_mode2_suspend_ip(struct amdgpu_device *adev)
+{
+	int r, i;
+
+	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
+	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
+
+	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
+		if (!(adev->ip_blocks[i].version->type ==
+			      AMD_IP_BLOCK_TYPE_GFX ||
+		      adev->ip_blocks[i].version->type ==
+			      AMD_IP_BLOCK_TYPE_SDMA))
+			continue;
+
+		r = adev->ip_blocks[i].version->funcs->suspend(adev);
+
+		if (r) {
+			dev_err(adev->dev,
+				"suspend of IP block <%s> failed %d\n",
+				adev->ip_blocks[i].version->funcs->name, r);
+			return r;
+		}
+		adev->ip_blocks[i].status.hw = false;
+	}
+
+	return r;
+}
+
+static int
+sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
+				  struct amdgpu_reset_context *reset_context)
+{
+	int r = 0;
+	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
+
+	if (!amdgpu_sriov_vf(adev))
+		r = sienna_cichlid_mode2_suspend_ip(adev);
+
+	return r;
+}
+
+static void sienna_cichlid_async_reset(struct work_struct *work)
+{
+	struct amdgpu_reset_handler *handler;
+	struct amdgpu_reset_control *reset_ctl =
+		container_of(work, struct amdgpu_reset_control, reset_work);
+	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
+
+	list_for_each_entry(handler, &reset_ctl->reset_handlers,
+			     handler_list) {
+		if (handler->reset_method == reset_ctl->active_reset) {
+			dev_dbg(adev->dev, "Resetting device\n");
+			handler->do_reset(adev);
+			break;
+		}
+	}
+}
+
+static int sienna_cichlid_mode2_reset(struct amdgpu_device *adev)
+{
+	/* disable BM */
+	pci_clear_master(adev->pdev);
+	adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev);
+	return adev->asic_reset_res;
+}
+
+static int
+sienna_cichlid_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
+			      struct amdgpu_reset_context *reset_context)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
+	int r;
+
+	r = sienna_cichlid_mode2_reset(adev);
+	if (r) {
+		dev_err(adev->dev,
+			"ASIC reset failed with error, %d ", r);
+	}
+	return r;
+}
+
+static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device *adev)
+{
+	int i, r;
+	struct psp_context *psp = &adev->psp;
+
+	r = psp_rlc_autoload_start(psp);
+	if (r) {
+		dev_err(adev->dev, "Failed to start rlc autoload\n");
+		return r;
+	}
+
+	/* Reinit GFXHUB */
+	adev->gfxhub.funcs->init(adev);
+	r = adev->gfxhub.funcs->gart_enable(adev);
+	if (r) {
+		dev_err(adev->dev, "GFXHUB gart reenable failed after reset\n");
+		return r;
+	}
+
+	for (i = 0; i < adev->num_ip_blocks; i++) {
+		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
+			r = adev->ip_blocks[i].version->funcs->resume(adev);
+		if (r) {
+			dev_err(adev->dev,
+				"resume of IP block <%s> failed %d\n",
+				adev->ip_blocks[i].version->funcs->name, r);
+			return r;
+		}
+
+		adev->ip_blocks[i].status.hw = true;
+	}
+
+	for (i = 0; i < adev->num_ip_blocks; i++) {
+		if (!(adev->ip_blocks[i].version->type ==
+			      AMD_IP_BLOCK_TYPE_GFX ||
+		      adev->ip_blocks[i].version->type ==
+			      AMD_IP_BLOCK_TYPE_SDMA))
+			continue;
+		r = adev->ip_blocks[i].version->funcs->resume(adev);
+		if (r) {
+			dev_err(adev->dev,
+				"resume of IP block <%s> failed %d\n",
+				adev->ip_blocks[i].version->funcs->name, r);
+			return r;
+		}
+
+		adev->ip_blocks[i].status.hw = true;
+	}
+
+	for (i = 0; i < adev->num_ip_blocks; i++) {
+		if (!(adev->ip_blocks[i].version->type ==
+			      AMD_IP_BLOCK_TYPE_GFX ||
+		      adev->ip_blocks[i].version->type ==
+			      AMD_IP_BLOCK_TYPE_SDMA))
+			continue;
+
+		if (adev->ip_blocks[i].version->funcs->late_init) {
+			r = adev->ip_blocks[i].version->funcs->late_init(
+				(void *)adev);
+			if (r) {
+				dev_err(adev->dev,
+					"late_init of IP block <%s> failed %d after reset\n",
+					adev->ip_blocks[i].version->funcs->name,
+					r);
+				return r;
+			}
+		}
+		adev->ip_blocks[i].status.late_initialized = true;
+	}
+
+	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
+	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
+
+	return r;
+}
+
+static int
+sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
+				  struct amdgpu_reset_context *reset_context)
+{
+	int r;
+	struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl->handle;
+
+	dev_info(tmp_adev->dev,
+			"GPU reset succeeded, trying to resume\n");
+	r = sienna_cichlid_mode2_restore_ip(tmp_adev);
+	if (r)
+		goto end;
+
+	/*
+	* Add this ASIC as tracked as reset was already
+	* complete successfully.
+	*/
+	amdgpu_register_gpu_instance(tmp_adev);
+
+	/* Resume RAS */
+	amdgpu_ras_resume(tmp_adev);
+
+	amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
+
+	r = amdgpu_ib_ring_tests(tmp_adev);
+	if (r) {
+		dev_err(tmp_adev->dev,
+			"ib ring test failed (%d).\n", r);
+		r = -EAGAIN;
+		tmp_adev->asic_reset_res = r;
+		goto end;
+	}
+
+end:
+	if (r)
+		return -EAGAIN;
+	else
+		return r;
+}
+
+static struct amdgpu_reset_handler sienna_cichlid_mode2_handler = {
+	.reset_method		= AMD_RESET_METHOD_MODE2,
+	.prepare_env		= NULL,
+	.prepare_hwcontext	= sienna_cichlid_mode2_prepare_hwcontext,
+	.perform_reset		= sienna_cichlid_mode2_perform_reset,
+	.restore_hwcontext	= sienna_cichlid_mode2_restore_hwcontext,
+	.restore_env		= NULL,
+	.do_reset		= sienna_cichlid_mode2_reset,
+};
+
+int sienna_cichlid_reset_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_reset_control *reset_ctl;
+
+	reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);
+	if (!reset_ctl)
+		return -ENOMEM;
+
+	reset_ctl->handle = adev;
+	reset_ctl->async_reset = sienna_cichlid_async_reset;
+	reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
+	reset_ctl->get_reset_handler = sienna_cichlid_get_reset_handler;
+
+	INIT_LIST_HEAD(&reset_ctl->reset_handlers);
+	INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);
+	/* Only mode2 is handled through reset control now */
+	amdgpu_reset_add_handler(reset_ctl, &sienna_cichlid_mode2_handler);
+
+	adev->reset_cntl = reset_ctl;
+
+	return 0;
+}
+
+int sienna_cichlid_reset_fini(struct amdgpu_device *adev)
+{
+	kfree(adev->reset_cntl);
+	adev->reset_cntl = NULL;
+	return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
new file mode 100644
index 000000000000..5213b162dacd
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __SIENNA_CICHLID_H__
+#define __SIENNA_CICHLID_H__
+
+#include "amdgpu.h"
+
+int sienna_cichlid_reset_init(struct amdgpu_device *adev);
+int sienna_cichlid_reset_fini(struct amdgpu_device *adev);
+
+#endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
index d2e10a724560..82cf9e563065 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
@@ -137,7 +137,7 @@
 #define PPSMC_MSG_DisallowGpo                    0x56
 
 #define PPSMC_MSG_Enable2ndUSB20Port             0x57
-
-#define PPSMC_Message_Count                      0x58
+#define PPSMC_MSG_DriverMode2Reset               0x5D
+#define PPSMC_Message_Count                      0x5E
 
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
index 19084a4fcb2b..28f6a1eb6945 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
@@ -235,7 +235,8 @@
 	__SMU_DUMMY_MAP(UnforceGfxVid),           \
 	__SMU_DUMMY_MAP(HeavySBR),			\
 	__SMU_DUMMY_MAP(SetBadHBMPagesRetiredFlagsPerChannel), \
-	__SMU_DUMMY_MAP(EnableGfxImu),
+	__SMU_DUMMY_MAP(EnableGfxImu), \
+	__SMU_DUMMY_MAP(DriverMode2Reset),
 
 #undef __SMU_DUMMY_MAP
 #define __SMU_DUMMY_MAP(type)	SMU_MSG_##type
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index fa520d79ef67..a73d241bb64f 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -154,6 +154,7 @@ static struct cmn2asic_msg_mapping sienna_cichlid_message_map[SMU_MSG_MAX_COUNT]
 	MSG_MAP(SetGpoFeaturePMask,		PPSMC_MSG_SetGpoFeaturePMask,          0),
 	MSG_MAP(DisallowGpo,			PPSMC_MSG_DisallowGpo,                 0),
 	MSG_MAP(Enable2ndUSB20Port,		PPSMC_MSG_Enable2ndUSB20Port,          0),
+	MSG_MAP(DriverMode2Reset,		PPSMC_MSG_DriverMode2Reset,	       0),
 };
 
 static struct cmn2asic_mapping sienna_cichlid_clk_map[SMU_CLK_COUNT] = {
@@ -4254,6 +4255,57 @@ static int sienna_cichlid_stb_get_data_direct(struct smu_context *smu,
 	return 0;
 }
 
+static bool sienna_cichlid_is_mode2_reset_supported(struct smu_context *smu)
+{
+	return true;
+}
+
+static int sienna_cichlid_mode2_reset(struct smu_context *smu)
+{
+	u32 smu_version;
+	int ret = 0, index;
+	struct amdgpu_device *adev = smu->adev;
+	int timeout = 100;
+
+	smu_cmn_get_smc_version(smu, NULL, &smu_version);
+
+	index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
+						SMU_MSG_DriverMode2Reset);
+
+	mutex_lock(&smu->message_lock);
+
+	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
+					       SMU_RESET_MODE_2);
+
+	ret = smu_cmn_wait_for_response(smu);
+	while (ret != 0 && timeout) {
+		ret = smu_cmn_wait_for_response(smu);
+		/* Wait a bit more time for getting ACK */
+		if (ret != 0) {
+			--timeout;
+			usleep_range(500, 1000);
+			continue;
+		} else {
+			break;
+		}
+	}
+
+	if (!timeout) {
+		dev_err(adev->dev,
+			"failed to send mode2 message \tparam: 0x%08x response %#x\n",
+			SMU_RESET_MODE_2, ret);
+		goto out;
+	}
+
+	dev_info(smu->adev->dev, "restore config space...\n");
+	/* Restore the config space saved during init */
+	amdgpu_device_load_pci_state(adev->pdev);
+out:
+	mutex_unlock(&smu->message_lock);
+
+	return ret;
+}
+
 static const struct pptable_funcs sienna_cichlid_ppt_funcs = {
 	.get_allowed_feature_mask = sienna_cichlid_get_allowed_feature_mask,
 	.set_default_dpm_table = sienna_cichlid_set_default_dpm_table,
@@ -4348,6 +4400,8 @@ static const struct pptable_funcs sienna_cichlid_ppt_funcs = {
 	.get_default_config_table_settings = sienna_cichlid_get_default_config_table_settings,
 	.set_config_table = sienna_cichlid_set_config_table,
 	.get_unique_id = sienna_cichlid_get_unique_id,
+	.mode2_reset_is_support = sienna_cichlid_is_mode2_reset_supported,
+	.mode2_reset = sienna_cichlid_mode2_reset,
 };
 
 void sienna_cichlid_set_ppt_funcs(struct smu_context *smu)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level
  2022-07-22  7:33 [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
@ 2022-07-22  7:34 ` Victor Zhao
  2022-07-22  8:19   ` Christian König
  2022-07-22  7:34 ` [PATCH 3/5] drm/amdgpu: save and restore gc hub regs Victor Zhao
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 20+ messages in thread
From: Victor Zhao @ 2022-07-22  7:34 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, emily.deng, Victor Zhao, Christian.Koenig

Introduce amdgpu_reset_level debugfs in order to help debug and
test specific type of reset. Also helps blocking unwanted type of
resets.

By default, mode2 reset will not be enabled

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 20 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c     |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c   |  6 ++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    |  3 +++
 5 files changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 6cd1e0a6dffc..c661231a6a07 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -238,6 +238,7 @@ extern int amdgpu_si_support;
 extern int amdgpu_cik_support;
 #endif
 extern int amdgpu_num_kcq;
+extern uint amdgpu_reset_level_mask;
 
 #define AMDGPU_VCNFW_LOG_SIZE (32 * 1024)
 extern int amdgpu_vcnfw_log;
@@ -274,6 +275,9 @@ extern int amdgpu_vcnfw_log;
 #define AMDGPU_RESET_VCE			(1 << 13)
 #define AMDGPU_RESET_VCE1			(1 << 14)
 
+#define AMDGPU_RESET_LEVEL_SOFT_RECOVERY (1 << 0)
+#define AMDGPU_RESET_LEVEL_MODE2 (1 << 1)
+
 /* max cursor sizes (in pixels) */
 #define CIK_CURSOR_WIDTH 128
 #define CIK_CURSOR_HEIGHT 128
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index e2eec985adb3..235c48e4ba4d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1661,12 +1661,29 @@ static int amdgpu_debugfs_sclk_set(void *data, u64 val)
 	return ret;
 }
 
+static int amdgpu_debugfs_reset_level_get(void *data, u64 *val)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)data;
+	*val = amdgpu_reset_level_mask;
+	return 0;
+}
+
+static int amdgpu_debugfs_reset_level_set(void *data, u64 val)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)data;
+	amdgpu_reset_level_mask = val;
+	return 0;
+}
+
 DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
 			amdgpu_debugfs_ib_preempt, "%llu\n");
 
 DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
 			amdgpu_debugfs_sclk_set, "%llu\n");
 
+DEFINE_DEBUGFS_ATTRIBUTE(fops_reset_level, amdgpu_debugfs_reset_level_get,
+			amdgpu_debugfs_reset_level_set, "%llu\n");
+
 static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
 				char __user *buf, size_t size, loff_t *pos)
 {
@@ -1785,6 +1802,9 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 		return PTR_ERR(ent);
 	}
 
+	debugfs_create_file("amdgpu_reset_level", 0200, root, adev,
+				  &fops_reset_level);
+
 	/* Register debugfs entries for amdgpu_ttm */
 	amdgpu_ttm_debugfs_init(adev);
 	amdgpu_debugfs_pm_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index e8c6c3fe9374..fb8f3cb853a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -198,6 +198,7 @@ struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
 	.timeout_fatal_disable = false,
 	.period = 0x0, /* default to 0x0 (timeout disable) */
 };
+uint amdgpu_reset_level_mask = 0x1;
 
 /**
  * DOC: vramlimit (int)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 831fb222139c..f16ab1a54b70 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -74,6 +74,9 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
 {
 	struct amdgpu_reset_handler *reset_handler = NULL;
 
+	if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
+		return -ENOSYS;
+
 	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
 		return -ENOSYS;
 
@@ -93,6 +96,9 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
 	int ret;
 	struct amdgpu_reset_handler *reset_handler = NULL;
 
+	if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
+		return -ENOSYS;
+
 	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
 		return -ENOSYS;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index d3558c34d406..1ffdc050a077 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -405,6 +405,9 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
 {
 	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
 
+	if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_SOFT_RECOVERY))
+		return false;
+
 	if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence)
 		return false;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 3/5] drm/amdgpu: save and restore gc hub regs
  2022-07-22  7:33 [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
  2022-07-22  7:34 ` [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level Victor Zhao
@ 2022-07-22  7:34 ` Victor Zhao
  2022-07-25 21:18   ` Andrey Grodzovsky
  2022-07-22  7:34 ` [PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset Victor Zhao
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 20+ messages in thread
From: Victor Zhao @ 2022-07-22  7:34 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, emily.deng, Victor Zhao, Christian.Koenig

Save and restore gfxhub regs as they will be reset during mode 2

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h    |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h       | 26 +++++++
 drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c      | 72 +++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c   |  7 +-
 .../include/asic_reg/gc/gc_10_3_0_offset.h    |  4 ++
 5 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
index beabab515836..f8036f2b100e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
@@ -35,6 +35,8 @@ struct amdgpu_gfxhub_funcs {
 	void (*init)(struct amdgpu_device *adev);
 	int (*get_xgmi_info)(struct amdgpu_device *adev);
 	void (*utcl2_harvest)(struct amdgpu_device *adev);
+	void (*mode2_save_regs)(struct amdgpu_device *adev);
+	void (*mode2_restore_regs)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_gfxhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 008eaca27151..0305b660cd17 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -264,6 +264,32 @@ struct amdgpu_gmc {
 	u64 mall_size;
 	/* number of UMC instances */
 	int num_umc;
+	/* mode2 save restore */
+	u64 VM_L2_CNTL;
+	u64 VM_L2_CNTL2;
+	u64 VM_DUMMY_PAGE_FAULT_CNTL;
+	u64 VM_DUMMY_PAGE_FAULT_ADDR_LO32;
+	u64 VM_DUMMY_PAGE_FAULT_ADDR_HI32;
+	u64 VM_L2_PROTECTION_FAULT_CNTL;
+	u64 VM_L2_PROTECTION_FAULT_CNTL2;
+	u64 VM_L2_PROTECTION_FAULT_MM_CNTL3;
+	u64 VM_L2_PROTECTION_FAULT_MM_CNTL4;
+	u64 VM_L2_PROTECTION_FAULT_ADDR_LO32;
+	u64 VM_L2_PROTECTION_FAULT_ADDR_HI32;
+	u64 VM_DEBUG;
+	u64 VM_L2_MM_GROUP_RT_CLASSES;
+	u64 VM_L2_BANK_SELECT_RESERVED_CID;
+	u64 VM_L2_BANK_SELECT_RESERVED_CID2;
+	u64 VM_L2_CACHE_PARITY_CNTL;
+	u64 VM_L2_IH_LOG_CNTL;
+	u64 VM_CONTEXT_CNTL[16];
+	u64 VM_CONTEXT_PAGE_TABLE_BASE_ADDR_LO32[16];
+	u64 VM_CONTEXT_PAGE_TABLE_BASE_ADDR_HI32[16];
+	u64 VM_CONTEXT_PAGE_TABLE_START_ADDR_LO32[16];
+	u64 VM_CONTEXT_PAGE_TABLE_START_ADDR_HI32[16];
+	u64 VM_CONTEXT_PAGE_TABLE_END_ADDR_LO32[16];
+	u64 VM_CONTEXT_PAGE_TABLE_END_ADDR_HI32[16];
+	u64 MC_VM_MX_L1_TLB_CNTL;
 };
 
 #define amdgpu_gmc_flush_gpu_tlb(adev, vmid, vmhub, type) ((adev)->gmc.gmc_funcs->flush_gpu_tlb((adev), (vmid), (vmhub), (type)))
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
index d8c531581116..51cf8acd2d79 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
@@ -576,6 +576,76 @@ static void gfxhub_v2_1_utcl2_harvest(struct amdgpu_device *adev)
 	}
 }
 
+static void gfxhub_v2_1_save_regs(struct amdgpu_device *adev)
+{
+	int i;
+	adev->gmc.VM_L2_CNTL = RREG32_SOC15(GC, 0, mmGCVM_L2_CNTL);
+	adev->gmc.VM_L2_CNTL2 = RREG32_SOC15(GC, 0, mmGCVM_L2_CNTL2);
+	adev->gmc.VM_DUMMY_PAGE_FAULT_CNTL = RREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_CNTL);
+	adev->gmc.VM_DUMMY_PAGE_FAULT_ADDR_LO32 = RREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_ADDR_LO32);
+	adev->gmc.VM_DUMMY_PAGE_FAULT_ADDR_HI32 = RREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_ADDR_HI32);
+	adev->gmc.VM_L2_PROTECTION_FAULT_CNTL = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL);
+	adev->gmc.VM_L2_PROTECTION_FAULT_CNTL2 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL2);
+	adev->gmc.VM_L2_PROTECTION_FAULT_MM_CNTL3 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_MM_CNTL3);
+	adev->gmc.VM_L2_PROTECTION_FAULT_MM_CNTL4 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_MM_CNTL4);
+	adev->gmc.VM_L2_PROTECTION_FAULT_ADDR_LO32 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_ADDR_LO32);
+	adev->gmc.VM_L2_PROTECTION_FAULT_ADDR_HI32 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_ADDR_HI32);
+	adev->gmc.VM_DEBUG = RREG32_SOC15(GC, 0, mmGCVM_DEBUG);
+	adev->gmc.VM_L2_MM_GROUP_RT_CLASSES = RREG32_SOC15(GC, 0, mmGCVM_L2_MM_GROUP_RT_CLASSES);
+	adev->gmc.VM_L2_BANK_SELECT_RESERVED_CID = RREG32_SOC15(GC, 0, mmGCVM_L2_BANK_SELECT_RESERVED_CID);
+	adev->gmc.VM_L2_BANK_SELECT_RESERVED_CID2 = RREG32_SOC15(GC, 0, mmGCVM_L2_BANK_SELECT_RESERVED_CID2);
+	adev->gmc.VM_L2_CACHE_PARITY_CNTL = RREG32_SOC15(GC, 0, mmGCVM_L2_CACHE_PARITY_CNTL);
+	adev->gmc.VM_L2_IH_LOG_CNTL = RREG32_SOC15(GC, 0, mmGCVM_L2_IH_LOG_CNTL);
+
+	for (i = 0; i <= 15; i++) {
+		adev->gmc.VM_CONTEXT_CNTL[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_CNTL, i);
+		adev->gmc.VM_CONTEXT_PAGE_TABLE_BASE_ADDR_LO32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32, i * 2);
+		adev->gmc.VM_CONTEXT_PAGE_TABLE_BASE_ADDR_HI32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32, i * 2);
+		adev->gmc.VM_CONTEXT_PAGE_TABLE_START_ADDR_LO32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32, i * 2);
+		adev->gmc.VM_CONTEXT_PAGE_TABLE_START_ADDR_HI32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32, i * 2);
+		adev->gmc.VM_CONTEXT_PAGE_TABLE_END_ADDR_LO32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32, i * 2);
+		adev->gmc.VM_CONTEXT_PAGE_TABLE_END_ADDR_HI32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32, i * 2);
+	}
+
+	adev->gmc.MC_VM_MX_L1_TLB_CNTL = RREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL);
+}
+
+static void gfxhub_v2_1_restore_regs(struct amdgpu_device *adev)
+{
+	int i;
+	WREG32_SOC15(GC, 0, mmGCVM_L2_CNTL, adev->gmc.VM_L2_CNTL);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_CNTL2, adev->gmc.VM_L2_CNTL2);
+	WREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_CNTL, adev->gmc.VM_DUMMY_PAGE_FAULT_CNTL);
+	WREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_ADDR_LO32, adev->gmc.VM_DUMMY_PAGE_FAULT_ADDR_LO32);
+	WREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_ADDR_HI32, adev->gmc.VM_DUMMY_PAGE_FAULT_ADDR_HI32);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL, adev->gmc.VM_L2_PROTECTION_FAULT_CNTL);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL2, adev->gmc.VM_L2_PROTECTION_FAULT_CNTL2);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_MM_CNTL3, adev->gmc.VM_L2_PROTECTION_FAULT_MM_CNTL3);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_MM_CNTL4, adev->gmc.VM_L2_PROTECTION_FAULT_MM_CNTL4);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_ADDR_LO32, adev->gmc.VM_L2_PROTECTION_FAULT_ADDR_LO32);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_ADDR_HI32, adev->gmc.VM_L2_PROTECTION_FAULT_ADDR_HI32);
+	WREG32_SOC15(GC, 0, mmGCVM_DEBUG, adev->gmc.VM_DEBUG);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_MM_GROUP_RT_CLASSES, adev->gmc.VM_L2_MM_GROUP_RT_CLASSES);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_BANK_SELECT_RESERVED_CID, adev->gmc.VM_L2_BANK_SELECT_RESERVED_CID);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_BANK_SELECT_RESERVED_CID2, adev->gmc.VM_L2_BANK_SELECT_RESERVED_CID2);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_CACHE_PARITY_CNTL, adev->gmc.VM_L2_CACHE_PARITY_CNTL);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_IH_LOG_CNTL, adev->gmc.VM_L2_IH_LOG_CNTL);
+
+	for (i = 0; i <= 15; i++) {
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_CNTL, i, adev->gmc.VM_CONTEXT_CNTL[i]);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_BASE_ADDR_LO32[i]);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_BASE_ADDR_HI32[i]);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_START_ADDR_LO32[i]);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_START_ADDR_HI32[i]);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_END_ADDR_LO32[i]);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_END_ADDR_HI32[i]);
+	}
+
+	WREG32_SOC15(GC, 0, mmGCMC_VM_FB_LOCATION_BASE, adev->gmc.vram_start >> 24);
+	WREG32_SOC15(GC, 0, mmGCMC_VM_FB_LOCATION_TOP, adev->gmc.vram_end >> 24);
+	WREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL, adev->gmc.MC_VM_MX_L1_TLB_CNTL);
+}
+
 const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
 	.get_fb_location = gfxhub_v2_1_get_fb_location,
 	.get_mc_fb_offset = gfxhub_v2_1_get_mc_fb_offset,
@@ -586,4 +656,6 @@ const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
 	.init = gfxhub_v2_1_init,
 	.get_xgmi_info = gfxhub_v2_1_get_xgmi_info,
 	.utcl2_harvest = gfxhub_v2_1_utcl2_harvest,
+	.mode2_save_regs = gfxhub_v2_1_save_regs,
+	.mode2_restore_regs = gfxhub_v2_1_restore_regs,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
index 0512960bed23..51a5b68f77d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
+++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
@@ -94,8 +94,11 @@ sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
 	int r = 0;
 	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
 
-	if (!amdgpu_sriov_vf(adev))
+	if (!amdgpu_sriov_vf(adev)) {
+		if (adev->gfxhub.funcs->mode2_save_regs)
+			adev->gfxhub.funcs->mode2_save_regs(adev);
 		r = sienna_cichlid_mode2_suspend_ip(adev);
+	}
 
 	return r;
 }
@@ -152,6 +155,8 @@ static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device *adev)
 	}
 
 	/* Reinit GFXHUB */
+	if (adev->gfxhub.funcs->mode2_restore_regs)
+		adev->gfxhub.funcs->mode2_restore_regs(adev);
 	adev->gfxhub.funcs->init(adev);
 	r = adev->gfxhub.funcs->gart_enable(adev);
 	if (r) {
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
index f21554a1c86c..594bffce93a9 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
@@ -3129,6 +3129,8 @@
 #define mmGCVM_L2_PROTECTION_FAULT_DEFAULT_ADDR_LO32_BASE_IDX                                          0
 #define mmGCVM_L2_PROTECTION_FAULT_DEFAULT_ADDR_HI32                                                   0x15cc
 #define mmGCVM_L2_PROTECTION_FAULT_DEFAULT_ADDR_HI32_BASE_IDX                                          0
+#define mmGCVM_DEBUG                                                                                   0x15cd
+#define mmGCVM_DEBUG_BASE_IDX                                                                          0
 #define mmGCVM_L2_CONTEXT1_IDENTITY_APERTURE_LOW_ADDR_LO32                                             0x15ce
 #define mmGCVM_L2_CONTEXT1_IDENTITY_APERTURE_LOW_ADDR_LO32_BASE_IDX                                    0
 #define mmGCVM_L2_CONTEXT1_IDENTITY_APERTURE_LOW_ADDR_HI32                                             0x15cf
@@ -3151,6 +3153,8 @@
 #define mmGCVM_L2_BANK_SELECT_RESERVED_CID2_BASE_IDX                                                   0
 #define mmGCVM_L2_CACHE_PARITY_CNTL                                                                    0x15d8
 #define mmGCVM_L2_CACHE_PARITY_CNTL_BASE_IDX                                                           0
+#define mmGCVM_L2_IH_LOG_CNTL                                                                          0x15d9
+#define mmGCVM_L2_IH_LOG_CNTL_BASE_IDX                                                                 0
 #define mmGCVM_L2_CNTL5                                                                                0x15dc
 #define mmGCVM_L2_CNTL5_BASE_IDX                                                                       0
 #define mmGCVM_L2_GCR_CNTL                                                                             0x15dd
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset
  2022-07-22  7:33 [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
  2022-07-22  7:34 ` [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level Victor Zhao
  2022-07-22  7:34 ` [PATCH 3/5] drm/amdgpu: save and restore gc hub regs Victor Zhao
@ 2022-07-22  7:34 ` Victor Zhao
  2022-07-25 21:19   ` Andrey Grodzovsky
  2022-07-22  7:34 ` [PATCH 5/5] drm/amdgpu: reduce reset time Victor Zhao
  2022-07-25 21:05 ` [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid Andrey Grodzovsky
  4 siblings, 1 reply; 20+ messages in thread
From: Victor Zhao @ 2022-07-22  7:34 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, emily.deng, Victor Zhao, Christian.Koenig

For some hang caused by slow tests, engine cannot be stopped which
may cause resume failure after reset. In this case, force halt
engine by reverting context addresses

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c    | 36 +++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c |  2 ++
 4 files changed, 40 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5498fda8617f..833dc5e224d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5037,6 +5037,7 @@ static void amdgpu_device_recheck_guilty_jobs(
 
 			/* set guilty */
 			drm_sched_increase_karma(s_job);
+			amdgpu_reset_prepare_hwcontext(adev, reset_context);
 retry:
 			/* do hw reset */
 			if (amdgpu_sriov_vf(adev)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
index f8036f2b100e..c7b44aeb671b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
@@ -37,6 +37,7 @@ struct amdgpu_gfxhub_funcs {
 	void (*utcl2_harvest)(struct amdgpu_device *adev);
 	void (*mode2_save_regs)(struct amdgpu_device *adev);
 	void (*mode2_restore_regs)(struct amdgpu_device *adev);
+	void (*halt)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_gfxhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
index 51cf8acd2d79..8cf53e039c11 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
@@ -646,6 +646,41 @@ static void gfxhub_v2_1_restore_regs(struct amdgpu_device *adev)
 	WREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL, adev->gmc.MC_VM_MX_L1_TLB_CNTL);
 }
 
+static void gfxhub_v2_1_halt(struct amdgpu_device *adev)
+{
+	struct amdgpu_vmhub *hub = &adev->vmhub[AMDGPU_GFXHUB_0];
+	int i;
+	uint32_t tmp;
+	int time = 1000;
+
+	gfxhub_v2_1_set_fault_enable_default(adev, false);
+
+	for (i = 0; i <= 14; i++) {
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32,
+				    i * hub->ctx_addr_distance, ~0);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32,
+				    i * hub->ctx_addr_distance, ~0);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_LO32,
+				    i * hub->ctx_addr_distance,
+				    0);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_HI32,
+				    i * hub->ctx_addr_distance,
+				    0);
+	}
+	tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
+	while ((tmp & (GRBM_STATUS2__EA_BUSY_MASK |
+		      GRBM_STATUS2__EA_LINK_BUSY_MASK)) != 0 &&
+	       time) {
+		udelay(100);
+		time--;
+		tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
+	}
+
+	if (!time) {
+		DRM_WARN("failed to wait for GRBM(EA) idle\n");
+	}
+}
+
 const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
 	.get_fb_location = gfxhub_v2_1_get_fb_location,
 	.get_mc_fb_offset = gfxhub_v2_1_get_mc_fb_offset,
@@ -658,4 +693,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
 	.utcl2_harvest = gfxhub_v2_1_utcl2_harvest,
 	.mode2_save_regs = gfxhub_v2_1_save_regs,
 	.mode2_restore_regs = gfxhub_v2_1_restore_regs,
+	.halt = gfxhub_v2_1_halt,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
index 51a5b68f77d3..fead7251292f 100644
--- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
+++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
@@ -97,6 +97,8 @@ sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
 	if (!amdgpu_sriov_vf(adev)) {
 		if (adev->gfxhub.funcs->mode2_save_regs)
 			adev->gfxhub.funcs->mode2_save_regs(adev);
+		if (adev->gfxhub.funcs->halt)
+			adev->gfxhub.funcs->halt(adev);
 		r = sienna_cichlid_mode2_suspend_ip(adev);
 	}
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 5/5] drm/amdgpu: reduce reset time
  2022-07-22  7:33 [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
                   ` (2 preceding siblings ...)
  2022-07-22  7:34 ` [PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset Victor Zhao
@ 2022-07-22  7:34 ` Victor Zhao
  2022-07-25 21:17   ` Andrey Grodzovsky
  2022-07-25 21:05 ` [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid Andrey Grodzovsky
  4 siblings, 1 reply; 20+ messages in thread
From: Victor Zhao @ 2022-07-22  7:34 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, emily.deng, Victor Zhao, Christian.Koenig

In multi container use case, reset time is important, so skip ring
tests and cp halt wait during ip suspending for reset as they are
going to fail and cost more time on reset

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 26 +++++++++++++++----------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 222d3d7ea076..f872495ccc3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -477,7 +477,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
 		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
 					   RESET_QUEUES, 0, 0);
 
-	if (adev->gfx.kiq.ring.sched.ready)
+	if (adev->gfx.kiq.ring.sched.ready && !amdgpu_in_reset(adev))
 		r = amdgpu_ring_test_helper(kiq_ring);
 	spin_unlock(&adev->gfx.kiq.ring_lock);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index fafbad3cf08d..9ae29023e38f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5971,16 +5971,19 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
 		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
 	}
 
-	for (i = 0; i < adev->usec_timeout; i++) {
-		if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
-			break;
-		udelay(1);
-	}
-
-	if (i >= adev->usec_timeout)
-		DRM_ERROR("failed to %s cp gfx\n", enable ? "unhalt" : "halt");
+	if (!amdgpu_in_reset(adev)) {
+		for (i = 0; i < adev->usec_timeout; i++) {
+			if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
+				break;
+			udelay(1);
+		}
 
+		if (i >= adev->usec_timeout)
+			DRM_ERROR("failed to %s cp gfx\n",
+				  enable ? "unhalt" : "halt");
+	}
 	return 0;
+
 }
 
 static int gfx_v10_0_cp_gfx_load_pfp_microcode(struct amdgpu_device *adev)
@@ -7569,8 +7572,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
 	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
 		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
 					   PREEMPT_QUEUES, 0, 0);
-
-	return amdgpu_ring_test_helper(kiq_ring);
+	if (!amdgpu_in_reset(adev))
+		return amdgpu_ring_test_helper(kiq_ring);
+	else
+		return 0;
 }
 #endif
 
@@ -7610,6 +7615,7 @@ static int gfx_v10_0_hw_fini(void *handle)
 
 		return 0;
 	}
+
 	gfx_v10_0_cp_enable(adev, false);
 	gfx_v10_0_enable_gui_idle_interrupt(adev, false);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level
  2022-07-22  7:34 ` [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level Victor Zhao
@ 2022-07-22  8:19   ` Christian König
  2022-07-25 10:45     ` Zhao, Victor
  0 siblings, 1 reply; 20+ messages in thread
From: Christian König @ 2022-07-22  8:19 UTC (permalink / raw)
  To: Victor Zhao, amd-gfx; +Cc: Alexander.Deucher, emily.deng

Well NAK to the debugfs approach, stuff like that is usually a module 
parameter.

Apart from that this series needs to be reviewed by Andrey.

Regards,
Christian.

Am 22.07.22 um 09:34 schrieb Victor Zhao:
> Introduce amdgpu_reset_level debugfs in order to help debug and
> test specific type of reset. Also helps blocking unwanted type of
> resets.
>
> By default, mode2 reset will not be enabled
>
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 20 ++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c     |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c   |  6 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    |  3 +++
>   5 files changed, 34 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 6cd1e0a6dffc..c661231a6a07 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -238,6 +238,7 @@ extern int amdgpu_si_support;
>   extern int amdgpu_cik_support;
>   #endif
>   extern int amdgpu_num_kcq;
> +extern uint amdgpu_reset_level_mask;
>   
>   #define AMDGPU_VCNFW_LOG_SIZE (32 * 1024)
>   extern int amdgpu_vcnfw_log;
> @@ -274,6 +275,9 @@ extern int amdgpu_vcnfw_log;
>   #define AMDGPU_RESET_VCE			(1 << 13)
>   #define AMDGPU_RESET_VCE1			(1 << 14)
>   
> +#define AMDGPU_RESET_LEVEL_SOFT_RECOVERY (1 << 0)
> +#define AMDGPU_RESET_LEVEL_MODE2 (1 << 1)
> +
>   /* max cursor sizes (in pixels) */
>   #define CIK_CURSOR_WIDTH 128
>   #define CIK_CURSOR_HEIGHT 128
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index e2eec985adb3..235c48e4ba4d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1661,12 +1661,29 @@ static int amdgpu_debugfs_sclk_set(void *data, u64 val)
>   	return ret;
>   }
>   
> +static int amdgpu_debugfs_reset_level_get(void *data, u64 *val)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)data;
> +	*val = amdgpu_reset_level_mask;
> +	return 0;
> +}
> +
> +static int amdgpu_debugfs_reset_level_set(void *data, u64 val)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)data;
> +	amdgpu_reset_level_mask = val;
> +	return 0;
> +}
> +
>   DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>   			amdgpu_debugfs_ib_preempt, "%llu\n");
>   
>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>   			amdgpu_debugfs_sclk_set, "%llu\n");
>   
> +DEFINE_DEBUGFS_ATTRIBUTE(fops_reset_level, amdgpu_debugfs_reset_level_get,
> +			amdgpu_debugfs_reset_level_set, "%llu\n");
> +
>   static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>   				char __user *buf, size_t size, loff_t *pos)
>   {
> @@ -1785,6 +1802,9 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   		return PTR_ERR(ent);
>   	}
>   
> +	debugfs_create_file("amdgpu_reset_level", 0200, root, adev,
> +				  &fops_reset_level);
> +
>   	/* Register debugfs entries for amdgpu_ttm */
>   	amdgpu_ttm_debugfs_init(adev);
>   	amdgpu_debugfs_pm_init(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index e8c6c3fe9374..fb8f3cb853a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -198,6 +198,7 @@ struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
>   	.timeout_fatal_disable = false,
>   	.period = 0x0, /* default to 0x0 (timeout disable) */
>   };
> +uint amdgpu_reset_level_mask = 0x1;
>   
>   /**
>    * DOC: vramlimit (int)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 831fb222139c..f16ab1a54b70 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -74,6 +74,9 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
>   {
>   	struct amdgpu_reset_handler *reset_handler = NULL;
>   
> +	if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
> +		return -ENOSYS;
> +
>   	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
>   		return -ENOSYS;
>   
> @@ -93,6 +96,9 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
>   	int ret;
>   	struct amdgpu_reset_handler *reset_handler = NULL;
>   
> +	if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
> +		return -ENOSYS;
> +
>   	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
>   		return -ENOSYS;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index d3558c34d406..1ffdc050a077 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -405,6 +405,9 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>   {
>   	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
>   
> +	if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_SOFT_RECOVERY))
> +		return false;
> +
>   	if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence)
>   		return false;
>   


^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level
  2022-07-22  8:19   ` Christian König
@ 2022-07-25 10:45     ` Zhao, Victor
  2022-07-25 17:37       ` Christian König
  0 siblings, 1 reply; 20+ messages in thread
From: Zhao, Victor @ 2022-07-25 10:45 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx, Grodzovsky,  Andrey
  Cc: Deucher, Alexander, Deng, Emily

[AMD Official Use Only - General]

Hi @Grodzovsky, Andrey,

Please help review the series, thanks a lot.

Hi @Koenig, Christian,

I thought a module parameter will be exposed to a common user, this control was added to help debug and test so I put it as a debugfs. I can make it module parameter if more appropriate.


Thanks,
Victor



-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Friday, July 22, 2022 4:20 PM
To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deng, Emily <Emily.Deng@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level

Well NAK to the debugfs approach, stuff like that is usually a module parameter.

Apart from that this series needs to be reviewed by Andrey.

Regards,
Christian.

Am 22.07.22 um 09:34 schrieb Victor Zhao:
> Introduce amdgpu_reset_level debugfs in order to help debug and test 
> specific type of reset. Also helps blocking unwanted type of resets.
>
> By default, mode2 reset will not be enabled
>
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 20 ++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c     |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c   |  6 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    |  3 +++
>   5 files changed, 34 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 6cd1e0a6dffc..c661231a6a07 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -238,6 +238,7 @@ extern int amdgpu_si_support;
>   extern int amdgpu_cik_support;
>   #endif
>   extern int amdgpu_num_kcq;
> +extern uint amdgpu_reset_level_mask;
>   
>   #define AMDGPU_VCNFW_LOG_SIZE (32 * 1024)
>   extern int amdgpu_vcnfw_log;
> @@ -274,6 +275,9 @@ extern int amdgpu_vcnfw_log;
>   #define AMDGPU_RESET_VCE			(1 << 13)
>   #define AMDGPU_RESET_VCE1			(1 << 14)
>   
> +#define AMDGPU_RESET_LEVEL_SOFT_RECOVERY (1 << 0) #define 
> +AMDGPU_RESET_LEVEL_MODE2 (1 << 1)
> +
>   /* max cursor sizes (in pixels) */
>   #define CIK_CURSOR_WIDTH 128
>   #define CIK_CURSOR_HEIGHT 128
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index e2eec985adb3..235c48e4ba4d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1661,12 +1661,29 @@ static int amdgpu_debugfs_sclk_set(void *data, u64 val)
>   	return ret;
>   }
>   
> +static int amdgpu_debugfs_reset_level_get(void *data, u64 *val) {
> +	struct amdgpu_device *adev = (struct amdgpu_device *)data;
> +	*val = amdgpu_reset_level_mask;
> +	return 0;
> +}
> +
> +static int amdgpu_debugfs_reset_level_set(void *data, u64 val) {
> +	struct amdgpu_device *adev = (struct amdgpu_device *)data;
> +	amdgpu_reset_level_mask = val;
> +	return 0;
> +}
> +
>   DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>   			amdgpu_debugfs_ib_preempt, "%llu\n");
>   
>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>   			amdgpu_debugfs_sclk_set, "%llu\n");
>   
> +DEFINE_DEBUGFS_ATTRIBUTE(fops_reset_level, amdgpu_debugfs_reset_level_get,
> +			amdgpu_debugfs_reset_level_set, "%llu\n");
> +
>   static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>   				char __user *buf, size_t size, loff_t *pos)
>   {
> @@ -1785,6 +1802,9 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   		return PTR_ERR(ent);
>   	}
>   
> +	debugfs_create_file("amdgpu_reset_level", 0200, root, adev,
> +				  &fops_reset_level);
> +
>   	/* Register debugfs entries for amdgpu_ttm */
>   	amdgpu_ttm_debugfs_init(adev);
>   	amdgpu_debugfs_pm_init(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index e8c6c3fe9374..fb8f3cb853a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -198,6 +198,7 @@ struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
>   	.timeout_fatal_disable = false,
>   	.period = 0x0, /* default to 0x0 (timeout disable) */
>   };
> +uint amdgpu_reset_level_mask = 0x1;
>   
>   /**
>    * DOC: vramlimit (int)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 831fb222139c..f16ab1a54b70 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -74,6 +74,9 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
>   {
>   	struct amdgpu_reset_handler *reset_handler = NULL;
>   
> +	if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
> +		return -ENOSYS;
> +
>   	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
>   		return -ENOSYS;
>   
> @@ -93,6 +96,9 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
>   	int ret;
>   	struct amdgpu_reset_handler *reset_handler = NULL;
>   
> +	if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
> +		return -ENOSYS;
> +
>   	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
>   		return -ENOSYS;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index d3558c34d406..1ffdc050a077 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -405,6 +405,9 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>   {
>   	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
>   
> +	if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_SOFT_RECOVERY))
> +		return false;
> +
>   	if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence)
>   		return false;
>   

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level
  2022-07-25 10:45     ` Zhao, Victor
@ 2022-07-25 17:37       ` Christian König
  2022-07-25 21:07         ` Andrey Grodzovsky
  0 siblings, 1 reply; 20+ messages in thread
From: Christian König @ 2022-07-25 17:37 UTC (permalink / raw)
  To: Zhao, Victor, amd-gfx, Grodzovsky, Andrey; +Cc: Deucher, Alexander, Deng, Emily

Hi Victor,

Am 25.07.22 um 12:45 schrieb Zhao, Victor:
> [AMD Official Use Only - General]
>
> Hi @Grodzovsky, Andrey,
>
> Please help review the series, thanks a lot.
>
> Hi @Koenig, Christian,
>
> I thought a module parameter will be exposed to a common user, this control was added to help debug and test so I put it as a debugfs. I can make it module parameter if more appropriate.

That's a really good argument. I leave the decision if we should use a 
module parameter or debugfs file to Andrey.

If you go with debugfs then using the debugfs_create_u32() or 
debugfs_create_u64() function would be more appropriate I think. And 
then don't make that global, but rather a per device flag.

Regards,
Christian.

>
>
> Thanks,
> Victor
>
>
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Friday, July 22, 2022 4:20 PM
> To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deng, Emily <Emily.Deng@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level
>
> Well NAK to the debugfs approach, stuff like that is usually a module parameter.
>
> Apart from that this series needs to be reviewed by Andrey.
>
> Regards,
> Christian.
>
> Am 22.07.22 um 09:34 schrieb Victor Zhao:
>> Introduce amdgpu_reset_level debugfs in order to help debug and test
>> specific type of reset. Also helps blocking unwanted type of resets.
>>
>> By default, mode2 reset will not be enabled
>>
>> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  4 ++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 20 ++++++++++++++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c     |  1 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c   |  6 ++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    |  3 +++
>>    5 files changed, 34 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 6cd1e0a6dffc..c661231a6a07 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -238,6 +238,7 @@ extern int amdgpu_si_support;
>>    extern int amdgpu_cik_support;
>>    #endif
>>    extern int amdgpu_num_kcq;
>> +extern uint amdgpu_reset_level_mask;
>>    
>>    #define AMDGPU_VCNFW_LOG_SIZE (32 * 1024)
>>    extern int amdgpu_vcnfw_log;
>> @@ -274,6 +275,9 @@ extern int amdgpu_vcnfw_log;
>>    #define AMDGPU_RESET_VCE			(1 << 13)
>>    #define AMDGPU_RESET_VCE1			(1 << 14)
>>    
>> +#define AMDGPU_RESET_LEVEL_SOFT_RECOVERY (1 << 0) #define
>> +AMDGPU_RESET_LEVEL_MODE2 (1 << 1)
>> +
>>    /* max cursor sizes (in pixels) */
>>    #define CIK_CURSOR_WIDTH 128
>>    #define CIK_CURSOR_HEIGHT 128
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index e2eec985adb3..235c48e4ba4d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -1661,12 +1661,29 @@ static int amdgpu_debugfs_sclk_set(void *data, u64 val)
>>    	return ret;
>>    }
>>    
>> +static int amdgpu_debugfs_reset_level_get(void *data, u64 *val) {
>> +	struct amdgpu_device *adev = (struct amdgpu_device *)data;
>> +	*val = amdgpu_reset_level_mask;
>> +	return 0;
>> +}
>> +
>> +static int amdgpu_debugfs_reset_level_set(void *data, u64 val) {
>> +	struct amdgpu_device *adev = (struct amdgpu_device *)data;
>> +	amdgpu_reset_level_mask = val;
>> +	return 0;
>> +}
>> +
>>    DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>    			amdgpu_debugfs_ib_preempt, "%llu\n");
>>    
>>    DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>    			amdgpu_debugfs_sclk_set, "%llu\n");
>>    
>> +DEFINE_DEBUGFS_ATTRIBUTE(fops_reset_level, amdgpu_debugfs_reset_level_get,
>> +			amdgpu_debugfs_reset_level_set, "%llu\n");
>> +
>>    static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>    				char __user *buf, size_t size, loff_t *pos)
>>    {
>> @@ -1785,6 +1802,9 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>    		return PTR_ERR(ent);
>>    	}
>>    
>> +	debugfs_create_file("amdgpu_reset_level", 0200, root, adev,
>> +				  &fops_reset_level);
>> +
>>    	/* Register debugfs entries for amdgpu_ttm */
>>    	amdgpu_ttm_debugfs_init(adev);
>>    	amdgpu_debugfs_pm_init(adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index e8c6c3fe9374..fb8f3cb853a7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -198,6 +198,7 @@ struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
>>    	.timeout_fatal_disable = false,
>>    	.period = 0x0, /* default to 0x0 (timeout disable) */
>>    };
>> +uint amdgpu_reset_level_mask = 0x1;
>>    
>>    /**
>>     * DOC: vramlimit (int)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> index 831fb222139c..f16ab1a54b70 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> @@ -74,6 +74,9 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
>>    {
>>    	struct amdgpu_reset_handler *reset_handler = NULL;
>>    
>> +	if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
>> +		return -ENOSYS;
>> +
>>    	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
>>    		return -ENOSYS;
>>    
>> @@ -93,6 +96,9 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
>>    	int ret;
>>    	struct amdgpu_reset_handler *reset_handler = NULL;
>>    
>> +	if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
>> +		return -ENOSYS;
>> +
>>    	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
>>    		return -ENOSYS;
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index d3558c34d406..1ffdc050a077 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -405,6 +405,9 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>>    {
>>    	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
>>    
>> +	if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_SOFT_RECOVERY))
>> +		return false;
>> +
>>    	if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence)
>>    		return false;
>>    


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid
  2022-07-22  7:33 [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
                   ` (3 preceding siblings ...)
  2022-07-22  7:34 ` [PATCH 5/5] drm/amdgpu: reduce reset time Victor Zhao
@ 2022-07-25 21:05 ` Andrey Grodzovsky
  2022-07-26 10:04   ` Zhao, Victor
  4 siblings, 1 reply; 20+ messages in thread
From: Andrey Grodzovsky @ 2022-07-25 21:05 UTC (permalink / raw)
  To: Victor Zhao, amd-gfx; +Cc: Alexander.Deucher, emily.deng, Christian.Koenig


On 2022-07-22 03:33, Victor Zhao wrote:
> To meet the requirement for multi container usecase which needs
> a quicker reset and not causing VRAM lost, adding the Mode2
> reset handler for sienna_cichlid. Adding a AMDGPU_SKIP_MODE2_RESET
> flag so driver can fallback to default reset method when mode2
> reset failed and retry.
>
> - add mode2 reset handler for sienna_cichlid


Seems to me ASIC specific changes should be in a seperate patch


> - introduce AMDGPU_SKIP_MODE2_RESET flag
> - let mode2 reset fallback to default reset method if failed
>
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/Makefile           |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |   1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |   7 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |   1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |   1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c     |  13 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h     |   1 +
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |   1 +
>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c         |   1 +
>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c         |   1 +
>   drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c   | 297 ++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h   |  32 ++
>   .../pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h  |   4 +-
>   drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  |   3 +-
>   .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   |  54 ++++
>   15 files changed, 414 insertions(+), 5 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
> index c7d0cd15b5ef..7030ac2d7d2c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -75,7 +75,7 @@ amdgpu-y += \
>   	vi.o mxgpu_vi.o nbio_v6_1.o soc15.o emu_soc.o mxgpu_ai.o nbio_v7_0.o vega10_reg_init.o \
>   	vega20_reg_init.o nbio_v7_4.o nbio_v2_3.o nv.o arct_reg_init.o mxgpu_nv.o \
>   	nbio_v7_2.o hdp_v4_0.o hdp_v5_0.o aldebaran_reg_init.o aldebaran.o soc21.o \
> -	nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o lsdma_v6_0.o
> +	sienna_cichlid.o nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o lsdma_v6_0.o
>   
>   # add DF block
>   amdgpu-y += \
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 5e53a5293935..091415a4abf0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -135,6 +135,7 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
>   	reset_context.method = AMD_RESET_METHOD_NONE;
>   	reset_context.reset_req_dev = adev;
>   	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +	clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b79ee4ffb879..5498fda8617f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5146,6 +5146,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   
>   	reset_context->job = job;
>   	reset_context->hive = hive;
> +
>   	/*
>   	 * Build list of devices to reset.
>   	 * In case we are in XGMI hive mode, resort the device list
> @@ -5265,8 +5266,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   			amdgpu_ras_resume(adev);
>   	} else {
>   		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
> -		if (r && r == -EAGAIN)
> +		if (r && r == -EAGAIN) {
> +			set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags);
> +			adev->asic_reset_res = 0;


See my question bellow related to this set


>   			goto retry;
> +		}
>   	}
>   
>   skip_hw_reset:
> @@ -5694,6 +5698,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
>   	reset_context.reset_req_dev = adev;
>   	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>   	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
> +	set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   	adev->no_hw_access = true;
>   	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 10fdd12cf853..9844d99075e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -71,6 +71,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		reset_context.method = AMD_RESET_METHOD_NONE;
>   		reset_context.reset_req_dev = adev;
>   		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
>   		if (r)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 4f321375015c..259ec860890a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1949,6 +1949,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
>   		reset_context.method = AMD_RESET_METHOD_NONE;
>   		reset_context.reset_req_dev = adev;
>   		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 32c86a0b145c..831fb222139c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -23,6 +23,7 @@
>   
>   #include "amdgpu_reset.h"
>   #include "aldebaran.h"
> +#include "sienna_cichlid.h"
>   
>   int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl,
>   			     struct amdgpu_reset_handler *handler)
> @@ -40,6 +41,9 @@ int amdgpu_reset_init(struct amdgpu_device *adev)
>   	case IP_VERSION(13, 0, 2):
>   		ret = aldebaran_reset_init(adev);
>   		break;
> +	case IP_VERSION(11, 0, 7):
> +		ret = sienna_cichlid_reset_init(adev);
> +		break;
>   	default:
>   		break;
>   	}
> @@ -55,6 +59,9 @@ int amdgpu_reset_fini(struct amdgpu_device *adev)
>   	case IP_VERSION(13, 0, 2):
>   		ret = aldebaran_reset_fini(adev);
>   		break;
> +	case IP_VERSION(11, 0, 7):
> +		ret = sienna_cichlid_reset_fini(adev);
> +		break;
>   	default:
>   		break;
>   	}
> @@ -67,6 +74,9 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
>   {
>   	struct amdgpu_reset_handler *reset_handler = NULL;
>   
> +	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
> +		return -ENOSYS;
> +
>   	if (adev->reset_cntl && adev->reset_cntl->get_reset_handler)
>   		reset_handler = adev->reset_cntl->get_reset_handler(
>   			adev->reset_cntl, reset_context);
> @@ -83,6 +93,9 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
>   	int ret;
>   	struct amdgpu_reset_handler *reset_handler = NULL;
>   
> +	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
> +		return -ENOSYS;
> +
>   	if (adev->reset_cntl)
>   		reset_handler = adev->reset_cntl->get_reset_handler(
>   			adev->reset_cntl, reset_context);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 9e55a5d7a825..cc4b2eeb24cf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -30,6 +30,7 @@ enum AMDGPU_RESET_FLAGS {
>   
>   	AMDGPU_NEED_FULL_RESET = 0,
>   	AMDGPU_SKIP_HW_RESET = 1,
> +	AMDGPU_SKIP_MODE2_RESET = 2,
>   };
>   
>   struct amdgpu_reset_context {
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 12906ba74462..a2f04b249132 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -290,6 +290,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   		reset_context.method = AMD_RESET_METHOD_NONE;
>   		reset_context.reset_req_dev = adev;
>   		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index e07757eea7ad..a977f0027928 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -317,6 +317,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>   		reset_context.method = AMD_RESET_METHOD_NONE;
>   		reset_context.reset_req_dev = adev;
>   		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index 288c414babdf..fd14fa9b9cd7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>   		reset_context.method = AMD_RESET_METHOD_NONE;
>   		reset_context.reset_req_dev = adev;
>   		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> new file mode 100644
> index 000000000000..0512960bed23
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> @@ -0,0 +1,297 @@
> +/*
> + * Copyright 2021 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#include "sienna_cichlid.h"
> +#include "amdgpu_reset.h"
> +#include "amdgpu_amdkfd.h"
> +#include "amdgpu_dpm.h"
> +#include "amdgpu_job.h"
> +#include "amdgpu_ring.h"
> +#include "amdgpu_ras.h"
> +#include "amdgpu_psp.h"
> +#include "amdgpu_xgmi.h"
> +
> +static struct amdgpu_reset_handler *
> +sienna_cichlid_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
> +			    struct amdgpu_reset_context *reset_context)
> +{
> +	struct amdgpu_reset_handler *handler;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
> +
> +	if (reset_context->method != AMD_RESET_METHOD_NONE) {
> +		list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +				     handler_list) {
> +			if (handler->reset_method == reset_context->method)
> +				return handler;
> +		}
> +	} else {
> +		list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +				     handler_list) {
> +			if (handler->reset_method == AMD_RESET_METHOD_MODE2 &&
> +			    adev->pm.fw_version >= 0x3a5500 &&
> +			    !amdgpu_sriov_vf(adev)) {
> +				reset_context->method = AMD_RESET_METHOD_MODE2;
> +				return handler;
> +			}
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +static int sienna_cichlid_mode2_suspend_ip(struct amdgpu_device *adev)
> +{
> +	int r, i;
> +
> +	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
> +	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
> +
> +	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +
> +		r = adev->ip_blocks[i].version->funcs->suspend(adev);
> +
> +		if (r) {
> +			dev_err(adev->dev,
> +				"suspend of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +		adev->ip_blocks[i].status.hw = false;
> +	}
> +
> +	return r;
> +}
> +
> +static int
> +sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
> +				  struct amdgpu_reset_context *reset_context)
> +{
> +	int r = 0;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
> +
> +	if (!amdgpu_sriov_vf(adev))
> +		r = sienna_cichlid_mode2_suspend_ip(adev);
> +
> +	return r;
> +}
> +
> +static void sienna_cichlid_async_reset(struct work_struct *work)
> +{
> +	struct amdgpu_reset_handler *handler;
> +	struct amdgpu_reset_control *reset_ctl =
> +		container_of(work, struct amdgpu_reset_control, reset_work);
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
> +
> +	list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +			     handler_list) {
> +		if (handler->reset_method == reset_ctl->active_reset) {
> +			dev_dbg(adev->dev, "Resetting device\n");
> +			handler->do_reset(adev);
> +			break;
> +		}
> +	}
> +}
> +
> +static int sienna_cichlid_mode2_reset(struct amdgpu_device *adev)
> +{
> +	/* disable BM */
> +	pci_clear_master(adev->pdev);
> +	adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev);


Why do you need to set adev->asic_reset_res here and
not just return back result ?


> +	return adev->asic_reset_res;
> +}
> +
> +static int
> +sienna_cichlid_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
> +			      struct amdgpu_reset_context *reset_context)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
> +	int r;
> +
> +	r = sienna_cichlid_mode2_reset(adev);
> +	if (r) {
> +		dev_err(adev->dev,
> +			"ASIC reset failed with error, %d ", r);
> +	}
> +	return r;
> +}
> +
> +static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device *adev)
> +{
> +	int i, r;
> +	struct psp_context *psp = &adev->psp;
> +
> +	r = psp_rlc_autoload_start(psp);
> +	if (r) {
> +		dev_err(adev->dev, "Failed to start rlc autoload\n");
> +		return r;
> +	}
> +
> +	/* Reinit GFXHUB */
> +	adev->gfxhub.funcs->init(adev);
> +	r = adev->gfxhub.funcs->gart_enable(adev);
> +	if (r) {
> +		dev_err(adev->dev, "GFXHUB gart reenable failed after reset\n");
> +		return r;
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
> +			r = adev->ip_blocks[i].version->funcs->resume(adev);
> +		if (r) {
> +			dev_err(adev->dev,
> +				"resume of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +
> +		adev->ip_blocks[i].status.hw = true;
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +		r = adev->ip_blocks[i].version->funcs->resume(adev);
> +		if (r) {
> +			dev_err(adev->dev,
> +				"resume of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +
> +		adev->ip_blocks[i].status.hw = true;
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +
> +		if (adev->ip_blocks[i].version->funcs->late_init) {
> +			r = adev->ip_blocks[i].version->funcs->late_init(
> +				(void *)adev);
> +			if (r) {
> +				dev_err(adev->dev,
> +					"late_init of IP block <%s> failed %d after reset\n",
> +					adev->ip_blocks[i].version->funcs->name,
> +					r);
> +				return r;
> +			}
> +		}
> +		adev->ip_blocks[i].status.late_initialized = true;
> +	}
> +
> +	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
> +	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
> +
> +	return r;
> +}
> +
> +static int
> +sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
> +				  struct amdgpu_reset_context *reset_context)
> +{
> +	int r;
> +	struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl->handle;
> +
> +	dev_info(tmp_adev->dev,
> +			"GPU reset succeeded, trying to resume\n");
> +	r = sienna_cichlid_mode2_restore_ip(tmp_adev);
> +	if (r)
> +		goto end;
> +
> +	/*
> +	* Add this ASIC as tracked as reset was already
> +	* complete successfully.
> +	*/
> +	amdgpu_register_gpu_instance(tmp_adev);
> +
> +	/* Resume RAS */
> +	amdgpu_ras_resume(tmp_adev);
> +
> +	amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
> +
> +	r = amdgpu_ib_ring_tests(tmp_adev);
> +	if (r) {
> +		dev_err(tmp_adev->dev,
> +			"ib ring test failed (%d).\n", r);
> +		r = -EAGAIN;
> +		tmp_adev->asic_reset_res = r;
> +		goto end;
> +	}
> +
> +end:
> +	if (r)
> +		return -EAGAIN;
> +	else
> +		return r;
> +}
> +
> +static struct amdgpu_reset_handler sienna_cichlid_mode2_handler = {
> +	.reset_method		= AMD_RESET_METHOD_MODE2,
> +	.prepare_env		= NULL,
> +	.prepare_hwcontext	= sienna_cichlid_mode2_prepare_hwcontext,
> +	.perform_reset		= sienna_cichlid_mode2_perform_reset,
> +	.restore_hwcontext	= sienna_cichlid_mode2_restore_hwcontext,
> +	.restore_env		= NULL,
> +	.do_reset		= sienna_cichlid_mode2_reset,
> +};
> +
> +int sienna_cichlid_reset_init(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_reset_control *reset_ctl;
> +
> +	reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);
> +	if (!reset_ctl)
> +		return -ENOMEM;
> +
> +	reset_ctl->handle = adev;
> +	reset_ctl->async_reset = sienna_cichlid_async_reset;
> +	reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
> +	reset_ctl->get_reset_handler = sienna_cichlid_get_reset_handler;
> +
> +	INIT_LIST_HEAD(&reset_ctl->reset_handlers);
> +	INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);
> +	/* Only mode2 is handled through reset control now */
> +	amdgpu_reset_add_handler(reset_ctl, &sienna_cichlid_mode2_handler);
> +
> +	adev->reset_cntl = reset_ctl;
> +
> +	return 0;
> +}
> +
> +int sienna_cichlid_reset_fini(struct amdgpu_device *adev)
> +{
> +	kfree(adev->reset_cntl);
> +	adev->reset_cntl = NULL;
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> new file mode 100644
> index 000000000000..5213b162dacd
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> @@ -0,0 +1,32 @@
> +/*
> + * Copyright 2021 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __SIENNA_CICHLID_H__
> +#define __SIENNA_CICHLID_H__
> +
> +#include "amdgpu.h"
> +
> +int sienna_cichlid_reset_init(struct amdgpu_device *adev);
> +int sienna_cichlid_reset_fini(struct amdgpu_device *adev);
> +
> +#endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> index d2e10a724560..82cf9e563065 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> @@ -137,7 +137,7 @@
>   #define PPSMC_MSG_DisallowGpo                    0x56
>   
>   #define PPSMC_MSG_Enable2ndUSB20Port             0x57
> -
> -#define PPSMC_Message_Count                      0x58
> +#define PPSMC_MSG_DriverMode2Reset               0x5D
> +#define PPSMC_Message_Count                      0x5E
>   
>   #endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> index 19084a4fcb2b..28f6a1eb6945 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> @@ -235,7 +235,8 @@
>   	__SMU_DUMMY_MAP(UnforceGfxVid),           \
>   	__SMU_DUMMY_MAP(HeavySBR),			\
>   	__SMU_DUMMY_MAP(SetBadHBMPagesRetiredFlagsPerChannel), \
> -	__SMU_DUMMY_MAP(EnableGfxImu),
> +	__SMU_DUMMY_MAP(EnableGfxImu), \
> +	__SMU_DUMMY_MAP(DriverMode2Reset),
>   
>   #undef __SMU_DUMMY_MAP
>   #define __SMU_DUMMY_MAP(type)	SMU_MSG_##type
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> index fa520d79ef67..a73d241bb64f 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> @@ -154,6 +154,7 @@ static struct cmn2asic_msg_mapping sienna_cichlid_message_map[SMU_MSG_MAX_COUNT]
>   	MSG_MAP(SetGpoFeaturePMask,		PPSMC_MSG_SetGpoFeaturePMask,          0),
>   	MSG_MAP(DisallowGpo,			PPSMC_MSG_DisallowGpo,                 0),
>   	MSG_MAP(Enable2ndUSB20Port,		PPSMC_MSG_Enable2ndUSB20Port,          0),
> +	MSG_MAP(DriverMode2Reset,		PPSMC_MSG_DriverMode2Reset,	       0),
>   };
>   
>   static struct cmn2asic_mapping sienna_cichlid_clk_map[SMU_CLK_COUNT] = {
> @@ -4254,6 +4255,57 @@ static int sienna_cichlid_stb_get_data_direct(struct smu_context *smu,
>   	return 0;
>   }
>   
> +static bool sienna_cichlid_is_mode2_reset_supported(struct smu_context *smu)
> +{
> +	return true;
> +}
> +
> +static int sienna_cichlid_mode2_reset(struct smu_context *smu)
> +{
> +	u32 smu_version;
> +	int ret = 0, index;
> +	struct amdgpu_device *adev = smu->adev;
> +	int timeout = 100;
> +
> +	smu_cmn_get_smc_version(smu, NULL, &smu_version);
> +
> +	index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
> +						SMU_MSG_DriverMode2Reset);
> +
> +	mutex_lock(&smu->message_lock);
> +
> +	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
> +					       SMU_RESET_MODE_2);
> +
> +	ret = smu_cmn_wait_for_response(smu);
> +	while (ret != 0 && timeout) {
> +		ret = smu_cmn_wait_for_response(smu);
> +		/* Wait a bit more time for getting ACK */
> +		if (ret != 0) {
> +			--timeout;
> +			usleep_range(500, 1000);
> +			continue;
> +		} else {
> +			break;
> +		}
> +	}
> +
> +	if (!timeout) {
> +		dev_err(adev->dev,
> +			"failed to send mode2 message \tparam: 0x%08x response %#x\n",
> +			SMU_RESET_MODE_2, ret);
> +		goto out;
> +	}
> +
> +	dev_info(smu->adev->dev, "restore config space...\n");
> +	/* Restore the config space saved during init */
> +	amdgpu_device_load_pci_state(adev->pdev);
> +out:
> +	mutex_unlock(&smu->message_lock);
> +
> +	return ret;
> +}
> +
>   static const struct pptable_funcs sienna_cichlid_ppt_funcs = {
>   	.get_allowed_feature_mask = sienna_cichlid_get_allowed_feature_mask,
>   	.set_default_dpm_table = sienna_cichlid_set_default_dpm_table,
> @@ -4348,6 +4400,8 @@ static const struct pptable_funcs sienna_cichlid_ppt_funcs = {
>   	.get_default_config_table_settings = sienna_cichlid_get_default_config_table_settings,
>   	.set_config_table = sienna_cichlid_set_config_table,
>   	.get_unique_id = sienna_cichlid_get_unique_id,
> +	.mode2_reset_is_support = sienna_cichlid_is_mode2_reset_supported,
> +	.mode2_reset = sienna_cichlid_mode2_reset,
>   };
>   
>   void sienna_cichlid_set_ppt_funcs(struct smu_context *smu)

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level
  2022-07-25 17:37       ` Christian König
@ 2022-07-25 21:07         ` Andrey Grodzovsky
  0 siblings, 0 replies; 20+ messages in thread
From: Andrey Grodzovsky @ 2022-07-25 21:07 UTC (permalink / raw)
  To: Christian König, Zhao, Victor, amd-gfx
  Cc: Deucher, Alexander, Deng, Emily


On 2022-07-25 13:37, Christian König wrote:
> Hi Victor,
>
> Am 25.07.22 um 12:45 schrieb Zhao, Victor:
>> [AMD Official Use Only - General]
>>
>> Hi @Grodzovsky, Andrey,
>>
>> Please help review the series, thanks a lot.
>>
>> Hi @Koenig, Christian,
>>
>> I thought a module parameter will be exposed to a common user, this 
>> control was added to help debug and test so I put it as a debugfs. I 
>> can make it module parameter if more appropriate.
>
> That's a really good argument. I leave the decision if we should use a 
> module parameter or debugfs file to Andrey.
>
> If you go with debugfs then using the debugfs_create_u32() or 
> debugfs_create_u64() function would be more appropriate I think. And 
> then don't make that global, but rather a per device flag.
>
> Regards,
> Christian.


Makes sense to me too.

Andrey


>
>>
>>
>> Thanks,
>> Victor
>>
>>
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Friday, July 22, 2022 4:20 PM
>> To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Deng, Emily <Emily.Deng@amd.com>; Deucher, Alexander 
>> <Alexander.Deucher@amd.com>
>> Subject: Re: [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level
>>
>> Well NAK to the debugfs approach, stuff like that is usually a module 
>> parameter.
>>
>> Apart from that this series needs to be reviewed by Andrey.
>>
>> Regards,
>> Christian.
>>
>> Am 22.07.22 um 09:34 schrieb Victor Zhao:
>>> Introduce amdgpu_reset_level debugfs in order to help debug and test
>>> specific type of reset. Also helps blocking unwanted type of resets.
>>>
>>> By default, mode2 reset will not be enabled
>>>
>>> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  4 ++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 20 
>>> ++++++++++++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c     |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c   |  6 ++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    |  3 +++
>>>    5 files changed, 34 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 6cd1e0a6dffc..c661231a6a07 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -238,6 +238,7 @@ extern int amdgpu_si_support;
>>>    extern int amdgpu_cik_support;
>>>    #endif
>>>    extern int amdgpu_num_kcq;
>>> +extern uint amdgpu_reset_level_mask;
>>>       #define AMDGPU_VCNFW_LOG_SIZE (32 * 1024)
>>>    extern int amdgpu_vcnfw_log;
>>> @@ -274,6 +275,9 @@ extern int amdgpu_vcnfw_log;
>>>    #define AMDGPU_RESET_VCE            (1 << 13)
>>>    #define AMDGPU_RESET_VCE1            (1 << 14)
>>>    +#define AMDGPU_RESET_LEVEL_SOFT_RECOVERY (1 << 0) #define
>>> +AMDGPU_RESET_LEVEL_MODE2 (1 << 1)
>>> +
>>>    /* max cursor sizes (in pixels) */
>>>    #define CIK_CURSOR_WIDTH 128
>>>    #define CIK_CURSOR_HEIGHT 128
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index e2eec985adb3..235c48e4ba4d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -1661,12 +1661,29 @@ static int amdgpu_debugfs_sclk_set(void 
>>> *data, u64 val)
>>>        return ret;
>>>    }
>>>    +static int amdgpu_debugfs_reset_level_get(void *data, u64 *val) {
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)data;
>>> +    *val = amdgpu_reset_level_mask;
>>> +    return 0;
>>> +}
>>> +
>>> +static int amdgpu_debugfs_reset_level_set(void *data, u64 val) {
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)data;
>>> +    amdgpu_reset_level_mask = val;
>>> +    return 0;
>>> +}
>>> +
>>>    DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>                amdgpu_debugfs_ib_preempt, "%llu\n");
>>>       DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>                amdgpu_debugfs_sclk_set, "%llu\n");
>>>    +DEFINE_DEBUGFS_ATTRIBUTE(fops_reset_level, 
>>> amdgpu_debugfs_reset_level_get,
>>> +            amdgpu_debugfs_reset_level_set, "%llu\n");
>>> +
>>>    static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>>                    char __user *buf, size_t size, loff_t *pos)
>>>    {
>>> @@ -1785,6 +1802,9 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>> *adev)
>>>            return PTR_ERR(ent);
>>>        }
>>>    +    debugfs_create_file("amdgpu_reset_level", 0200, root, adev,
>>> +                  &fops_reset_level);
>>> +
>>>        /* Register debugfs entries for amdgpu_ttm */
>>>        amdgpu_ttm_debugfs_init(adev);
>>>        amdgpu_debugfs_pm_init(adev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index e8c6c3fe9374..fb8f3cb853a7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -198,6 +198,7 @@ struct amdgpu_watchdog_timer 
>>> amdgpu_watchdog_timer = {
>>>        .timeout_fatal_disable = false,
>>>        .period = 0x0, /* default to 0x0 (timeout disable) */
>>>    };
>>> +uint amdgpu_reset_level_mask = 0x1;
>>>       /**
>>>     * DOC: vramlimit (int)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> index 831fb222139c..f16ab1a54b70 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> @@ -74,6 +74,9 @@ int amdgpu_reset_prepare_hwcontext(struct 
>>> amdgpu_device *adev,
>>>    {
>>>        struct amdgpu_reset_handler *reset_handler = NULL;
>>>    +    if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
>>> +        return -ENOSYS;
>>> +
>>>        if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
>>>            return -ENOSYS;
>>>    @@ -93,6 +96,9 @@ int amdgpu_reset_perform_reset(struct 
>>> amdgpu_device *adev,
>>>        int ret;
>>>        struct amdgpu_reset_handler *reset_handler = NULL;
>>>    +    if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
>>> +        return -ENOSYS;
>>> +
>>>        if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
>>>            return -ENOSYS;
>>>    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> index d3558c34d406..1ffdc050a077 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> @@ -405,6 +405,9 @@ bool amdgpu_ring_soft_recovery(struct 
>>> amdgpu_ring *ring, unsigned int vmid,
>>>    {
>>>        ktime_t deadline = ktime_add_us(ktime_get(), 10000);
>>>    +    if (!(amdgpu_reset_level_mask & 
>>> AMDGPU_RESET_LEVEL_SOFT_RECOVERY))
>>> +        return false;
>>> +
>>>        if (amdgpu_sriov_vf(ring->adev) || 
>>> !ring->funcs->soft_recovery || !fence)
>>>            return false;
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 5/5] drm/amdgpu: reduce reset time
  2022-07-22  7:34 ` [PATCH 5/5] drm/amdgpu: reduce reset time Victor Zhao
@ 2022-07-25 21:17   ` Andrey Grodzovsky
  2022-07-26  9:40     ` Zhao, Victor
  0 siblings, 1 reply; 20+ messages in thread
From: Andrey Grodzovsky @ 2022-07-25 21:17 UTC (permalink / raw)
  To: Victor Zhao, amd-gfx; +Cc: Alexander.Deucher, emily.deng, Christian.Koenig


On 2022-07-22 03:34, Victor Zhao wrote:
> In multi container use case, reset time is important, so skip ring
> tests and cp halt wait during ip suspending for reset as they are
> going to fail and cost more time on reset


Why are they failing in this case ? Skipping ring tests is not the best 
idea as you
loose important indicator of system's sanity. Is there any way to make 
them work ?


>
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  2 +-
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 26 +++++++++++++++----------
>   2 files changed, 17 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 222d3d7ea076..f872495ccc3a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -477,7 +477,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
>   		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
>   					   RESET_QUEUES, 0, 0);
>   
> -	if (adev->gfx.kiq.ring.sched.ready)
> +	if (adev->gfx.kiq.ring.sched.ready && !amdgpu_in_reset(adev))
>   		r = amdgpu_ring_test_helper(kiq_ring);
>   	spin_unlock(&adev->gfx.kiq.ring_lock);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index fafbad3cf08d..9ae29023e38f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5971,16 +5971,19 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
>   		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
>   	}
>   
> -	for (i = 0; i < adev->usec_timeout; i++) {
> -		if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
> -			break;
> -		udelay(1);
> -	}
> -
> -	if (i >= adev->usec_timeout)
> -		DRM_ERROR("failed to %s cp gfx\n", enable ? "unhalt" : "halt");
> +	if (!amdgpu_in_reset(adev)) {
> +		for (i = 0; i < adev->usec_timeout; i++) {
> +			if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
> +				break;
> +			udelay(1);
> +		}
>   
> +		if (i >= adev->usec_timeout)
> +			DRM_ERROR("failed to %s cp gfx\n",
> +				  enable ? "unhalt" : "halt");
> +	}
>   	return 0;
> +
>   }


This change has impact beyond container case no ? We had no issue
with this code during regular reset cases so why we would give up on 
this code
which confirms CP is idle ? What is the side effect of skipping this 
during all GPU resets ?

Andrey


>   
>   static int gfx_v10_0_cp_gfx_load_pfp_microcode(struct amdgpu_device *adev)
> @@ -7569,8 +7572,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
>   	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>   		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
>   					   PREEMPT_QUEUES, 0, 0);
> -
> -	return amdgpu_ring_test_helper(kiq_ring);
> +	if (!amdgpu_in_reset(adev))
> +		return amdgpu_ring_test_helper(kiq_ring);
> +	else
> +		return 0;
>   }
>   #endif
>   
> @@ -7610,6 +7615,7 @@ static int gfx_v10_0_hw_fini(void *handle)
>   
>   		return 0;
>   	}
> +
>   	gfx_v10_0_cp_enable(adev, false);
>   	gfx_v10_0_enable_gui_idle_interrupt(adev, false);
>   

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/5] drm/amdgpu: save and restore gc hub regs
  2022-07-22  7:34 ` [PATCH 3/5] drm/amdgpu: save and restore gc hub regs Victor Zhao
@ 2022-07-25 21:18   ` Andrey Grodzovsky
  0 siblings, 0 replies; 20+ messages in thread
From: Andrey Grodzovsky @ 2022-07-25 21:18 UTC (permalink / raw)
  To: Victor Zhao, amd-gfx; +Cc: Alexander.Deucher, emily.deng, Christian.Koenig

Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

Andrey

On 2022-07-22 03:34, Victor Zhao wrote:
> Save and restore gfxhub regs as they will be reset during mode 2
>
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h    |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h       | 26 +++++++
>   drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c      | 72 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c   |  7 +-
>   .../include/asic_reg/gc/gc_10_3_0_offset.h    |  4 ++
>   5 files changed, 110 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
> index beabab515836..f8036f2b100e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
> @@ -35,6 +35,8 @@ struct amdgpu_gfxhub_funcs {
>   	void (*init)(struct amdgpu_device *adev);
>   	int (*get_xgmi_info)(struct amdgpu_device *adev);
>   	void (*utcl2_harvest)(struct amdgpu_device *adev);
> +	void (*mode2_save_regs)(struct amdgpu_device *adev);
> +	void (*mode2_restore_regs)(struct amdgpu_device *adev);
>   };
>   
>   struct amdgpu_gfxhub {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 008eaca27151..0305b660cd17 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -264,6 +264,32 @@ struct amdgpu_gmc {
>   	u64 mall_size;
>   	/* number of UMC instances */
>   	int num_umc;
> +	/* mode2 save restore */
> +	u64 VM_L2_CNTL;
> +	u64 VM_L2_CNTL2;
> +	u64 VM_DUMMY_PAGE_FAULT_CNTL;
> +	u64 VM_DUMMY_PAGE_FAULT_ADDR_LO32;
> +	u64 VM_DUMMY_PAGE_FAULT_ADDR_HI32;
> +	u64 VM_L2_PROTECTION_FAULT_CNTL;
> +	u64 VM_L2_PROTECTION_FAULT_CNTL2;
> +	u64 VM_L2_PROTECTION_FAULT_MM_CNTL3;
> +	u64 VM_L2_PROTECTION_FAULT_MM_CNTL4;
> +	u64 VM_L2_PROTECTION_FAULT_ADDR_LO32;
> +	u64 VM_L2_PROTECTION_FAULT_ADDR_HI32;
> +	u64 VM_DEBUG;
> +	u64 VM_L2_MM_GROUP_RT_CLASSES;
> +	u64 VM_L2_BANK_SELECT_RESERVED_CID;
> +	u64 VM_L2_BANK_SELECT_RESERVED_CID2;
> +	u64 VM_L2_CACHE_PARITY_CNTL;
> +	u64 VM_L2_IH_LOG_CNTL;
> +	u64 VM_CONTEXT_CNTL[16];
> +	u64 VM_CONTEXT_PAGE_TABLE_BASE_ADDR_LO32[16];
> +	u64 VM_CONTEXT_PAGE_TABLE_BASE_ADDR_HI32[16];
> +	u64 VM_CONTEXT_PAGE_TABLE_START_ADDR_LO32[16];
> +	u64 VM_CONTEXT_PAGE_TABLE_START_ADDR_HI32[16];
> +	u64 VM_CONTEXT_PAGE_TABLE_END_ADDR_LO32[16];
> +	u64 VM_CONTEXT_PAGE_TABLE_END_ADDR_HI32[16];
> +	u64 MC_VM_MX_L1_TLB_CNTL;
>   };
>   
>   #define amdgpu_gmc_flush_gpu_tlb(adev, vmid, vmhub, type) ((adev)->gmc.gmc_funcs->flush_gpu_tlb((adev), (vmid), (vmhub), (type)))
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
> index d8c531581116..51cf8acd2d79 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
> @@ -576,6 +576,76 @@ static void gfxhub_v2_1_utcl2_harvest(struct amdgpu_device *adev)
>   	}
>   }
>   
> +static void gfxhub_v2_1_save_regs(struct amdgpu_device *adev)
> +{
> +	int i;
> +	adev->gmc.VM_L2_CNTL = RREG32_SOC15(GC, 0, mmGCVM_L2_CNTL);
> +	adev->gmc.VM_L2_CNTL2 = RREG32_SOC15(GC, 0, mmGCVM_L2_CNTL2);
> +	adev->gmc.VM_DUMMY_PAGE_FAULT_CNTL = RREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_CNTL);
> +	adev->gmc.VM_DUMMY_PAGE_FAULT_ADDR_LO32 = RREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_ADDR_LO32);
> +	adev->gmc.VM_DUMMY_PAGE_FAULT_ADDR_HI32 = RREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_ADDR_HI32);
> +	adev->gmc.VM_L2_PROTECTION_FAULT_CNTL = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL);
> +	adev->gmc.VM_L2_PROTECTION_FAULT_CNTL2 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL2);
> +	adev->gmc.VM_L2_PROTECTION_FAULT_MM_CNTL3 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_MM_CNTL3);
> +	adev->gmc.VM_L2_PROTECTION_FAULT_MM_CNTL4 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_MM_CNTL4);
> +	adev->gmc.VM_L2_PROTECTION_FAULT_ADDR_LO32 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_ADDR_LO32);
> +	adev->gmc.VM_L2_PROTECTION_FAULT_ADDR_HI32 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_ADDR_HI32);
> +	adev->gmc.VM_DEBUG = RREG32_SOC15(GC, 0, mmGCVM_DEBUG);
> +	adev->gmc.VM_L2_MM_GROUP_RT_CLASSES = RREG32_SOC15(GC, 0, mmGCVM_L2_MM_GROUP_RT_CLASSES);
> +	adev->gmc.VM_L2_BANK_SELECT_RESERVED_CID = RREG32_SOC15(GC, 0, mmGCVM_L2_BANK_SELECT_RESERVED_CID);
> +	adev->gmc.VM_L2_BANK_SELECT_RESERVED_CID2 = RREG32_SOC15(GC, 0, mmGCVM_L2_BANK_SELECT_RESERVED_CID2);
> +	adev->gmc.VM_L2_CACHE_PARITY_CNTL = RREG32_SOC15(GC, 0, mmGCVM_L2_CACHE_PARITY_CNTL);
> +	adev->gmc.VM_L2_IH_LOG_CNTL = RREG32_SOC15(GC, 0, mmGCVM_L2_IH_LOG_CNTL);
> +
> +	for (i = 0; i <= 15; i++) {
> +		adev->gmc.VM_CONTEXT_CNTL[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_CNTL, i);
> +		adev->gmc.VM_CONTEXT_PAGE_TABLE_BASE_ADDR_LO32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32, i * 2);
> +		adev->gmc.VM_CONTEXT_PAGE_TABLE_BASE_ADDR_HI32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32, i * 2);
> +		adev->gmc.VM_CONTEXT_PAGE_TABLE_START_ADDR_LO32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32, i * 2);
> +		adev->gmc.VM_CONTEXT_PAGE_TABLE_START_ADDR_HI32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32, i * 2);
> +		adev->gmc.VM_CONTEXT_PAGE_TABLE_END_ADDR_LO32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32, i * 2);
> +		adev->gmc.VM_CONTEXT_PAGE_TABLE_END_ADDR_HI32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32, i * 2);
> +	}
> +
> +	adev->gmc.MC_VM_MX_L1_TLB_CNTL = RREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL);
> +}
> +
> +static void gfxhub_v2_1_restore_regs(struct amdgpu_device *adev)
> +{
> +	int i;
> +	WREG32_SOC15(GC, 0, mmGCVM_L2_CNTL, adev->gmc.VM_L2_CNTL);
> +	WREG32_SOC15(GC, 0, mmGCVM_L2_CNTL2, adev->gmc.VM_L2_CNTL2);
> +	WREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_CNTL, adev->gmc.VM_DUMMY_PAGE_FAULT_CNTL);
> +	WREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_ADDR_LO32, adev->gmc.VM_DUMMY_PAGE_FAULT_ADDR_LO32);
> +	WREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_ADDR_HI32, adev->gmc.VM_DUMMY_PAGE_FAULT_ADDR_HI32);
> +	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL, adev->gmc.VM_L2_PROTECTION_FAULT_CNTL);
> +	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL2, adev->gmc.VM_L2_PROTECTION_FAULT_CNTL2);
> +	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_MM_CNTL3, adev->gmc.VM_L2_PROTECTION_FAULT_MM_CNTL3);
> +	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_MM_CNTL4, adev->gmc.VM_L2_PROTECTION_FAULT_MM_CNTL4);
> +	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_ADDR_LO32, adev->gmc.VM_L2_PROTECTION_FAULT_ADDR_LO32);
> +	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_ADDR_HI32, adev->gmc.VM_L2_PROTECTION_FAULT_ADDR_HI32);
> +	WREG32_SOC15(GC, 0, mmGCVM_DEBUG, adev->gmc.VM_DEBUG);
> +	WREG32_SOC15(GC, 0, mmGCVM_L2_MM_GROUP_RT_CLASSES, adev->gmc.VM_L2_MM_GROUP_RT_CLASSES);
> +	WREG32_SOC15(GC, 0, mmGCVM_L2_BANK_SELECT_RESERVED_CID, adev->gmc.VM_L2_BANK_SELECT_RESERVED_CID);
> +	WREG32_SOC15(GC, 0, mmGCVM_L2_BANK_SELECT_RESERVED_CID2, adev->gmc.VM_L2_BANK_SELECT_RESERVED_CID2);
> +	WREG32_SOC15(GC, 0, mmGCVM_L2_CACHE_PARITY_CNTL, adev->gmc.VM_L2_CACHE_PARITY_CNTL);
> +	WREG32_SOC15(GC, 0, mmGCVM_L2_IH_LOG_CNTL, adev->gmc.VM_L2_IH_LOG_CNTL);
> +
> +	for (i = 0; i <= 15; i++) {
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_CNTL, i, adev->gmc.VM_CONTEXT_CNTL[i]);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_BASE_ADDR_LO32[i]);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_BASE_ADDR_HI32[i]);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_START_ADDR_LO32[i]);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_START_ADDR_HI32[i]);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_END_ADDR_LO32[i]);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_END_ADDR_HI32[i]);
> +	}
> +
> +	WREG32_SOC15(GC, 0, mmGCMC_VM_FB_LOCATION_BASE, adev->gmc.vram_start >> 24);
> +	WREG32_SOC15(GC, 0, mmGCMC_VM_FB_LOCATION_TOP, adev->gmc.vram_end >> 24);
> +	WREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL, adev->gmc.MC_VM_MX_L1_TLB_CNTL);
> +}
> +
>   const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
>   	.get_fb_location = gfxhub_v2_1_get_fb_location,
>   	.get_mc_fb_offset = gfxhub_v2_1_get_mc_fb_offset,
> @@ -586,4 +656,6 @@ const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
>   	.init = gfxhub_v2_1_init,
>   	.get_xgmi_info = gfxhub_v2_1_get_xgmi_info,
>   	.utcl2_harvest = gfxhub_v2_1_utcl2_harvest,
> +	.mode2_save_regs = gfxhub_v2_1_save_regs,
> +	.mode2_restore_regs = gfxhub_v2_1_restore_regs,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> index 0512960bed23..51a5b68f77d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> @@ -94,8 +94,11 @@ sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
>   	int r = 0;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
>   
> -	if (!amdgpu_sriov_vf(adev))
> +	if (!amdgpu_sriov_vf(adev)) {
> +		if (adev->gfxhub.funcs->mode2_save_regs)
> +			adev->gfxhub.funcs->mode2_save_regs(adev);
>   		r = sienna_cichlid_mode2_suspend_ip(adev);
> +	}
>   
>   	return r;
>   }
> @@ -152,6 +155,8 @@ static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device *adev)
>   	}
>   
>   	/* Reinit GFXHUB */
> +	if (adev->gfxhub.funcs->mode2_restore_regs)
> +		adev->gfxhub.funcs->mode2_restore_regs(adev);
>   	adev->gfxhub.funcs->init(adev);
>   	r = adev->gfxhub.funcs->gart_enable(adev);
>   	if (r) {
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> index f21554a1c86c..594bffce93a9 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> @@ -3129,6 +3129,8 @@
>   #define mmGCVM_L2_PROTECTION_FAULT_DEFAULT_ADDR_LO32_BASE_IDX                                          0
>   #define mmGCVM_L2_PROTECTION_FAULT_DEFAULT_ADDR_HI32                                                   0x15cc
>   #define mmGCVM_L2_PROTECTION_FAULT_DEFAULT_ADDR_HI32_BASE_IDX                                          0
> +#define mmGCVM_DEBUG                                                                                   0x15cd
> +#define mmGCVM_DEBUG_BASE_IDX                                                                          0
>   #define mmGCVM_L2_CONTEXT1_IDENTITY_APERTURE_LOW_ADDR_LO32                                             0x15ce
>   #define mmGCVM_L2_CONTEXT1_IDENTITY_APERTURE_LOW_ADDR_LO32_BASE_IDX                                    0
>   #define mmGCVM_L2_CONTEXT1_IDENTITY_APERTURE_LOW_ADDR_HI32                                             0x15cf
> @@ -3151,6 +3153,8 @@
>   #define mmGCVM_L2_BANK_SELECT_RESERVED_CID2_BASE_IDX                                                   0
>   #define mmGCVM_L2_CACHE_PARITY_CNTL                                                                    0x15d8
>   #define mmGCVM_L2_CACHE_PARITY_CNTL_BASE_IDX                                                           0
> +#define mmGCVM_L2_IH_LOG_CNTL                                                                          0x15d9
> +#define mmGCVM_L2_IH_LOG_CNTL_BASE_IDX                                                                 0
>   #define mmGCVM_L2_CNTL5                                                                                0x15dc
>   #define mmGCVM_L2_CNTL5_BASE_IDX                                                                       0
>   #define mmGCVM_L2_GCR_CNTL                                                                             0x15dd

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset
  2022-07-22  7:34 ` [PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset Victor Zhao
@ 2022-07-25 21:19   ` Andrey Grodzovsky
  2022-07-26 10:01     ` Zhao, Victor
  0 siblings, 1 reply; 20+ messages in thread
From: Andrey Grodzovsky @ 2022-07-25 21:19 UTC (permalink / raw)
  To: Victor Zhao, amd-gfx; +Cc: Alexander.Deucher, emily.deng, Christian.Koenig

On 2022-07-22 03:34, Victor Zhao wrote:

> For some hang caused by slow tests, engine cannot be stopped which
> may cause resume failure after reset. In this case, force halt
> engine by reverting context addresses


Can you maybe explain a bit more what exactly you mean by slow test and
why engine cannot be stopped in this case ?

Andrey


>
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h  |  1 +
>   drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c    | 36 +++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c |  2 ++
>   4 files changed, 40 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5498fda8617f..833dc5e224d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5037,6 +5037,7 @@ static void amdgpu_device_recheck_guilty_jobs(
>   
>   			/* set guilty */
>   			drm_sched_increase_karma(s_job);
> +			amdgpu_reset_prepare_hwcontext(adev, reset_context);
>   retry:
>   			/* do hw reset */
>   			if (amdgpu_sriov_vf(adev)) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
> index f8036f2b100e..c7b44aeb671b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
> @@ -37,6 +37,7 @@ struct amdgpu_gfxhub_funcs {
>   	void (*utcl2_harvest)(struct amdgpu_device *adev);
>   	void (*mode2_save_regs)(struct amdgpu_device *adev);
>   	void (*mode2_restore_regs)(struct amdgpu_device *adev);
> +	void (*halt)(struct amdgpu_device *adev);
>   };
>   
>   struct amdgpu_gfxhub {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
> index 51cf8acd2d79..8cf53e039c11 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
> @@ -646,6 +646,41 @@ static void gfxhub_v2_1_restore_regs(struct amdgpu_device *adev)
>   	WREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL, adev->gmc.MC_VM_MX_L1_TLB_CNTL);
>   }
>   
> +static void gfxhub_v2_1_halt(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_vmhub *hub = &adev->vmhub[AMDGPU_GFXHUB_0];
> +	int i;
> +	uint32_t tmp;
> +	int time = 1000;
> +
> +	gfxhub_v2_1_set_fault_enable_default(adev, false);
> +
> +	for (i = 0; i <= 14; i++) {
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32,
> +				    i * hub->ctx_addr_distance, ~0);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32,
> +				    i * hub->ctx_addr_distance, ~0);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_LO32,
> +				    i * hub->ctx_addr_distance,
> +				    0);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_HI32,
> +				    i * hub->ctx_addr_distance,
> +				    0);
> +	}
> +	tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
> +	while ((tmp & (GRBM_STATUS2__EA_BUSY_MASK |
> +		      GRBM_STATUS2__EA_LINK_BUSY_MASK)) != 0 &&
> +	       time) {
> +		udelay(100);
> +		time--;
> +		tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
> +	}
> +
> +	if (!time) {
> +		DRM_WARN("failed to wait for GRBM(EA) idle\n");
> +	}
> +}
> +
>   const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
>   	.get_fb_location = gfxhub_v2_1_get_fb_location,
>   	.get_mc_fb_offset = gfxhub_v2_1_get_mc_fb_offset,
> @@ -658,4 +693,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
>   	.utcl2_harvest = gfxhub_v2_1_utcl2_harvest,
>   	.mode2_save_regs = gfxhub_v2_1_save_regs,
>   	.mode2_restore_regs = gfxhub_v2_1_restore_regs,
> +	.halt = gfxhub_v2_1_halt,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> index 51a5b68f77d3..fead7251292f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> @@ -97,6 +97,8 @@ sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
>   	if (!amdgpu_sriov_vf(adev)) {
>   		if (adev->gfxhub.funcs->mode2_save_regs)
>   			adev->gfxhub.funcs->mode2_save_regs(adev);
> +		if (adev->gfxhub.funcs->halt)
> +			adev->gfxhub.funcs->halt(adev);
>   		r = sienna_cichlid_mode2_suspend_ip(adev);
>   	}
>   

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 5/5] drm/amdgpu: reduce reset time
  2022-07-25 21:17   ` Andrey Grodzovsky
@ 2022-07-26  9:40     ` Zhao, Victor
  2022-07-26 16:57       ` Andrey Grodzovsky
  0 siblings, 1 reply; 20+ messages in thread
From: Zhao, Victor @ 2022-07-26  9:40 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx
  Cc: Deucher, Alexander, Deng, Emily, Koenig, Christian

[AMD Official Use Only - General]

Hi Andrey,

Reply inline.


Thanks,
Victor



-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> 
Sent: Tuesday, July 26, 2022 5:18 AM
To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: Re: [PATCH 5/5] drm/amdgpu: reduce reset time


On 2022-07-22 03:34, Victor Zhao wrote:
> In multi container use case, reset time is important, so skip ring 
> tests and cp halt wait during ip suspending for reset as they are 
> going to fail and cost more time on reset


Why are they failing in this case ? Skipping ring tests is not the best idea as you loose important indicator of system's sanity. Is there any way to make them work ?

[Victor]: I've seen gfx ring test fail every time after a gfx engine hang. I thought it should be expected as gfx is in a bad state. Do you know the reason we have ring tests before reset? As we are going to reset the asic anyway.
Another approach could be to make the skip mode2 only or reduce the wait time here.


>
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  2 +-
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 26 +++++++++++++++----------
>   2 files changed, 17 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 222d3d7ea076..f872495ccc3a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -477,7 +477,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
>   		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
>   					   RESET_QUEUES, 0, 0);
>   
> -	if (adev->gfx.kiq.ring.sched.ready)
> +	if (adev->gfx.kiq.ring.sched.ready && !amdgpu_in_reset(adev))
>   		r = amdgpu_ring_test_helper(kiq_ring);
>   	spin_unlock(&adev->gfx.kiq.ring_lock);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index fafbad3cf08d..9ae29023e38f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5971,16 +5971,19 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
>   		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
>   	}
>   
> -	for (i = 0; i < adev->usec_timeout; i++) {
> -		if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
> -			break;
> -		udelay(1);
> -	}
> -
> -	if (i >= adev->usec_timeout)
> -		DRM_ERROR("failed to %s cp gfx\n", enable ? "unhalt" : "halt");
> +	if (!amdgpu_in_reset(adev)) {
> +		for (i = 0; i < adev->usec_timeout; i++) {
> +			if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
> +				break;
> +			udelay(1);
> +		}
>   
> +		if (i >= adev->usec_timeout)
> +			DRM_ERROR("failed to %s cp gfx\n",
> +				  enable ? "unhalt" : "halt");
> +	}
>   	return 0;
> +
>   }


This change has impact beyond container case no ? We had no issue with this code during regular reset cases so why we would give up on this code which confirms CP is idle ? What is the side effect of skipping this during all GPU resets ?

Andrey

[Victor]: I see "failed to halt cp gfx" with regular reset cases as well when doing a gfx hang test using quark. I haven't seen a side effect with Mode1 reset yet but maybe shorten the wait time could be better?

>   
>   static int gfx_v10_0_cp_gfx_load_pfp_microcode(struct amdgpu_device 
> *adev) @@ -7569,8 +7572,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
>   	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>   		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
>   					   PREEMPT_QUEUES, 0, 0);
> -
> -	return amdgpu_ring_test_helper(kiq_ring);
> +	if (!amdgpu_in_reset(adev))
> +		return amdgpu_ring_test_helper(kiq_ring);
> +	else
> +		return 0;
>   }
>   #endif
>   
> @@ -7610,6 +7615,7 @@ static int gfx_v10_0_hw_fini(void *handle)
>   
>   		return 0;
>   	}
> +
>   	gfx_v10_0_cp_enable(adev, false);
>   	gfx_v10_0_enable_gui_idle_interrupt(adev, false);
>   

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset
  2022-07-25 21:19   ` Andrey Grodzovsky
@ 2022-07-26 10:01     ` Zhao, Victor
  2022-07-26 16:59       ` Andrey Grodzovsky
  0 siblings, 1 reply; 20+ messages in thread
From: Zhao, Victor @ 2022-07-26 10:01 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx
  Cc: Deucher, Alexander, Deng, Emily, Koenig, Christian

[AMD Official Use Only - General]

Hi Andrey,

For slow tests I mean the slow hang tests by quark tool.
An example here:
hang_vm_gfx_dispatch_slow.lua - This script runs on a graphics engine using compute engine and has a hacked CS program which is massive and duplicates standard CS program move code hundreds of thousands of times. The effect is a very slowly executing CS program.

It's not a bad job but just need a very long time to finish. I suppose we don’t have a way to stop shader here. And the running apps will be affected when reset is done.


Thanks,
Victor



-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> 
Sent: Tuesday, July 26, 2022 5:20 AM
To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: Re: [PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset

On 2022-07-22 03:34, Victor Zhao wrote:

> For some hang caused by slow tests, engine cannot be stopped which may 
> cause resume failure after reset. In this case, force halt engine by 
> reverting context addresses


Can you maybe explain a bit more what exactly you mean by slow test and why engine cannot be stopped in this case ?

Andrey


>
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h  |  1 +
>   drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c    | 36 +++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c |  2 ++
>   4 files changed, 40 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5498fda8617f..833dc5e224d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5037,6 +5037,7 @@ static void amdgpu_device_recheck_guilty_jobs(
>   
>   			/* set guilty */
>   			drm_sched_increase_karma(s_job);
> +			amdgpu_reset_prepare_hwcontext(adev, reset_context);
>   retry:
>   			/* do hw reset */
>   			if (amdgpu_sriov_vf(adev)) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
> index f8036f2b100e..c7b44aeb671b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
> @@ -37,6 +37,7 @@ struct amdgpu_gfxhub_funcs {
>   	void (*utcl2_harvest)(struct amdgpu_device *adev);
>   	void (*mode2_save_regs)(struct amdgpu_device *adev);
>   	void (*mode2_restore_regs)(struct amdgpu_device *adev);
> +	void (*halt)(struct amdgpu_device *adev);
>   };
>   
>   struct amdgpu_gfxhub {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c 
> b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
> index 51cf8acd2d79..8cf53e039c11 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
> @@ -646,6 +646,41 @@ static void gfxhub_v2_1_restore_regs(struct amdgpu_device *adev)
>   	WREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL, adev->gmc.MC_VM_MX_L1_TLB_CNTL);
>   }
>   
> +static void gfxhub_v2_1_halt(struct amdgpu_device *adev) {
> +	struct amdgpu_vmhub *hub = &adev->vmhub[AMDGPU_GFXHUB_0];
> +	int i;
> +	uint32_t tmp;
> +	int time = 1000;
> +
> +	gfxhub_v2_1_set_fault_enable_default(adev, false);
> +
> +	for (i = 0; i <= 14; i++) {
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32,
> +				    i * hub->ctx_addr_distance, ~0);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32,
> +				    i * hub->ctx_addr_distance, ~0);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_LO32,
> +				    i * hub->ctx_addr_distance,
> +				    0);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_HI32,
> +				    i * hub->ctx_addr_distance,
> +				    0);
> +	}
> +	tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
> +	while ((tmp & (GRBM_STATUS2__EA_BUSY_MASK |
> +		      GRBM_STATUS2__EA_LINK_BUSY_MASK)) != 0 &&
> +	       time) {
> +		udelay(100);
> +		time--;
> +		tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
> +	}
> +
> +	if (!time) {
> +		DRM_WARN("failed to wait for GRBM(EA) idle\n");
> +	}
> +}
> +
>   const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
>   	.get_fb_location = gfxhub_v2_1_get_fb_location,
>   	.get_mc_fb_offset = gfxhub_v2_1_get_mc_fb_offset, @@ -658,4 +693,5 
> @@ const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
>   	.utcl2_harvest = gfxhub_v2_1_utcl2_harvest,
>   	.mode2_save_regs = gfxhub_v2_1_save_regs,
>   	.mode2_restore_regs = gfxhub_v2_1_restore_regs,
> +	.halt = gfxhub_v2_1_halt,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c 
> b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> index 51a5b68f77d3..fead7251292f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> @@ -97,6 +97,8 @@ sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
>   	if (!amdgpu_sriov_vf(adev)) {
>   		if (adev->gfxhub.funcs->mode2_save_regs)
>   			adev->gfxhub.funcs->mode2_save_regs(adev);
> +		if (adev->gfxhub.funcs->halt)
> +			adev->gfxhub.funcs->halt(adev);
>   		r = sienna_cichlid_mode2_suspend_ip(adev);
>   	}
>   

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid
  2022-07-25 21:05 ` [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid Andrey Grodzovsky
@ 2022-07-26 10:04   ` Zhao, Victor
  0 siblings, 0 replies; 20+ messages in thread
From: Zhao, Victor @ 2022-07-26 10:04 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx
  Cc: Deucher, Alexander, Deng, Emily, Koenig, Christian

[AMD Official Use Only - General]

Will do.


Thanks,
Victor



-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> 
Sent: Tuesday, July 26, 2022 5:06 AM
To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: Re: [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid


On 2022-07-22 03:33, Victor Zhao wrote:
> To meet the requirement for multi container usecase which needs a 
> quicker reset and not causing VRAM lost, adding the Mode2 reset 
> handler for sienna_cichlid. Adding a AMDGPU_SKIP_MODE2_RESET flag so 
> driver can fallback to default reset method when mode2 reset failed 
> and retry.
>
> - add mode2 reset handler for sienna_cichlid


Seems to me ASIC specific changes should be in a seperate patch


> - introduce AMDGPU_SKIP_MODE2_RESET flag
> - let mode2 reset fallback to default reset method if failed
>
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/Makefile           |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |   1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |   7 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |   1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |   1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c     |  13 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h     |   1 +
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |   1 +
>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c         |   1 +
>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c         |   1 +
>   drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c   | 297 ++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h   |  32 ++
>   .../pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h  |   4 +-
>   drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  |   3 +-
>   .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   |  54 ++++
>   15 files changed, 414 insertions(+), 5 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
> b/drivers/gpu/drm/amd/amdgpu/Makefile
> index c7d0cd15b5ef..7030ac2d7d2c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -75,7 +75,7 @@ amdgpu-y += \
>   	vi.o mxgpu_vi.o nbio_v6_1.o soc15.o emu_soc.o mxgpu_ai.o nbio_v7_0.o vega10_reg_init.o \
>   	vega20_reg_init.o nbio_v7_4.o nbio_v2_3.o nv.o arct_reg_init.o mxgpu_nv.o \
>   	nbio_v7_2.o hdp_v4_0.o hdp_v5_0.o aldebaran_reg_init.o aldebaran.o soc21.o \
> -	nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o lsdma_v6_0.o
> +	sienna_cichlid.o nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o 
> +lsdma_v6_0.o
>   
>   # add DF block
>   amdgpu-y += \
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 5e53a5293935..091415a4abf0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -135,6 +135,7 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
>   	reset_context.method = AMD_RESET_METHOD_NONE;
>   	reset_context.reset_req_dev = adev;
>   	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +	clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b79ee4ffb879..5498fda8617f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5146,6 +5146,7 @@ int amdgpu_device_gpu_recover(struct 
> amdgpu_device *adev,
>   
>   	reset_context->job = job;
>   	reset_context->hive = hive;
> +
>   	/*
>   	 * Build list of devices to reset.
>   	 * In case we are in XGMI hive mode, resort the device list @@ 
> -5265,8 +5266,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   			amdgpu_ras_resume(adev);
>   	} else {
>   		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
> -		if (r && r == -EAGAIN)
> +		if (r && r == -EAGAIN) {
> +			set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags);
> +			adev->asic_reset_res = 0;


See my question bellow related to this set


>   			goto retry;
> +		}
>   	}
>   
>   skip_hw_reset:
> @@ -5694,6 +5698,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
>   	reset_context.reset_req_dev = adev;
>   	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>   	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
> +	set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   	adev->no_hw_access = true;
>   	r = amdgpu_device_pre_asic_reset(adev, &reset_context); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 10fdd12cf853..9844d99075e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -71,6 +71,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		reset_context.method = AMD_RESET_METHOD_NONE;
>   		reset_context.reset_req_dev = adev;
>   		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
>   		if (r)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 4f321375015c..259ec860890a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1949,6 +1949,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
>   		reset_context.method = AMD_RESET_METHOD_NONE;
>   		reset_context.reset_req_dev = adev;
>   		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 32c86a0b145c..831fb222139c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -23,6 +23,7 @@
>   
>   #include "amdgpu_reset.h"
>   #include "aldebaran.h"
> +#include "sienna_cichlid.h"
>   
>   int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl,
>   			     struct amdgpu_reset_handler *handler) @@ -40,6 +41,9 @@ int 
> amdgpu_reset_init(struct amdgpu_device *adev)
>   	case IP_VERSION(13, 0, 2):
>   		ret = aldebaran_reset_init(adev);
>   		break;
> +	case IP_VERSION(11, 0, 7):
> +		ret = sienna_cichlid_reset_init(adev);
> +		break;
>   	default:
>   		break;
>   	}
> @@ -55,6 +59,9 @@ int amdgpu_reset_fini(struct amdgpu_device *adev)
>   	case IP_VERSION(13, 0, 2):
>   		ret = aldebaran_reset_fini(adev);
>   		break;
> +	case IP_VERSION(11, 0, 7):
> +		ret = sienna_cichlid_reset_fini(adev);
> +		break;
>   	default:
>   		break;
>   	}
> @@ -67,6 +74,9 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
>   {
>   	struct amdgpu_reset_handler *reset_handler = NULL;
>   
> +	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
> +		return -ENOSYS;
> +
>   	if (adev->reset_cntl && adev->reset_cntl->get_reset_handler)
>   		reset_handler = adev->reset_cntl->get_reset_handler(
>   			adev->reset_cntl, reset_context); @@ -83,6 +93,9 @@ int 
> amdgpu_reset_perform_reset(struct amdgpu_device *adev,
>   	int ret;
>   	struct amdgpu_reset_handler *reset_handler = NULL;
>   
> +	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
> +		return -ENOSYS;
> +
>   	if (adev->reset_cntl)
>   		reset_handler = adev->reset_cntl->get_reset_handler(
>   			adev->reset_cntl, reset_context); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 9e55a5d7a825..cc4b2eeb24cf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -30,6 +30,7 @@ enum AMDGPU_RESET_FLAGS {
>   
>   	AMDGPU_NEED_FULL_RESET = 0,
>   	AMDGPU_SKIP_HW_RESET = 1,
> +	AMDGPU_SKIP_MODE2_RESET = 2,
>   };
>   
>   struct amdgpu_reset_context {
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 12906ba74462..a2f04b249132 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -290,6 +290,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   		reset_context.method = AMD_RESET_METHOD_NONE;
>   		reset_context.reset_req_dev = adev;
>   		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index e07757eea7ad..a977f0027928 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -317,6 +317,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>   		reset_context.method = AMD_RESET_METHOD_NONE;
>   		reset_context.reset_req_dev = adev;
>   		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index 288c414babdf..fd14fa9b9cd7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>   		reset_context.method = AMD_RESET_METHOD_NONE;
>   		reset_context.reset_req_dev = adev;
>   		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
>   
>   		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c 
> b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> new file mode 100644
> index 000000000000..0512960bed23
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> @@ -0,0 +1,297 @@
> +/*
> + * Copyright 2021 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person 
> +obtaining a
> + * copy of this software and associated documentation files (the 
> +"Software"),
> + * to deal in the Software without restriction, including without 
> +limitation
> + * the rights to use, copy, modify, merge, publish, distribute, 
> +sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom 
> +the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be 
> +included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
> +EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
> +MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT 
> +SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
> +DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
> +OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
> +OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#include "sienna_cichlid.h"
> +#include "amdgpu_reset.h"
> +#include "amdgpu_amdkfd.h"
> +#include "amdgpu_dpm.h"
> +#include "amdgpu_job.h"
> +#include "amdgpu_ring.h"
> +#include "amdgpu_ras.h"
> +#include "amdgpu_psp.h"
> +#include "amdgpu_xgmi.h"
> +
> +static struct amdgpu_reset_handler *
> +sienna_cichlid_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
> +			    struct amdgpu_reset_context *reset_context) {
> +	struct amdgpu_reset_handler *handler;
> +	struct amdgpu_device *adev = (struct amdgpu_device 
> +*)reset_ctl->handle;
> +
> +	if (reset_context->method != AMD_RESET_METHOD_NONE) {
> +		list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +				     handler_list) {
> +			if (handler->reset_method == reset_context->method)
> +				return handler;
> +		}
> +	} else {
> +		list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +				     handler_list) {
> +			if (handler->reset_method == AMD_RESET_METHOD_MODE2 &&
> +			    adev->pm.fw_version >= 0x3a5500 &&
> +			    !amdgpu_sriov_vf(adev)) {
> +				reset_context->method = AMD_RESET_METHOD_MODE2;
> +				return handler;
> +			}
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +static int sienna_cichlid_mode2_suspend_ip(struct amdgpu_device 
> +*adev) {
> +	int r, i;
> +
> +	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
> +	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
> +
> +	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +
> +		r = adev->ip_blocks[i].version->funcs->suspend(adev);
> +
> +		if (r) {
> +			dev_err(adev->dev,
> +				"suspend of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +		adev->ip_blocks[i].status.hw = false;
> +	}
> +
> +	return r;
> +}
> +
> +static int
> +sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
> +				  struct amdgpu_reset_context *reset_context) {
> +	int r = 0;
> +	struct amdgpu_device *adev = (struct amdgpu_device 
> +*)reset_ctl->handle;
> +
> +	if (!amdgpu_sriov_vf(adev))
> +		r = sienna_cichlid_mode2_suspend_ip(adev);
> +
> +	return r;
> +}
> +
> +static void sienna_cichlid_async_reset(struct work_struct *work) {
> +	struct amdgpu_reset_handler *handler;
> +	struct amdgpu_reset_control *reset_ctl =
> +		container_of(work, struct amdgpu_reset_control, reset_work);
> +	struct amdgpu_device *adev = (struct amdgpu_device 
> +*)reset_ctl->handle;
> +
> +	list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +			     handler_list) {
> +		if (handler->reset_method == reset_ctl->active_reset) {
> +			dev_dbg(adev->dev, "Resetting device\n");
> +			handler->do_reset(adev);
> +			break;
> +		}
> +	}
> +}
> +
> +static int sienna_cichlid_mode2_reset(struct amdgpu_device *adev) {
> +	/* disable BM */
> +	pci_clear_master(adev->pdev);
> +	adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev);


Why do you need to set adev->asic_reset_res here and not just return back result ?


> +	return adev->asic_reset_res;
> +}
> +
> +static int
> +sienna_cichlid_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
> +			      struct amdgpu_reset_context *reset_context) {
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
> +	int r;
> +
> +	r = sienna_cichlid_mode2_reset(adev);
> +	if (r) {
> +		dev_err(adev->dev,
> +			"ASIC reset failed with error, %d ", r);
> +	}
> +	return r;
> +}
> +
> +static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device 
> +*adev) {
> +	int i, r;
> +	struct psp_context *psp = &adev->psp;
> +
> +	r = psp_rlc_autoload_start(psp);
> +	if (r) {
> +		dev_err(adev->dev, "Failed to start rlc autoload\n");
> +		return r;
> +	}
> +
> +	/* Reinit GFXHUB */
> +	adev->gfxhub.funcs->init(adev);
> +	r = adev->gfxhub.funcs->gart_enable(adev);
> +	if (r) {
> +		dev_err(adev->dev, "GFXHUB gart reenable failed after reset\n");
> +		return r;
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
> +			r = adev->ip_blocks[i].version->funcs->resume(adev);
> +		if (r) {
> +			dev_err(adev->dev,
> +				"resume of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +
> +		adev->ip_blocks[i].status.hw = true;
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +		r = adev->ip_blocks[i].version->funcs->resume(adev);
> +		if (r) {
> +			dev_err(adev->dev,
> +				"resume of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +
> +		adev->ip_blocks[i].status.hw = true;
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +
> +		if (adev->ip_blocks[i].version->funcs->late_init) {
> +			r = adev->ip_blocks[i].version->funcs->late_init(
> +				(void *)adev);
> +			if (r) {
> +				dev_err(adev->dev,
> +					"late_init of IP block <%s> failed %d after reset\n",
> +					adev->ip_blocks[i].version->funcs->name,
> +					r);
> +				return r;
> +			}
> +		}
> +		adev->ip_blocks[i].status.late_initialized = true;
> +	}
> +
> +	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
> +	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
> +
> +	return r;
> +}
> +
> +static int
> +sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
> +				  struct amdgpu_reset_context *reset_context) {
> +	int r;
> +	struct amdgpu_device *tmp_adev = (struct amdgpu_device 
> +*)reset_ctl->handle;
> +
> +	dev_info(tmp_adev->dev,
> +			"GPU reset succeeded, trying to resume\n");
> +	r = sienna_cichlid_mode2_restore_ip(tmp_adev);
> +	if (r)
> +		goto end;
> +
> +	/*
> +	* Add this ASIC as tracked as reset was already
> +	* complete successfully.
> +	*/
> +	amdgpu_register_gpu_instance(tmp_adev);
> +
> +	/* Resume RAS */
> +	amdgpu_ras_resume(tmp_adev);
> +
> +	amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
> +
> +	r = amdgpu_ib_ring_tests(tmp_adev);
> +	if (r) {
> +		dev_err(tmp_adev->dev,
> +			"ib ring test failed (%d).\n", r);
> +		r = -EAGAIN;
> +		tmp_adev->asic_reset_res = r;
> +		goto end;
> +	}
> +
> +end:
> +	if (r)
> +		return -EAGAIN;
> +	else
> +		return r;
> +}
> +
> +static struct amdgpu_reset_handler sienna_cichlid_mode2_handler = {
> +	.reset_method		= AMD_RESET_METHOD_MODE2,
> +	.prepare_env		= NULL,
> +	.prepare_hwcontext	= sienna_cichlid_mode2_prepare_hwcontext,
> +	.perform_reset		= sienna_cichlid_mode2_perform_reset,
> +	.restore_hwcontext	= sienna_cichlid_mode2_restore_hwcontext,
> +	.restore_env		= NULL,
> +	.do_reset		= sienna_cichlid_mode2_reset,
> +};
> +
> +int sienna_cichlid_reset_init(struct amdgpu_device *adev) {
> +	struct amdgpu_reset_control *reset_ctl;
> +
> +	reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);
> +	if (!reset_ctl)
> +		return -ENOMEM;
> +
> +	reset_ctl->handle = adev;
> +	reset_ctl->async_reset = sienna_cichlid_async_reset;
> +	reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
> +	reset_ctl->get_reset_handler = sienna_cichlid_get_reset_handler;
> +
> +	INIT_LIST_HEAD(&reset_ctl->reset_handlers);
> +	INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);
> +	/* Only mode2 is handled through reset control now */
> +	amdgpu_reset_add_handler(reset_ctl, &sienna_cichlid_mode2_handler);
> +
> +	adev->reset_cntl = reset_ctl;
> +
> +	return 0;
> +}
> +
> +int sienna_cichlid_reset_fini(struct amdgpu_device *adev) {
> +	kfree(adev->reset_cntl);
> +	adev->reset_cntl = NULL;
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h 
> b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> new file mode 100644
> index 000000000000..5213b162dacd
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> @@ -0,0 +1,32 @@
> +/*
> + * Copyright 2021 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person 
> +obtaining a
> + * copy of this software and associated documentation files (the 
> +"Software"),
> + * to deal in the Software without restriction, including without 
> +limitation
> + * the rights to use, copy, modify, merge, publish, distribute, 
> +sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom 
> +the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be 
> +included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
> +EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
> +MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT 
> +SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
> +DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
> +OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
> +OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __SIENNA_CICHLID_H__
> +#define __SIENNA_CICHLID_H__
> +
> +#include "amdgpu.h"
> +
> +int sienna_cichlid_reset_init(struct amdgpu_device *adev); int 
> +sienna_cichlid_reset_fini(struct amdgpu_device *adev);
> +
> +#endif
> diff --git 
> a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h 
> b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> index d2e10a724560..82cf9e563065 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> @@ -137,7 +137,7 @@
>   #define PPSMC_MSG_DisallowGpo                    0x56
>   
>   #define PPSMC_MSG_Enable2ndUSB20Port             0x57
> -
> -#define PPSMC_Message_Count                      0x58
> +#define PPSMC_MSG_DriverMode2Reset               0x5D
> +#define PPSMC_Message_Count                      0x5E
>   
>   #endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h 
> b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> index 19084a4fcb2b..28f6a1eb6945 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> @@ -235,7 +235,8 @@
>   	__SMU_DUMMY_MAP(UnforceGfxVid),           \
>   	__SMU_DUMMY_MAP(HeavySBR),			\
>   	__SMU_DUMMY_MAP(SetBadHBMPagesRetiredFlagsPerChannel), \
> -	__SMU_DUMMY_MAP(EnableGfxImu),
> +	__SMU_DUMMY_MAP(EnableGfxImu), \
> +	__SMU_DUMMY_MAP(DriverMode2Reset),
>   
>   #undef __SMU_DUMMY_MAP
>   #define __SMU_DUMMY_MAP(type)	SMU_MSG_##type
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> index fa520d79ef67..a73d241bb64f 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> @@ -154,6 +154,7 @@ static struct cmn2asic_msg_mapping sienna_cichlid_message_map[SMU_MSG_MAX_COUNT]
>   	MSG_MAP(SetGpoFeaturePMask,		PPSMC_MSG_SetGpoFeaturePMask,          0),
>   	MSG_MAP(DisallowGpo,			PPSMC_MSG_DisallowGpo,                 0),
>   	MSG_MAP(Enable2ndUSB20Port,		PPSMC_MSG_Enable2ndUSB20Port,          0),
> +	MSG_MAP(DriverMode2Reset,		PPSMC_MSG_DriverMode2Reset,	       0),
>   };
>   
>   static struct cmn2asic_mapping sienna_cichlid_clk_map[SMU_CLK_COUNT] 
> = { @@ -4254,6 +4255,57 @@ static int sienna_cichlid_stb_get_data_direct(struct smu_context *smu,
>   	return 0;
>   }
>   
> +static bool sienna_cichlid_is_mode2_reset_supported(struct 
> +smu_context *smu) {
> +	return true;
> +}
> +
> +static int sienna_cichlid_mode2_reset(struct smu_context *smu) {
> +	u32 smu_version;
> +	int ret = 0, index;
> +	struct amdgpu_device *adev = smu->adev;
> +	int timeout = 100;
> +
> +	smu_cmn_get_smc_version(smu, NULL, &smu_version);
> +
> +	index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
> +						SMU_MSG_DriverMode2Reset);
> +
> +	mutex_lock(&smu->message_lock);
> +
> +	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
> +					       SMU_RESET_MODE_2);
> +
> +	ret = smu_cmn_wait_for_response(smu);
> +	while (ret != 0 && timeout) {
> +		ret = smu_cmn_wait_for_response(smu);
> +		/* Wait a bit more time for getting ACK */
> +		if (ret != 0) {
> +			--timeout;
> +			usleep_range(500, 1000);
> +			continue;
> +		} else {
> +			break;
> +		}
> +	}
> +
> +	if (!timeout) {
> +		dev_err(adev->dev,
> +			"failed to send mode2 message \tparam: 0x%08x response %#x\n",
> +			SMU_RESET_MODE_2, ret);
> +		goto out;
> +	}
> +
> +	dev_info(smu->adev->dev, "restore config space...\n");
> +	/* Restore the config space saved during init */
> +	amdgpu_device_load_pci_state(adev->pdev);
> +out:
> +	mutex_unlock(&smu->message_lock);
> +
> +	return ret;
> +}
> +
>   static const struct pptable_funcs sienna_cichlid_ppt_funcs = {
>   	.get_allowed_feature_mask = sienna_cichlid_get_allowed_feature_mask,
>   	.set_default_dpm_table = sienna_cichlid_set_default_dpm_table,
> @@ -4348,6 +4400,8 @@ static const struct pptable_funcs sienna_cichlid_ppt_funcs = {
>   	.get_default_config_table_settings = sienna_cichlid_get_default_config_table_settings,
>   	.set_config_table = sienna_cichlid_set_config_table,
>   	.get_unique_id = sienna_cichlid_get_unique_id,
> +	.mode2_reset_is_support = sienna_cichlid_is_mode2_reset_supported,
> +	.mode2_reset = sienna_cichlid_mode2_reset,
>   };
>   
>   void sienna_cichlid_set_ppt_funcs(struct smu_context *smu)

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 5/5] drm/amdgpu: reduce reset time
  2022-07-26  9:40     ` Zhao, Victor
@ 2022-07-26 16:57       ` Andrey Grodzovsky
  2022-07-27 10:35         ` Zhao, Victor
  0 siblings, 1 reply; 20+ messages in thread
From: Andrey Grodzovsky @ 2022-07-26 16:57 UTC (permalink / raw)
  To: Zhao, Victor, amd-gfx; +Cc: Deucher, Alexander, Deng, Emily, Koenig, Christian


On 2022-07-26 05:40, Zhao, Victor wrote:
> [AMD Official Use Only - General]
>
> Hi Andrey,
>
> Reply inline.
>
>
> Thanks,
> Victor
>
>
>
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Tuesday, July 26, 2022 5:18 AM
> To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
> Subject: Re: [PATCH 5/5] drm/amdgpu: reduce reset time
>
>
> On 2022-07-22 03:34, Victor Zhao wrote:
>> In multi container use case, reset time is important, so skip ring
>> tests and cp halt wait during ip suspending for reset as they are
>> going to fail and cost more time on reset
>
> Why are they failing in this case ? Skipping ring tests is not the best idea as you loose important indicator of system's sanity. Is there any way to make them work ?
>
> [Victor]: I've seen gfx ring test fail every time after a gfx engine hang. I thought it should be expected as gfx is in a bad state. Do you know the reason we have ring tests before reset? As we are going to reset the asic anyway.
> Another approach could be to make the skip mode2 only or reduce the wait time here.


I dug down in history and according to commit 'drm/amdgpu:unmap KCQ in 
gfx hw_fini(v2)' you need to write to scratch register for completion of 
queue unmap operation so you defently don't want to just skip it. I 
agree in case
that the ring is hung this has no point but remember that GPU reset can 
happen not only to a hunged ring but for other reasons (RAS, manual 
reset e.t.c.) in which case you probably want to shut down gracefully here ?
I see we have adev->ip_blocks[i].status.hang flag which you maybe can 
use here instead ?


>
>
>> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  2 +-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 26 +++++++++++++++----------
>>    2 files changed, 17 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index 222d3d7ea076..f872495ccc3a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -477,7 +477,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
>>    		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
>>    					   RESET_QUEUES, 0, 0);
>>    
>> -	if (adev->gfx.kiq.ring.sched.ready)
>> +	if (adev->gfx.kiq.ring.sched.ready && !amdgpu_in_reset(adev))
>>    		r = amdgpu_ring_test_helper(kiq_ring);
>>    	spin_unlock(&adev->gfx.kiq.ring_lock);
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index fafbad3cf08d..9ae29023e38f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -5971,16 +5971,19 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
>>    		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
>>    	}
>>    
>> -	for (i = 0; i < adev->usec_timeout; i++) {
>> -		if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
>> -			break;
>> -		udelay(1);
>> -	}
>> -
>> -	if (i >= adev->usec_timeout)
>> -		DRM_ERROR("failed to %s cp gfx\n", enable ? "unhalt" : "halt");
>> +	if (!amdgpu_in_reset(adev)) {
>> +		for (i = 0; i < adev->usec_timeout; i++) {
>> +			if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
>> +				break;
>> +			udelay(1);
>> +		}
>>    
>> +		if (i >= adev->usec_timeout)
>> +			DRM_ERROR("failed to %s cp gfx\n",
>> +				  enable ? "unhalt" : "halt");
>> +	}
>>    	return 0;
>> +
>>    }
>
> This change has impact beyond container case no ? We had no issue with this code during regular reset cases so why we would give up on this code which confirms CP is idle ? What is the side effect of skipping this during all GPU resets ?
>
> Andrey
>
> [Victor]: I see "failed to halt cp gfx" with regular reset cases as well when doing a gfx hang test using quark. I haven't seen a side effect with Mode1 reset yet but maybe shorten the wait time could be better?


Same as above i guess, it would indeed time out for a hung ring but GPU 
reset happens not only because of hung rings but for other reasons.

Andrey


>
>>    
>>    static int gfx_v10_0_cp_gfx_load_pfp_microcode(struct amdgpu_device
>> *adev) @@ -7569,8 +7572,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
>>    	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>>    		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
>>    					   PREEMPT_QUEUES, 0, 0);
>> -
>> -	return amdgpu_ring_test_helper(kiq_ring);
>> +	if (!amdgpu_in_reset(adev))
>> +		return amdgpu_ring_test_helper(kiq_ring);
>> +	else
>> +		return 0;
>>    }
>>    #endif
>>    
>> @@ -7610,6 +7615,7 @@ static int gfx_v10_0_hw_fini(void *handle)
>>    
>>    		return 0;
>>    	}
>> +
>>    	gfx_v10_0_cp_enable(adev, false);
>>    	gfx_v10_0_enable_gui_idle_interrupt(adev, false);
>>    

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset
  2022-07-26 10:01     ` Zhao, Victor
@ 2022-07-26 16:59       ` Andrey Grodzovsky
  0 siblings, 0 replies; 20+ messages in thread
From: Andrey Grodzovsky @ 2022-07-26 16:59 UTC (permalink / raw)
  To: Zhao, Victor, amd-gfx; +Cc: Deucher, Alexander, Deng, Emily, Koenig, Christian

Got it

Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

Andrey

On 2022-07-26 06:01, Zhao, Victor wrote:
> [AMD Official Use Only - General]
>
> Hi Andrey,
>
> For slow tests I mean the slow hang tests by quark tool.
> An example here:
> hang_vm_gfx_dispatch_slow.lua - This script runs on a graphics engine using compute engine and has a hacked CS program which is massive and duplicates standard CS program move code hundreds of thousands of times. The effect is a very slowly executing CS program.
>
> It's not a bad job but just need a very long time to finish. I suppose we don’t have a way to stop shader here. And the running apps will be affected when reset is done.
>
>
> Thanks,
> Victor
>
>
>
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Tuesday, July 26, 2022 5:20 AM
> To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
> Subject: Re: [PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset
>
> On 2022-07-22 03:34, Victor Zhao wrote:
>
>> For some hang caused by slow tests, engine cannot be stopped which may
>> cause resume failure after reset. In this case, force halt engine by
>> reverting context addresses
>
> Can you maybe explain a bit more what exactly you mean by slow test and why engine cannot be stopped in this case ?
>
> Andrey
>
>
>> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  1 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h  |  1 +
>>    drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c    | 36 +++++++++++++++++++++
>>    drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c |  2 ++
>>    4 files changed, 40 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 5498fda8617f..833dc5e224d3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5037,6 +5037,7 @@ static void amdgpu_device_recheck_guilty_jobs(
>>    
>>    			/* set guilty */
>>    			drm_sched_increase_karma(s_job);
>> +			amdgpu_reset_prepare_hwcontext(adev, reset_context);
>>    retry:
>>    			/* do hw reset */
>>    			if (amdgpu_sriov_vf(adev)) {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
>> index f8036f2b100e..c7b44aeb671b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
>> @@ -37,6 +37,7 @@ struct amdgpu_gfxhub_funcs {
>>    	void (*utcl2_harvest)(struct amdgpu_device *adev);
>>    	void (*mode2_save_regs)(struct amdgpu_device *adev);
>>    	void (*mode2_restore_regs)(struct amdgpu_device *adev);
>> +	void (*halt)(struct amdgpu_device *adev);
>>    };
>>    
>>    struct amdgpu_gfxhub {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
>> b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
>> index 51cf8acd2d79..8cf53e039c11 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
>> @@ -646,6 +646,41 @@ static void gfxhub_v2_1_restore_regs(struct amdgpu_device *adev)
>>    	WREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL, adev->gmc.MC_VM_MX_L1_TLB_CNTL);
>>    }
>>    
>> +static void gfxhub_v2_1_halt(struct amdgpu_device *adev) {
>> +	struct amdgpu_vmhub *hub = &adev->vmhub[AMDGPU_GFXHUB_0];
>> +	int i;
>> +	uint32_t tmp;
>> +	int time = 1000;
>> +
>> +	gfxhub_v2_1_set_fault_enable_default(adev, false);
>> +
>> +	for (i = 0; i <= 14; i++) {
>> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32,
>> +				    i * hub->ctx_addr_distance, ~0);
>> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32,
>> +				    i * hub->ctx_addr_distance, ~0);
>> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_LO32,
>> +				    i * hub->ctx_addr_distance,
>> +				    0);
>> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_HI32,
>> +				    i * hub->ctx_addr_distance,
>> +				    0);
>> +	}
>> +	tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
>> +	while ((tmp & (GRBM_STATUS2__EA_BUSY_MASK |
>> +		      GRBM_STATUS2__EA_LINK_BUSY_MASK)) != 0 &&
>> +	       time) {
>> +		udelay(100);
>> +		time--;
>> +		tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
>> +	}
>> +
>> +	if (!time) {
>> +		DRM_WARN("failed to wait for GRBM(EA) idle\n");
>> +	}
>> +}
>> +
>>    const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
>>    	.get_fb_location = gfxhub_v2_1_get_fb_location,
>>    	.get_mc_fb_offset = gfxhub_v2_1_get_mc_fb_offset, @@ -658,4 +693,5
>> @@ const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
>>    	.utcl2_harvest = gfxhub_v2_1_utcl2_harvest,
>>    	.mode2_save_regs = gfxhub_v2_1_save_regs,
>>    	.mode2_restore_regs = gfxhub_v2_1_restore_regs,
>> +	.halt = gfxhub_v2_1_halt,
>>    };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
>> b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
>> index 51a5b68f77d3..fead7251292f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
>> @@ -97,6 +97,8 @@ sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
>>    	if (!amdgpu_sriov_vf(adev)) {
>>    		if (adev->gfxhub.funcs->mode2_save_regs)
>>    			adev->gfxhub.funcs->mode2_save_regs(adev);
>> +		if (adev->gfxhub.funcs->halt)
>> +			adev->gfxhub.funcs->halt(adev);
>>    		r = sienna_cichlid_mode2_suspend_ip(adev);
>>    	}
>>    

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 5/5] drm/amdgpu: reduce reset time
  2022-07-26 16:57       ` Andrey Grodzovsky
@ 2022-07-27 10:35         ` Zhao, Victor
  2022-07-27 14:34           ` Andrey Grodzovsky
  0 siblings, 1 reply; 20+ messages in thread
From: Zhao, Victor @ 2022-07-27 10:35 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx
  Cc: Deucher, Alexander, Deng, Emily, Koenig, Christian

[AMD Official Use Only - General]

Hi Andrey,

Problem with status.hang is that it is set at amdgpu_device_ip_check_soft_reset, which is not implemented in nv or gfx10. They have to be nicely implemented first.
Another option I thought is to mark status.hang or add a flag to amdgpu_gfx when job timeout reported on gfx/comp ring. And this will require some logic to map the relationship for ring and ip blocks. This way does not look good as well.


Thanks,
Victor



-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> 
Sent: Wednesday, July 27, 2022 12:57 AM
To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: Re: [PATCH 5/5] drm/amdgpu: reduce reset time


On 2022-07-26 05:40, Zhao, Victor wrote:
> [AMD Official Use Only - General]
>
> Hi Andrey,
>
> Reply inline.
>
>
> Thanks,
> Victor
>
>
>
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Tuesday, July 26, 2022 5:18 AM
> To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Deng, Emily 
> <Emily.Deng@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
> Subject: Re: [PATCH 5/5] drm/amdgpu: reduce reset time
>
>
> On 2022-07-22 03:34, Victor Zhao wrote:
>> In multi container use case, reset time is important, so skip ring 
>> tests and cp halt wait during ip suspending for reset as they are 
>> going to fail and cost more time on reset
>
> Why are they failing in this case ? Skipping ring tests is not the best idea as you loose important indicator of system's sanity. Is there any way to make them work ?
>
> [Victor]: I've seen gfx ring test fail every time after a gfx engine hang. I thought it should be expected as gfx is in a bad state. Do you know the reason we have ring tests before reset? As we are going to reset the asic anyway.
> Another approach could be to make the skip mode2 only or reduce the wait time here.


I dug down in history and according to commit 'drm/amdgpu:unmap KCQ in gfx hw_fini(v2)' you need to write to scratch register for completion of queue unmap operation so you defently don't want to just skip it. I agree in case that the ring is hung this has no point but remember that GPU reset can happen not only to a hunged ring but for other reasons (RAS, manual reset e.t.c.) in which case you probably want to shut down gracefully here ?
I see we have adev->ip_blocks[i].status.hang flag which you maybe can use here instead ?


>
>
>> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  2 +-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 26 +++++++++++++++----------
>>    2 files changed, 17 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index 222d3d7ea076..f872495ccc3a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -477,7 +477,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
>>    		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
>>    					   RESET_QUEUES, 0, 0);
>>    
>> -	if (adev->gfx.kiq.ring.sched.ready)
>> +	if (adev->gfx.kiq.ring.sched.ready && !amdgpu_in_reset(adev))
>>    		r = amdgpu_ring_test_helper(kiq_ring);
>>    	spin_unlock(&adev->gfx.kiq.ring_lock);
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index fafbad3cf08d..9ae29023e38f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -5971,16 +5971,19 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
>>    		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
>>    	}
>>    
>> -	for (i = 0; i < adev->usec_timeout; i++) {
>> -		if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
>> -			break;
>> -		udelay(1);
>> -	}
>> -
>> -	if (i >= adev->usec_timeout)
>> -		DRM_ERROR("failed to %s cp gfx\n", enable ? "unhalt" : "halt");
>> +	if (!amdgpu_in_reset(adev)) {
>> +		for (i = 0; i < adev->usec_timeout; i++) {
>> +			if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
>> +				break;
>> +			udelay(1);
>> +		}
>>    
>> +		if (i >= adev->usec_timeout)
>> +			DRM_ERROR("failed to %s cp gfx\n",
>> +				  enable ? "unhalt" : "halt");
>> +	}
>>    	return 0;
>> +
>>    }
>
> This change has impact beyond container case no ? We had no issue with this code during regular reset cases so why we would give up on this code which confirms CP is idle ? What is the side effect of skipping this during all GPU resets ?
>
> Andrey
>
> [Victor]: I see "failed to halt cp gfx" with regular reset cases as well when doing a gfx hang test using quark. I haven't seen a side effect with Mode1 reset yet but maybe shorten the wait time could be better?


Same as above i guess, it would indeed time out for a hung ring but GPU reset happens not only because of hung rings but for other reasons.

Andrey


>
>>    
>>    static int gfx_v10_0_cp_gfx_load_pfp_microcode(struct 
>> amdgpu_device
>> *adev) @@ -7569,8 +7572,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
>>    	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>>    		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
>>    					   PREEMPT_QUEUES, 0, 0);
>> -
>> -	return amdgpu_ring_test_helper(kiq_ring);
>> +	if (!amdgpu_in_reset(adev))
>> +		return amdgpu_ring_test_helper(kiq_ring);
>> +	else
>> +		return 0;
>>    }
>>    #endif
>>    
>> @@ -7610,6 +7615,7 @@ static int gfx_v10_0_hw_fini(void *handle)
>>    
>>    		return 0;
>>    	}
>> +
>>    	gfx_v10_0_cp_enable(adev, false);
>>    	gfx_v10_0_enable_gui_idle_interrupt(adev, false);
>>    

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 5/5] drm/amdgpu: reduce reset time
  2022-07-27 10:35         ` Zhao, Victor
@ 2022-07-27 14:34           ` Andrey Grodzovsky
  0 siblings, 0 replies; 20+ messages in thread
From: Andrey Grodzovsky @ 2022-07-27 14:34 UTC (permalink / raw)
  To: Zhao, Victor, amd-gfx; +Cc: Deucher, Alexander, Deng, Emily, Koenig, Christian

On 2022-07-27 06:35, Zhao, Victor wrote:
> [AMD Official Use Only - General]
>
> Hi Andrey,
>
> Problem with status.hang is that it is set at amdgpu_device_ip_check_soft_reset, which is not implemented in nv or gfx10. They have to be nicely implemented first.
> Another option I thought is to mark status.hang or add a flag to amdgpu_gfx when job timeout reported on gfx/comp ring. And this will require some logic to map the relationship for ring and ip blocks. This way does not look good as well.


I don't think we need this at the ring level, its enough to know that 
the reset you are going through is because of one of rings are hanged to 
apply this skip logic, it's pretty easy if we add 'bool hang' flag to 
adev->reset_domain
which u can set in the beginning amdgpu_job_timedout and clear in the 
end. No protection is required as all the resets from all origins are 
serialized with timeout handler in a single threaded queue.

Andrey


>
>
> Thanks,
> Victor
>
>
>
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Wednesday, July 27, 2022 12:57 AM
> To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
> Subject: Re: [PATCH 5/5] drm/amdgpu: reduce reset time
>
>
> On 2022-07-26 05:40, Zhao, Victor wrote:
>> [AMD Official Use Only - General]
>>
>> Hi Andrey,
>>
>> Reply inline.
>>
>>
>> Thanks,
>> Victor
>>
>>
>>
>> -----Original Message-----
>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> Sent: Tuesday, July 26, 2022 5:18 AM
>> To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Deng, Emily
>> <Emily.Deng@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
>> Subject: Re: [PATCH 5/5] drm/amdgpu: reduce reset time
>>
>>
>> On 2022-07-22 03:34, Victor Zhao wrote:
>>> In multi container use case, reset time is important, so skip ring
>>> tests and cp halt wait during ip suspending for reset as they are
>>> going to fail and cost more time on reset
>> Why are they failing in this case ? Skipping ring tests is not the best idea as you loose important indicator of system's sanity. Is there any way to make them work ?
>>
>> [Victor]: I've seen gfx ring test fail every time after a gfx engine hang. I thought it should be expected as gfx is in a bad state. Do you know the reason we have ring tests before reset? As we are going to reset the asic anyway.
>> Another approach could be to make the skip mode2 only or reduce the wait time here.
>
> I dug down in history and according to commit 'drm/amdgpu:unmap KCQ in gfx hw_fini(v2)' you need to write to scratch register for completion of queue unmap operation so you defently don't want to just skip it. I agree in case that the ring is hung this has no point but remember that GPU reset can happen not only to a hunged ring but for other reasons (RAS, manual reset e.t.c.) in which case you probably want to shut down gracefully here ?
> I see we have adev->ip_blocks[i].status.hang flag which you maybe can use here instead ?
>
>
>>
>>> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  2 +-
>>>     drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 26 +++++++++++++++----------
>>>     2 files changed, 17 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> index 222d3d7ea076..f872495ccc3a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> @@ -477,7 +477,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
>>>     		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
>>>     					   RESET_QUEUES, 0, 0);
>>>     
>>> -	if (adev->gfx.kiq.ring.sched.ready)
>>> +	if (adev->gfx.kiq.ring.sched.ready && !amdgpu_in_reset(adev))
>>>     		r = amdgpu_ring_test_helper(kiq_ring);
>>>     	spin_unlock(&adev->gfx.kiq.ring_lock);
>>>     
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index fafbad3cf08d..9ae29023e38f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -5971,16 +5971,19 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
>>>     		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
>>>     	}
>>>     
>>> -	for (i = 0; i < adev->usec_timeout; i++) {
>>> -		if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
>>> -			break;
>>> -		udelay(1);
>>> -	}
>>> -
>>> -	if (i >= adev->usec_timeout)
>>> -		DRM_ERROR("failed to %s cp gfx\n", enable ? "unhalt" : "halt");
>>> +	if (!amdgpu_in_reset(adev)) {
>>> +		for (i = 0; i < adev->usec_timeout; i++) {
>>> +			if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
>>> +				break;
>>> +			udelay(1);
>>> +		}
>>>     
>>> +		if (i >= adev->usec_timeout)
>>> +			DRM_ERROR("failed to %s cp gfx\n",
>>> +				  enable ? "unhalt" : "halt");
>>> +	}
>>>     	return 0;
>>> +
>>>     }
>> This change has impact beyond container case no ? We had no issue with this code during regular reset cases so why we would give up on this code which confirms CP is idle ? What is the side effect of skipping this during all GPU resets ?
>>
>> Andrey
>>
>> [Victor]: I see "failed to halt cp gfx" with regular reset cases as well when doing a gfx hang test using quark. I haven't seen a side effect with Mode1 reset yet but maybe shorten the wait time could be better?
>
> Same as above i guess, it would indeed time out for a hung ring but GPU reset happens not only because of hung rings but for other reasons.
>
> Andrey
>
>
>>>     
>>>     static int gfx_v10_0_cp_gfx_load_pfp_microcode(struct
>>> amdgpu_device
>>> *adev) @@ -7569,8 +7572,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
>>>     	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>>>     		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
>>>     					   PREEMPT_QUEUES, 0, 0);
>>> -
>>> -	return amdgpu_ring_test_helper(kiq_ring);
>>> +	if (!amdgpu_in_reset(adev))
>>> +		return amdgpu_ring_test_helper(kiq_ring);
>>> +	else
>>> +		return 0;
>>>     }
>>>     #endif
>>>     
>>> @@ -7610,6 +7615,7 @@ static int gfx_v10_0_hw_fini(void *handle)
>>>     
>>>     		return 0;
>>>     	}
>>> +
>>>     	gfx_v10_0_cp_enable(adev, false);
>>>     	gfx_v10_0_enable_gui_idle_interrupt(adev, false);
>>>     

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2022-07-27 14:37 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-22  7:33 [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
2022-07-22  7:34 ` [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level Victor Zhao
2022-07-22  8:19   ` Christian König
2022-07-25 10:45     ` Zhao, Victor
2022-07-25 17:37       ` Christian König
2022-07-25 21:07         ` Andrey Grodzovsky
2022-07-22  7:34 ` [PATCH 3/5] drm/amdgpu: save and restore gc hub regs Victor Zhao
2022-07-25 21:18   ` Andrey Grodzovsky
2022-07-22  7:34 ` [PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset Victor Zhao
2022-07-25 21:19   ` Andrey Grodzovsky
2022-07-26 10:01     ` Zhao, Victor
2022-07-26 16:59       ` Andrey Grodzovsky
2022-07-22  7:34 ` [PATCH 5/5] drm/amdgpu: reduce reset time Victor Zhao
2022-07-25 21:17   ` Andrey Grodzovsky
2022-07-26  9:40     ` Zhao, Victor
2022-07-26 16:57       ` Andrey Grodzovsky
2022-07-27 10:35         ` Zhao, Victor
2022-07-27 14:34           ` Andrey Grodzovsky
2022-07-25 21:05 ` [PATCH 1/5] drm/amdgpu: add mode2 reset for sienna_cichlid Andrey Grodzovsky
2022-07-26 10:04   ` Zhao, Victor

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.