All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid
@ 2022-07-28 10:30 Victor Zhao
  2022-07-28 10:30 ` [PATCH v2 2/6] drm/amdgpu: let mode2 reset fallback to default when failure Victor Zhao
                   ` (6 more replies)
  0 siblings, 7 replies; 13+ messages in thread
From: Victor Zhao @ 2022-07-28 10:30 UTC (permalink / raw)
  To: amd-gfx; +Cc: emily.deng, Victor Zhao, Andrey.Grodzovsky

To meet the requirement for multi container usecase which needs
a quicker reset and not causing VRAM lost, adding the Mode2
reset handler for sienna_cichlid.

v2: move skip mode2 flag part separately

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/Makefile           |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c     |   7 +
 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c   | 297 ++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h   |  32 ++
 .../pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h  |   4 +-
 drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  |   3 +-
 .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   |  54 ++++
 7 files changed, 395 insertions(+), 4 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index c7d0cd15b5ef..7030ac2d7d2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -75,7 +75,7 @@ amdgpu-y += \
 	vi.o mxgpu_vi.o nbio_v6_1.o soc15.o emu_soc.o mxgpu_ai.o nbio_v7_0.o vega10_reg_init.o \
 	vega20_reg_init.o nbio_v7_4.o nbio_v2_3.o nv.o arct_reg_init.o mxgpu_nv.o \
 	nbio_v7_2.o hdp_v4_0.o hdp_v5_0.o aldebaran_reg_init.o aldebaran.o soc21.o \
-	nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o lsdma_v6_0.o
+	sienna_cichlid.o nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o lsdma_v6_0.o
 
 # add DF block
 amdgpu-y += \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 32c86a0b145c..f778466bb9db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -23,6 +23,7 @@
 
 #include "amdgpu_reset.h"
 #include "aldebaran.h"
+#include "sienna_cichlid.h"
 
 int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl,
 			     struct amdgpu_reset_handler *handler)
@@ -40,6 +41,9 @@ int amdgpu_reset_init(struct amdgpu_device *adev)
 	case IP_VERSION(13, 0, 2):
 		ret = aldebaran_reset_init(adev);
 		break;
+	case IP_VERSION(11, 0, 7):
+		ret = sienna_cichlid_reset_init(adev);
+		break;
 	default:
 		break;
 	}
@@ -55,6 +59,9 @@ int amdgpu_reset_fini(struct amdgpu_device *adev)
 	case IP_VERSION(13, 0, 2):
 		ret = aldebaran_reset_fini(adev);
 		break;
+	case IP_VERSION(11, 0, 7):
+		ret = sienna_cichlid_reset_fini(adev);
+		break;
 	default:
 		break;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
new file mode 100644
index 000000000000..0512960bed23
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "sienna_cichlid.h"
+#include "amdgpu_reset.h"
+#include "amdgpu_amdkfd.h"
+#include "amdgpu_dpm.h"
+#include "amdgpu_job.h"
+#include "amdgpu_ring.h"
+#include "amdgpu_ras.h"
+#include "amdgpu_psp.h"
+#include "amdgpu_xgmi.h"
+
+static struct amdgpu_reset_handler *
+sienna_cichlid_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
+			    struct amdgpu_reset_context *reset_context)
+{
+	struct amdgpu_reset_handler *handler;
+	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
+
+	if (reset_context->method != AMD_RESET_METHOD_NONE) {
+		list_for_each_entry(handler, &reset_ctl->reset_handlers,
+				     handler_list) {
+			if (handler->reset_method == reset_context->method)
+				return handler;
+		}
+	} else {
+		list_for_each_entry(handler, &reset_ctl->reset_handlers,
+				     handler_list) {
+			if (handler->reset_method == AMD_RESET_METHOD_MODE2 &&
+			    adev->pm.fw_version >= 0x3a5500 &&
+			    !amdgpu_sriov_vf(adev)) {
+				reset_context->method = AMD_RESET_METHOD_MODE2;
+				return handler;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+static int sienna_cichlid_mode2_suspend_ip(struct amdgpu_device *adev)
+{
+	int r, i;
+
+	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
+	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
+
+	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
+		if (!(adev->ip_blocks[i].version->type ==
+			      AMD_IP_BLOCK_TYPE_GFX ||
+		      adev->ip_blocks[i].version->type ==
+			      AMD_IP_BLOCK_TYPE_SDMA))
+			continue;
+
+		r = adev->ip_blocks[i].version->funcs->suspend(adev);
+
+		if (r) {
+			dev_err(adev->dev,
+				"suspend of IP block <%s> failed %d\n",
+				adev->ip_blocks[i].version->funcs->name, r);
+			return r;
+		}
+		adev->ip_blocks[i].status.hw = false;
+	}
+
+	return r;
+}
+
+static int
+sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
+				  struct amdgpu_reset_context *reset_context)
+{
+	int r = 0;
+	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
+
+	if (!amdgpu_sriov_vf(adev))
+		r = sienna_cichlid_mode2_suspend_ip(adev);
+
+	return r;
+}
+
+static void sienna_cichlid_async_reset(struct work_struct *work)
+{
+	struct amdgpu_reset_handler *handler;
+	struct amdgpu_reset_control *reset_ctl =
+		container_of(work, struct amdgpu_reset_control, reset_work);
+	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
+
+	list_for_each_entry(handler, &reset_ctl->reset_handlers,
+			     handler_list) {
+		if (handler->reset_method == reset_ctl->active_reset) {
+			dev_dbg(adev->dev, "Resetting device\n");
+			handler->do_reset(adev);
+			break;
+		}
+	}
+}
+
+static int sienna_cichlid_mode2_reset(struct amdgpu_device *adev)
+{
+	/* disable BM */
+	pci_clear_master(adev->pdev);
+	adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev);
+	return adev->asic_reset_res;
+}
+
+static int
+sienna_cichlid_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
+			      struct amdgpu_reset_context *reset_context)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
+	int r;
+
+	r = sienna_cichlid_mode2_reset(adev);
+	if (r) {
+		dev_err(adev->dev,
+			"ASIC reset failed with error, %d ", r);
+	}
+	return r;
+}
+
+static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device *adev)
+{
+	int i, r;
+	struct psp_context *psp = &adev->psp;
+
+	r = psp_rlc_autoload_start(psp);
+	if (r) {
+		dev_err(adev->dev, "Failed to start rlc autoload\n");
+		return r;
+	}
+
+	/* Reinit GFXHUB */
+	adev->gfxhub.funcs->init(adev);
+	r = adev->gfxhub.funcs->gart_enable(adev);
+	if (r) {
+		dev_err(adev->dev, "GFXHUB gart reenable failed after reset\n");
+		return r;
+	}
+
+	for (i = 0; i < adev->num_ip_blocks; i++) {
+		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
+			r = adev->ip_blocks[i].version->funcs->resume(adev);
+		if (r) {
+			dev_err(adev->dev,
+				"resume of IP block <%s> failed %d\n",
+				adev->ip_blocks[i].version->funcs->name, r);
+			return r;
+		}
+
+		adev->ip_blocks[i].status.hw = true;
+	}
+
+	for (i = 0; i < adev->num_ip_blocks; i++) {
+		if (!(adev->ip_blocks[i].version->type ==
+			      AMD_IP_BLOCK_TYPE_GFX ||
+		      adev->ip_blocks[i].version->type ==
+			      AMD_IP_BLOCK_TYPE_SDMA))
+			continue;
+		r = adev->ip_blocks[i].version->funcs->resume(adev);
+		if (r) {
+			dev_err(adev->dev,
+				"resume of IP block <%s> failed %d\n",
+				adev->ip_blocks[i].version->funcs->name, r);
+			return r;
+		}
+
+		adev->ip_blocks[i].status.hw = true;
+	}
+
+	for (i = 0; i < adev->num_ip_blocks; i++) {
+		if (!(adev->ip_blocks[i].version->type ==
+			      AMD_IP_BLOCK_TYPE_GFX ||
+		      adev->ip_blocks[i].version->type ==
+			      AMD_IP_BLOCK_TYPE_SDMA))
+			continue;
+
+		if (adev->ip_blocks[i].version->funcs->late_init) {
+			r = adev->ip_blocks[i].version->funcs->late_init(
+				(void *)adev);
+			if (r) {
+				dev_err(adev->dev,
+					"late_init of IP block <%s> failed %d after reset\n",
+					adev->ip_blocks[i].version->funcs->name,
+					r);
+				return r;
+			}
+		}
+		adev->ip_blocks[i].status.late_initialized = true;
+	}
+
+	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
+	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
+
+	return r;
+}
+
+static int
+sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
+				  struct amdgpu_reset_context *reset_context)
+{
+	int r;
+	struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl->handle;
+
+	dev_info(tmp_adev->dev,
+			"GPU reset succeeded, trying to resume\n");
+	r = sienna_cichlid_mode2_restore_ip(tmp_adev);
+	if (r)
+		goto end;
+
+	/*
+	* Add this ASIC as tracked as reset was already
+	* complete successfully.
+	*/
+	amdgpu_register_gpu_instance(tmp_adev);
+
+	/* Resume RAS */
+	amdgpu_ras_resume(tmp_adev);
+
+	amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
+
+	r = amdgpu_ib_ring_tests(tmp_adev);
+	if (r) {
+		dev_err(tmp_adev->dev,
+			"ib ring test failed (%d).\n", r);
+		r = -EAGAIN;
+		tmp_adev->asic_reset_res = r;
+		goto end;
+	}
+
+end:
+	if (r)
+		return -EAGAIN;
+	else
+		return r;
+}
+
+static struct amdgpu_reset_handler sienna_cichlid_mode2_handler = {
+	.reset_method		= AMD_RESET_METHOD_MODE2,
+	.prepare_env		= NULL,
+	.prepare_hwcontext	= sienna_cichlid_mode2_prepare_hwcontext,
+	.perform_reset		= sienna_cichlid_mode2_perform_reset,
+	.restore_hwcontext	= sienna_cichlid_mode2_restore_hwcontext,
+	.restore_env		= NULL,
+	.do_reset		= sienna_cichlid_mode2_reset,
+};
+
+int sienna_cichlid_reset_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_reset_control *reset_ctl;
+
+	reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);
+	if (!reset_ctl)
+		return -ENOMEM;
+
+	reset_ctl->handle = adev;
+	reset_ctl->async_reset = sienna_cichlid_async_reset;
+	reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
+	reset_ctl->get_reset_handler = sienna_cichlid_get_reset_handler;
+
+	INIT_LIST_HEAD(&reset_ctl->reset_handlers);
+	INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);
+	/* Only mode2 is handled through reset control now */
+	amdgpu_reset_add_handler(reset_ctl, &sienna_cichlid_mode2_handler);
+
+	adev->reset_cntl = reset_ctl;
+
+	return 0;
+}
+
+int sienna_cichlid_reset_fini(struct amdgpu_device *adev)
+{
+	kfree(adev->reset_cntl);
+	adev->reset_cntl = NULL;
+	return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
new file mode 100644
index 000000000000..5213b162dacd
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __SIENNA_CICHLID_H__
+#define __SIENNA_CICHLID_H__
+
+#include "amdgpu.h"
+
+int sienna_cichlid_reset_init(struct amdgpu_device *adev);
+int sienna_cichlid_reset_fini(struct amdgpu_device *adev);
+
+#endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
index d2e10a724560..82cf9e563065 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
@@ -137,7 +137,7 @@
 #define PPSMC_MSG_DisallowGpo                    0x56
 
 #define PPSMC_MSG_Enable2ndUSB20Port             0x57
-
-#define PPSMC_Message_Count                      0x58
+#define PPSMC_MSG_DriverMode2Reset               0x5D
+#define PPSMC_Message_Count                      0x5E
 
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
index 19084a4fcb2b..28f6a1eb6945 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
@@ -235,7 +235,8 @@
 	__SMU_DUMMY_MAP(UnforceGfxVid),           \
 	__SMU_DUMMY_MAP(HeavySBR),			\
 	__SMU_DUMMY_MAP(SetBadHBMPagesRetiredFlagsPerChannel), \
-	__SMU_DUMMY_MAP(EnableGfxImu),
+	__SMU_DUMMY_MAP(EnableGfxImu), \
+	__SMU_DUMMY_MAP(DriverMode2Reset),
 
 #undef __SMU_DUMMY_MAP
 #define __SMU_DUMMY_MAP(type)	SMU_MSG_##type
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index fa520d79ef67..a73d241bb64f 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -154,6 +154,7 @@ static struct cmn2asic_msg_mapping sienna_cichlid_message_map[SMU_MSG_MAX_COUNT]
 	MSG_MAP(SetGpoFeaturePMask,		PPSMC_MSG_SetGpoFeaturePMask,          0),
 	MSG_MAP(DisallowGpo,			PPSMC_MSG_DisallowGpo,                 0),
 	MSG_MAP(Enable2ndUSB20Port,		PPSMC_MSG_Enable2ndUSB20Port,          0),
+	MSG_MAP(DriverMode2Reset,		PPSMC_MSG_DriverMode2Reset,	       0),
 };
 
 static struct cmn2asic_mapping sienna_cichlid_clk_map[SMU_CLK_COUNT] = {
@@ -4254,6 +4255,57 @@ static int sienna_cichlid_stb_get_data_direct(struct smu_context *smu,
 	return 0;
 }
 
+static bool sienna_cichlid_is_mode2_reset_supported(struct smu_context *smu)
+{
+	return true;
+}
+
+static int sienna_cichlid_mode2_reset(struct smu_context *smu)
+{
+	u32 smu_version;
+	int ret = 0, index;
+	struct amdgpu_device *adev = smu->adev;
+	int timeout = 100;
+
+	smu_cmn_get_smc_version(smu, NULL, &smu_version);
+
+	index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
+						SMU_MSG_DriverMode2Reset);
+
+	mutex_lock(&smu->message_lock);
+
+	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
+					       SMU_RESET_MODE_2);
+
+	ret = smu_cmn_wait_for_response(smu);
+	while (ret != 0 && timeout) {
+		ret = smu_cmn_wait_for_response(smu);
+		/* Wait a bit more time for getting ACK */
+		if (ret != 0) {
+			--timeout;
+			usleep_range(500, 1000);
+			continue;
+		} else {
+			break;
+		}
+	}
+
+	if (!timeout) {
+		dev_err(adev->dev,
+			"failed to send mode2 message \tparam: 0x%08x response %#x\n",
+			SMU_RESET_MODE_2, ret);
+		goto out;
+	}
+
+	dev_info(smu->adev->dev, "restore config space...\n");
+	/* Restore the config space saved during init */
+	amdgpu_device_load_pci_state(adev->pdev);
+out:
+	mutex_unlock(&smu->message_lock);
+
+	return ret;
+}
+
 static const struct pptable_funcs sienna_cichlid_ppt_funcs = {
 	.get_allowed_feature_mask = sienna_cichlid_get_allowed_feature_mask,
 	.set_default_dpm_table = sienna_cichlid_set_default_dpm_table,
@@ -4348,6 +4400,8 @@ static const struct pptable_funcs sienna_cichlid_ppt_funcs = {
 	.get_default_config_table_settings = sienna_cichlid_get_default_config_table_settings,
 	.set_config_table = sienna_cichlid_set_config_table,
 	.get_unique_id = sienna_cichlid_get_unique_id,
+	.mode2_reset_is_support = sienna_cichlid_is_mode2_reset_supported,
+	.mode2_reset = sienna_cichlid_mode2_reset,
 };
 
 void sienna_cichlid_set_ppt_funcs(struct smu_context *smu)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 2/6] drm/amdgpu: let mode2 reset fallback to default when failure
  2022-07-28 10:30 [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
@ 2022-07-28 10:30 ` Victor Zhao
  2022-07-28 10:30 ` [PATCH v2 3/6] drm/amdgpu: add debugfs amdgpu_reset_level Victor Zhao
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 13+ messages in thread
From: Victor Zhao @ 2022-07-28 10:30 UTC (permalink / raw)
  To: amd-gfx; +Cc: emily.deng, Victor Zhao, Andrey.Grodzovsky

- introduce AMDGPU_SKIP_MODE2_RESET flag
- let mode2 reset fallback to default reset method if failed

v2: move this part out from the asic specific part

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 6 ++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      | 1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      | 1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      | 1 +
 9 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 5e53a5293935..091415a4abf0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -135,6 +135,7 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
 	reset_context.method = AMD_RESET_METHOD_NONE;
 	reset_context.reset_req_dev = adev;
 	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+	clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 041bd906449d..f03750b64f61 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5147,6 +5147,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 	reset_context->job = job;
 	reset_context->hive = hive;
+
 	/*
 	 * Build list of devices to reset.
 	 * In case we are in XGMI hive mode, resort the device list
@@ -5266,8 +5267,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			amdgpu_ras_resume(adev);
 	} else {
 		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
-		if (r && r == -EAGAIN)
+		if (r && r == -EAGAIN) {
+			set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags);
+			adev->asic_reset_res = 0;
 			goto retry;
+		}
 	}
 
 skip_hw_reset:
@@ -5695,6 +5699,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
 	reset_context.reset_req_dev = adev;
 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
+	set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 	adev->no_hw_access = true;
 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 5071b96be982..6c3e7290153f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -71,6 +71,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		reset_context.method = AMD_RESET_METHOD_NONE;
 		reset_context.reset_req_dev = adev;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
 		if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ff5361f5c2d4..ab9ba5a9c33d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1949,6 +1949,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 		reset_context.method = AMD_RESET_METHOD_NONE;
 		reset_context.reset_req_dev = adev;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index f778466bb9db..831fb222139c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -74,6 +74,9 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
 {
 	struct amdgpu_reset_handler *reset_handler = NULL;
 
+	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
+		return -ENOSYS;
+
 	if (adev->reset_cntl && adev->reset_cntl->get_reset_handler)
 		reset_handler = adev->reset_cntl->get_reset_handler(
 			adev->reset_cntl, reset_context);
@@ -90,6 +93,9 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
 	int ret;
 	struct amdgpu_reset_handler *reset_handler = NULL;
 
+	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
+		return -ENOSYS;
+
 	if (adev->reset_cntl)
 		reset_handler = adev->reset_cntl->get_reset_handler(
 			adev->reset_cntl, reset_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 9e55a5d7a825..cc4b2eeb24cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -30,6 +30,7 @@ enum AMDGPU_RESET_FLAGS {
 
 	AMDGPU_NEED_FULL_RESET = 0,
 	AMDGPU_SKIP_HW_RESET = 1,
+	AMDGPU_SKIP_MODE2_RESET = 2,
 };
 
 struct amdgpu_reset_context {
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 12906ba74462..a2f04b249132 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -290,6 +290,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 		reset_context.method = AMD_RESET_METHOD_NONE;
 		reset_context.reset_req_dev = adev;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index e07757eea7ad..a977f0027928 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -317,6 +317,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 		reset_context.method = AMD_RESET_METHOD_NONE;
 		reset_context.reset_req_dev = adev;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 288c414babdf..fd14fa9b9cd7 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
 		reset_context.method = AMD_RESET_METHOD_NONE;
 		reset_context.reset_req_dev = adev;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 		amdgpu_device_gpu_recover(adev, NULL, &reset_context);
 	}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 3/6] drm/amdgpu: add debugfs amdgpu_reset_level
  2022-07-28 10:30 [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
  2022-07-28 10:30 ` [PATCH v2 2/6] drm/amdgpu: let mode2 reset fallback to default when failure Victor Zhao
@ 2022-07-28 10:30 ` Victor Zhao
  2022-07-28 10:30 ` [PATCH v2 4/6] drm/amdgpu: save and restore gc hub regs Victor Zhao
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 13+ messages in thread
From: Victor Zhao @ 2022-07-28 10:30 UTC (permalink / raw)
  To: amd-gfx; +Cc: emily.deng, Victor Zhao, Andrey.Grodzovsky

Introduce amdgpu_reset_level debugfs in order to help debug and
test specific type of reset. Also helps blocking unwanted type of
resets.

By default, mode2 reset will not be enabled

v2: make this debugfs in adev and use debugfs_create_u32

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         | 5 +++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c   | 8 ++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    | 3 +++
 4 files changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e146810c700b..895d74c8aace 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -274,6 +274,9 @@ extern int amdgpu_vcnfw_log;
 #define AMDGPU_RESET_VCE			(1 << 13)
 #define AMDGPU_RESET_VCE1			(1 << 14)
 
+#define AMDGPU_RESET_LEVEL_SOFT_RECOVERY (1 << 0)
+#define AMDGPU_RESET_LEVEL_MODE2 (1 << 1)
+
 /* max cursor sizes (in pixels) */
 #define CIK_CURSOR_WIDTH 128
 #define CIK_CURSOR_HEIGHT 128
@@ -1060,6 +1063,8 @@ struct amdgpu_device {
 	uint32_t                        scpm_status;
 
 	struct work_struct		reset_work;
+
+	uint32_t						amdgpu_reset_level_mask;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index e2eec985adb3..c21b04463de8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1785,6 +1785,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 		return PTR_ERR(ent);
 	}
 
+	debugfs_create_u32("amdgpu_reset_level", 0600, root, &adev->amdgpu_reset_level_mask);
+
 	/* Register debugfs entries for amdgpu_ttm */
 	amdgpu_ttm_debugfs_init(adev);
 	amdgpu_debugfs_pm_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 831fb222139c..9da5ead50c90 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -37,6 +37,8 @@ int amdgpu_reset_init(struct amdgpu_device *adev)
 {
 	int ret = 0;
 
+	adev->amdgpu_reset_level_mask = 0x1;
+
 	switch (adev->ip_versions[MP1_HWIP][0]) {
 	case IP_VERSION(13, 0, 2):
 		ret = aldebaran_reset_init(adev);
@@ -74,6 +76,9 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
 {
 	struct amdgpu_reset_handler *reset_handler = NULL;
 
+	if (!(adev->amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
+		return -ENOSYS;
+
 	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
 		return -ENOSYS;
 
@@ -93,6 +98,9 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
 	int ret;
 	struct amdgpu_reset_handler *reset_handler = NULL;
 
+	if (!(adev->amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
+		return -ENOSYS;
+
 	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
 		return -ENOSYS;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index d3558c34d406..3e316b013fd9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -405,6 +405,9 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
 {
 	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
 
+	if (!(ring->adev->amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_SOFT_RECOVERY))
+		return false;
+
 	if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence)
 		return false;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 4/6] drm/amdgpu: save and restore gc hub regs
  2022-07-28 10:30 [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
  2022-07-28 10:30 ` [PATCH v2 2/6] drm/amdgpu: let mode2 reset fallback to default when failure Victor Zhao
  2022-07-28 10:30 ` [PATCH v2 3/6] drm/amdgpu: add debugfs amdgpu_reset_level Victor Zhao
@ 2022-07-28 10:30 ` Victor Zhao
  2022-07-28 10:30 ` [PATCH v2 5/6] drm/amdgpu: revert context to stop engine before mode2 reset Victor Zhao
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 13+ messages in thread
From: Victor Zhao @ 2022-07-28 10:30 UTC (permalink / raw)
  To: amd-gfx; +Cc: emily.deng, Victor Zhao, Andrey.Grodzovsky

Save and restore gfxhub regs as they will be reset during mode 2

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h    |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h       | 26 +++++++
 drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c      | 72 +++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c   |  7 +-
 .../include/asic_reg/gc/gc_10_3_0_offset.h    |  4 ++
 5 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
index beabab515836..f8036f2b100e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
@@ -35,6 +35,8 @@ struct amdgpu_gfxhub_funcs {
 	void (*init)(struct amdgpu_device *adev);
 	int (*get_xgmi_info)(struct amdgpu_device *adev);
 	void (*utcl2_harvest)(struct amdgpu_device *adev);
+	void (*mode2_save_regs)(struct amdgpu_device *adev);
+	void (*mode2_restore_regs)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_gfxhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 008eaca27151..0305b660cd17 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -264,6 +264,32 @@ struct amdgpu_gmc {
 	u64 mall_size;
 	/* number of UMC instances */
 	int num_umc;
+	/* mode2 save restore */
+	u64 VM_L2_CNTL;
+	u64 VM_L2_CNTL2;
+	u64 VM_DUMMY_PAGE_FAULT_CNTL;
+	u64 VM_DUMMY_PAGE_FAULT_ADDR_LO32;
+	u64 VM_DUMMY_PAGE_FAULT_ADDR_HI32;
+	u64 VM_L2_PROTECTION_FAULT_CNTL;
+	u64 VM_L2_PROTECTION_FAULT_CNTL2;
+	u64 VM_L2_PROTECTION_FAULT_MM_CNTL3;
+	u64 VM_L2_PROTECTION_FAULT_MM_CNTL4;
+	u64 VM_L2_PROTECTION_FAULT_ADDR_LO32;
+	u64 VM_L2_PROTECTION_FAULT_ADDR_HI32;
+	u64 VM_DEBUG;
+	u64 VM_L2_MM_GROUP_RT_CLASSES;
+	u64 VM_L2_BANK_SELECT_RESERVED_CID;
+	u64 VM_L2_BANK_SELECT_RESERVED_CID2;
+	u64 VM_L2_CACHE_PARITY_CNTL;
+	u64 VM_L2_IH_LOG_CNTL;
+	u64 VM_CONTEXT_CNTL[16];
+	u64 VM_CONTEXT_PAGE_TABLE_BASE_ADDR_LO32[16];
+	u64 VM_CONTEXT_PAGE_TABLE_BASE_ADDR_HI32[16];
+	u64 VM_CONTEXT_PAGE_TABLE_START_ADDR_LO32[16];
+	u64 VM_CONTEXT_PAGE_TABLE_START_ADDR_HI32[16];
+	u64 VM_CONTEXT_PAGE_TABLE_END_ADDR_LO32[16];
+	u64 VM_CONTEXT_PAGE_TABLE_END_ADDR_HI32[16];
+	u64 MC_VM_MX_L1_TLB_CNTL;
 };
 
 #define amdgpu_gmc_flush_gpu_tlb(adev, vmid, vmhub, type) ((adev)->gmc.gmc_funcs->flush_gpu_tlb((adev), (vmid), (vmhub), (type)))
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
index d8c531581116..51cf8acd2d79 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
@@ -576,6 +576,76 @@ static void gfxhub_v2_1_utcl2_harvest(struct amdgpu_device *adev)
 	}
 }
 
+static void gfxhub_v2_1_save_regs(struct amdgpu_device *adev)
+{
+	int i;
+	adev->gmc.VM_L2_CNTL = RREG32_SOC15(GC, 0, mmGCVM_L2_CNTL);
+	adev->gmc.VM_L2_CNTL2 = RREG32_SOC15(GC, 0, mmGCVM_L2_CNTL2);
+	adev->gmc.VM_DUMMY_PAGE_FAULT_CNTL = RREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_CNTL);
+	adev->gmc.VM_DUMMY_PAGE_FAULT_ADDR_LO32 = RREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_ADDR_LO32);
+	adev->gmc.VM_DUMMY_PAGE_FAULT_ADDR_HI32 = RREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_ADDR_HI32);
+	adev->gmc.VM_L2_PROTECTION_FAULT_CNTL = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL);
+	adev->gmc.VM_L2_PROTECTION_FAULT_CNTL2 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL2);
+	adev->gmc.VM_L2_PROTECTION_FAULT_MM_CNTL3 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_MM_CNTL3);
+	adev->gmc.VM_L2_PROTECTION_FAULT_MM_CNTL4 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_MM_CNTL4);
+	adev->gmc.VM_L2_PROTECTION_FAULT_ADDR_LO32 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_ADDR_LO32);
+	adev->gmc.VM_L2_PROTECTION_FAULT_ADDR_HI32 = RREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_ADDR_HI32);
+	adev->gmc.VM_DEBUG = RREG32_SOC15(GC, 0, mmGCVM_DEBUG);
+	adev->gmc.VM_L2_MM_GROUP_RT_CLASSES = RREG32_SOC15(GC, 0, mmGCVM_L2_MM_GROUP_RT_CLASSES);
+	adev->gmc.VM_L2_BANK_SELECT_RESERVED_CID = RREG32_SOC15(GC, 0, mmGCVM_L2_BANK_SELECT_RESERVED_CID);
+	adev->gmc.VM_L2_BANK_SELECT_RESERVED_CID2 = RREG32_SOC15(GC, 0, mmGCVM_L2_BANK_SELECT_RESERVED_CID2);
+	adev->gmc.VM_L2_CACHE_PARITY_CNTL = RREG32_SOC15(GC, 0, mmGCVM_L2_CACHE_PARITY_CNTL);
+	adev->gmc.VM_L2_IH_LOG_CNTL = RREG32_SOC15(GC, 0, mmGCVM_L2_IH_LOG_CNTL);
+
+	for (i = 0; i <= 15; i++) {
+		adev->gmc.VM_CONTEXT_CNTL[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_CNTL, i);
+		adev->gmc.VM_CONTEXT_PAGE_TABLE_BASE_ADDR_LO32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32, i * 2);
+		adev->gmc.VM_CONTEXT_PAGE_TABLE_BASE_ADDR_HI32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32, i * 2);
+		adev->gmc.VM_CONTEXT_PAGE_TABLE_START_ADDR_LO32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32, i * 2);
+		adev->gmc.VM_CONTEXT_PAGE_TABLE_START_ADDR_HI32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32, i * 2);
+		adev->gmc.VM_CONTEXT_PAGE_TABLE_END_ADDR_LO32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32, i * 2);
+		adev->gmc.VM_CONTEXT_PAGE_TABLE_END_ADDR_HI32[i] = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32, i * 2);
+	}
+
+	adev->gmc.MC_VM_MX_L1_TLB_CNTL = RREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL);
+}
+
+static void gfxhub_v2_1_restore_regs(struct amdgpu_device *adev)
+{
+	int i;
+	WREG32_SOC15(GC, 0, mmGCVM_L2_CNTL, adev->gmc.VM_L2_CNTL);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_CNTL2, adev->gmc.VM_L2_CNTL2);
+	WREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_CNTL, adev->gmc.VM_DUMMY_PAGE_FAULT_CNTL);
+	WREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_ADDR_LO32, adev->gmc.VM_DUMMY_PAGE_FAULT_ADDR_LO32);
+	WREG32_SOC15(GC, 0, mmGCVM_DUMMY_PAGE_FAULT_ADDR_HI32, adev->gmc.VM_DUMMY_PAGE_FAULT_ADDR_HI32);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL, adev->gmc.VM_L2_PROTECTION_FAULT_CNTL);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL2, adev->gmc.VM_L2_PROTECTION_FAULT_CNTL2);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_MM_CNTL3, adev->gmc.VM_L2_PROTECTION_FAULT_MM_CNTL3);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_MM_CNTL4, adev->gmc.VM_L2_PROTECTION_FAULT_MM_CNTL4);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_ADDR_LO32, adev->gmc.VM_L2_PROTECTION_FAULT_ADDR_LO32);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_PROTECTION_FAULT_ADDR_HI32, adev->gmc.VM_L2_PROTECTION_FAULT_ADDR_HI32);
+	WREG32_SOC15(GC, 0, mmGCVM_DEBUG, adev->gmc.VM_DEBUG);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_MM_GROUP_RT_CLASSES, adev->gmc.VM_L2_MM_GROUP_RT_CLASSES);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_BANK_SELECT_RESERVED_CID, adev->gmc.VM_L2_BANK_SELECT_RESERVED_CID);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_BANK_SELECT_RESERVED_CID2, adev->gmc.VM_L2_BANK_SELECT_RESERVED_CID2);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_CACHE_PARITY_CNTL, adev->gmc.VM_L2_CACHE_PARITY_CNTL);
+	WREG32_SOC15(GC, 0, mmGCVM_L2_IH_LOG_CNTL, adev->gmc.VM_L2_IH_LOG_CNTL);
+
+	for (i = 0; i <= 15; i++) {
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_CNTL, i, adev->gmc.VM_CONTEXT_CNTL[i]);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_BASE_ADDR_LO32[i]);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_BASE_ADDR_HI32[i]);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_START_ADDR_LO32[i]);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_START_ADDR_HI32[i]);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_END_ADDR_LO32[i]);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32, i * 2, adev->gmc.VM_CONTEXT_PAGE_TABLE_END_ADDR_HI32[i]);
+	}
+
+	WREG32_SOC15(GC, 0, mmGCMC_VM_FB_LOCATION_BASE, adev->gmc.vram_start >> 24);
+	WREG32_SOC15(GC, 0, mmGCMC_VM_FB_LOCATION_TOP, adev->gmc.vram_end >> 24);
+	WREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL, adev->gmc.MC_VM_MX_L1_TLB_CNTL);
+}
+
 const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
 	.get_fb_location = gfxhub_v2_1_get_fb_location,
 	.get_mc_fb_offset = gfxhub_v2_1_get_mc_fb_offset,
@@ -586,4 +656,6 @@ const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
 	.init = gfxhub_v2_1_init,
 	.get_xgmi_info = gfxhub_v2_1_get_xgmi_info,
 	.utcl2_harvest = gfxhub_v2_1_utcl2_harvest,
+	.mode2_save_regs = gfxhub_v2_1_save_regs,
+	.mode2_restore_regs = gfxhub_v2_1_restore_regs,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
index 0512960bed23..51a5b68f77d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
+++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
@@ -94,8 +94,11 @@ sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
 	int r = 0;
 	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
 
-	if (!amdgpu_sriov_vf(adev))
+	if (!amdgpu_sriov_vf(adev)) {
+		if (adev->gfxhub.funcs->mode2_save_regs)
+			adev->gfxhub.funcs->mode2_save_regs(adev);
 		r = sienna_cichlid_mode2_suspend_ip(adev);
+	}
 
 	return r;
 }
@@ -152,6 +155,8 @@ static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device *adev)
 	}
 
 	/* Reinit GFXHUB */
+	if (adev->gfxhub.funcs->mode2_restore_regs)
+		adev->gfxhub.funcs->mode2_restore_regs(adev);
 	adev->gfxhub.funcs->init(adev);
 	r = adev->gfxhub.funcs->gart_enable(adev);
 	if (r) {
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
index f21554a1c86c..594bffce93a9 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
@@ -3129,6 +3129,8 @@
 #define mmGCVM_L2_PROTECTION_FAULT_DEFAULT_ADDR_LO32_BASE_IDX                                          0
 #define mmGCVM_L2_PROTECTION_FAULT_DEFAULT_ADDR_HI32                                                   0x15cc
 #define mmGCVM_L2_PROTECTION_FAULT_DEFAULT_ADDR_HI32_BASE_IDX                                          0
+#define mmGCVM_DEBUG                                                                                   0x15cd
+#define mmGCVM_DEBUG_BASE_IDX                                                                          0
 #define mmGCVM_L2_CONTEXT1_IDENTITY_APERTURE_LOW_ADDR_LO32                                             0x15ce
 #define mmGCVM_L2_CONTEXT1_IDENTITY_APERTURE_LOW_ADDR_LO32_BASE_IDX                                    0
 #define mmGCVM_L2_CONTEXT1_IDENTITY_APERTURE_LOW_ADDR_HI32                                             0x15cf
@@ -3151,6 +3153,8 @@
 #define mmGCVM_L2_BANK_SELECT_RESERVED_CID2_BASE_IDX                                                   0
 #define mmGCVM_L2_CACHE_PARITY_CNTL                                                                    0x15d8
 #define mmGCVM_L2_CACHE_PARITY_CNTL_BASE_IDX                                                           0
+#define mmGCVM_L2_IH_LOG_CNTL                                                                          0x15d9
+#define mmGCVM_L2_IH_LOG_CNTL_BASE_IDX                                                                 0
 #define mmGCVM_L2_CNTL5                                                                                0x15dc
 #define mmGCVM_L2_CNTL5_BASE_IDX                                                                       0
 #define mmGCVM_L2_GCR_CNTL                                                                             0x15dd
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 5/6] drm/amdgpu: revert context to stop engine before mode2 reset
  2022-07-28 10:30 [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
                   ` (2 preceding siblings ...)
  2022-07-28 10:30 ` [PATCH v2 4/6] drm/amdgpu: save and restore gc hub regs Victor Zhao
@ 2022-07-28 10:30 ` Victor Zhao
  2022-07-28 10:30 ` [PATCH v2 6/6] drm/amdgpu: reduce reset time Victor Zhao
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 13+ messages in thread
From: Victor Zhao @ 2022-07-28 10:30 UTC (permalink / raw)
  To: amd-gfx; +Cc: emily.deng, Victor Zhao, Andrey.Grodzovsky

For some hang caused by slow tests, engine cannot be stopped which
may cause resume failure after reset. In this case, force halt
engine by reverting context addresses

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c    | 36 +++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c |  2 ++
 4 files changed, 40 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f03750b64f61..debbba842fce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5038,6 +5038,7 @@ static void amdgpu_device_recheck_guilty_jobs(
 
 			/* set guilty */
 			drm_sched_increase_karma(s_job);
+			amdgpu_reset_prepare_hwcontext(adev, reset_context);
 retry:
 			/* do hw reset */
 			if (amdgpu_sriov_vf(adev)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
index f8036f2b100e..c7b44aeb671b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
@@ -37,6 +37,7 @@ struct amdgpu_gfxhub_funcs {
 	void (*utcl2_harvest)(struct amdgpu_device *adev);
 	void (*mode2_save_regs)(struct amdgpu_device *adev);
 	void (*mode2_restore_regs)(struct amdgpu_device *adev);
+	void (*halt)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_gfxhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
index 51cf8acd2d79..8cf53e039c11 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
@@ -646,6 +646,41 @@ static void gfxhub_v2_1_restore_regs(struct amdgpu_device *adev)
 	WREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL, adev->gmc.MC_VM_MX_L1_TLB_CNTL);
 }
 
+static void gfxhub_v2_1_halt(struct amdgpu_device *adev)
+{
+	struct amdgpu_vmhub *hub = &adev->vmhub[AMDGPU_GFXHUB_0];
+	int i;
+	uint32_t tmp;
+	int time = 1000;
+
+	gfxhub_v2_1_set_fault_enable_default(adev, false);
+
+	for (i = 0; i <= 14; i++) {
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32,
+				    i * hub->ctx_addr_distance, ~0);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32,
+				    i * hub->ctx_addr_distance, ~0);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_LO32,
+				    i * hub->ctx_addr_distance,
+				    0);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_HI32,
+				    i * hub->ctx_addr_distance,
+				    0);
+	}
+	tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
+	while ((tmp & (GRBM_STATUS2__EA_BUSY_MASK |
+		      GRBM_STATUS2__EA_LINK_BUSY_MASK)) != 0 &&
+	       time) {
+		udelay(100);
+		time--;
+		tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
+	}
+
+	if (!time) {
+		DRM_WARN("failed to wait for GRBM(EA) idle\n");
+	}
+}
+
 const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
 	.get_fb_location = gfxhub_v2_1_get_fb_location,
 	.get_mc_fb_offset = gfxhub_v2_1_get_mc_fb_offset,
@@ -658,4 +693,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
 	.utcl2_harvest = gfxhub_v2_1_utcl2_harvest,
 	.mode2_save_regs = gfxhub_v2_1_save_regs,
 	.mode2_restore_regs = gfxhub_v2_1_restore_regs,
+	.halt = gfxhub_v2_1_halt,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
index 51a5b68f77d3..fead7251292f 100644
--- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
+++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
@@ -97,6 +97,8 @@ sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
 	if (!amdgpu_sriov_vf(adev)) {
 		if (adev->gfxhub.funcs->mode2_save_regs)
 			adev->gfxhub.funcs->mode2_save_regs(adev);
+		if (adev->gfxhub.funcs->halt)
+			adev->gfxhub.funcs->halt(adev);
 		r = sienna_cichlid_mode2_suspend_ip(adev);
 	}
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 6/6] drm/amdgpu: reduce reset time
  2022-07-28 10:30 [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
                   ` (3 preceding siblings ...)
  2022-07-28 10:30 ` [PATCH v2 5/6] drm/amdgpu: revert context to stop engine before mode2 reset Victor Zhao
@ 2022-07-28 10:30 ` Victor Zhao
  2022-07-28 18:37   ` Andrey Grodzovsky
  2022-07-29  6:11   ` Lazar, Lijo
  2022-07-28 18:39 ` [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid Andrey Grodzovsky
  2022-07-29  6:05 ` Quan, Evan
  6 siblings, 2 replies; 13+ messages in thread
From: Victor Zhao @ 2022-07-28 10:30 UTC (permalink / raw)
  To: amd-gfx; +Cc: emily.deng, Victor Zhao, Andrey.Grodzovsky

In multi container use case, reset time is important, so skip ring
tests and cp halt wait during ip suspending for reset as they are
going to fail and cost more time on reset

v2: add a hang flag to indicate the reset comes from a job timeout,
skip ring test and cp halt wait in this case

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c    | 11 +++++++++--
 5 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 222d3d7ea076..c735a17c6afb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -27,6 +27,7 @@
 #include "amdgpu_gfx.h"
 #include "amdgpu_rlc.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
 
 /* delay 0.1 second to enable gfx off feature */
 #define GFX_OFF_DELAY_ENABLE         msecs_to_jiffies(100)
@@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
 		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
 					   RESET_QUEUES, 0, 0);
 
-	if (adev->gfx.kiq.ring.sched.ready)
+	if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && adev->reset_domain->hang))
 		r = amdgpu_ring_test_helper(kiq_ring);
 	spin_unlock(&adev->gfx.kiq.ring_lock);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 6c3e7290153f..bb40880a557f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	}
 
 	memset(&ti, 0, sizeof(struct amdgpu_task_info));
+	adev->reset_domain->hang = true;
 
 	if (amdgpu_gpu_recovery &&
 	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
@@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	}
 
 exit:
+	adev->reset_domain->hang = false;
 	drm_dev_exit(idx);
 	return DRM_GPU_SCHED_STAT_NOMINAL;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 9da5ead50c90..b828fe773f50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -155,6 +155,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
 	atomic_set(&reset_domain->in_gpu_reset, 0);
 	atomic_set(&reset_domain->reset_res, 0);
 	init_rwsem(&reset_domain->sem);
+	reset_domain->hang = false;
 
 	return reset_domain;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index cc4b2eeb24cf..29e324add552 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -84,6 +84,7 @@ struct amdgpu_reset_domain {
 	struct rw_semaphore sem;
 	atomic_t in_gpu_reset;
 	atomic_t reset_res;
+	bool hang;
 };
 
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index fafbad3cf08d..a384e04d916c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -29,6 +29,7 @@
 #include "amdgpu.h"
 #include "amdgpu_gfx.h"
 #include "amdgpu_psp.h"
+#include "amdgpu_reset.h"
 #include "nv.h"
 #include "nvd.h"
 
@@ -5971,6 +5972,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
 		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
 	}
 
+	if ((amdgpu_in_reset(adev) && adev->reset_domain->hang) && !enable)
+		return 0;
+
 	for (i = 0; i < adev->usec_timeout; i++) {
 		if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
 			break;
@@ -7569,8 +7573,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
 	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
 		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
 					   PREEMPT_QUEUES, 0, 0);
-
-	return amdgpu_ring_test_helper(kiq_ring);
+	if (!(amdgpu_in_reset(adev) && adev->reset_domain->hang))
+		return amdgpu_ring_test_helper(kiq_ring);
+	else
+		return 0;
 }
 #endif
 
@@ -7610,6 +7616,7 @@ static int gfx_v10_0_hw_fini(void *handle)
 
 		return 0;
 	}
+
 	gfx_v10_0_cp_enable(adev, false);
 	gfx_v10_0_enable_gui_idle_interrupt(adev, false);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 6/6] drm/amdgpu: reduce reset time
  2022-07-28 10:30 ` [PATCH v2 6/6] drm/amdgpu: reduce reset time Victor Zhao
@ 2022-07-28 18:37   ` Andrey Grodzovsky
  2022-07-29  6:11   ` Lazar, Lijo
  1 sibling, 0 replies; 13+ messages in thread
From: Andrey Grodzovsky @ 2022-07-28 18:37 UTC (permalink / raw)
  To: Victor Zhao, amd-gfx; +Cc: emily.deng


On 2022-07-28 06:30, Victor Zhao wrote:
> In multi container use case, reset time is important, so skip ring
> tests and cp halt wait during ip suspending for reset as they are
> going to fail and cost more time on reset
>
> v2: add a hang flag to indicate the reset comes from a job timeout,
> skip ring test and cp halt wait in this case
>
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c    | 11 +++++++++--
>   5 files changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 222d3d7ea076..c735a17c6afb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -27,6 +27,7 @@
>   #include "amdgpu_gfx.h"
>   #include "amdgpu_rlc.h"
>   #include "amdgpu_ras.h"
> +#include "amdgpu_reset.h"
>   
>   /* delay 0.1 second to enable gfx off feature */
>   #define GFX_OFF_DELAY_ENABLE         msecs_to_jiffies(100)
> @@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
>   		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
>   					   RESET_QUEUES, 0, 0);
>   
> -	if (adev->gfx.kiq.ring.sched.ready)
> +	if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && adev->reset_domain->hang))


I think it's enough to look at adev->reset_domain->hang and you can drop 
the amdgpu_in_reset check.

>   		r = amdgpu_ring_test_helper(kiq_ring);
>   	spin_unlock(&adev->gfx.kiq.ring_lock);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 6c3e7290153f..bb40880a557f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	}
>   
>   	memset(&ti, 0, sizeof(struct amdgpu_task_info));
> +	adev->reset_domain->hang = true;
>   
>   	if (amdgpu_gpu_recovery &&
>   	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
> @@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	}
>   
>   exit:
> +	adev->reset_domain->hang = false;
>   	drm_dev_exit(idx);
>   	return DRM_GPU_SCHED_STAT_NOMINAL;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 9da5ead50c90..b828fe773f50 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -155,6 +155,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>   	atomic_set(&reset_domain->in_gpu_reset, 0);
>   	atomic_set(&reset_domain->reset_res, 0);
>   	init_rwsem(&reset_domain->sem);
> +	reset_domain->hang = false;
>   
>   	return reset_domain;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index cc4b2eeb24cf..29e324add552 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -84,6 +84,7 @@ struct amdgpu_reset_domain {
>   	struct rw_semaphore sem;
>   	atomic_t in_gpu_reset;
>   	atomic_t reset_res;
> +	bool hang;
>   };
>   
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index fafbad3cf08d..a384e04d916c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -29,6 +29,7 @@
>   #include "amdgpu.h"
>   #include "amdgpu_gfx.h"
>   #include "amdgpu_psp.h"
> +#include "amdgpu_reset.h"
>   #include "nv.h"
>   #include "nvd.h"
>   
> @@ -5971,6 +5972,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
>   		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
>   	}
>   
> +	if ((amdgpu_in_reset(adev) && adev->reset_domain->hang) && !enable)
> +		return 0;
> +


Same as above


>   	for (i = 0; i < adev->usec_timeout; i++) {
>   		if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
>   			break;
> @@ -7569,8 +7573,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
>   	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>   		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
>   					   PREEMPT_QUEUES, 0, 0);
> -
> -	return amdgpu_ring_test_helper(kiq_ring);
> +	if (!(amdgpu_in_reset(adev) && adev->reset_domain->hang))


Same as above

Andrey


> +		return amdgpu_ring_test_helper(kiq_ring);
> +	else
> +		return 0;
>   }
>   #endif
>   
> @@ -7610,6 +7616,7 @@ static int gfx_v10_0_hw_fini(void *handle)
>   
>   		return 0;
>   	}
> +
>   	gfx_v10_0_cp_enable(adev, false);
>   	gfx_v10_0_enable_gui_idle_interrupt(adev, false);
>   

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid
  2022-07-28 10:30 [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
                   ` (4 preceding siblings ...)
  2022-07-28 10:30 ` [PATCH v2 6/6] drm/amdgpu: reduce reset time Victor Zhao
@ 2022-07-28 18:39 ` Andrey Grodzovsky
  2022-07-29  6:05 ` Quan, Evan
  6 siblings, 0 replies; 13+ messages in thread
From: Andrey Grodzovsky @ 2022-07-28 18:39 UTC (permalink / raw)
  To: Victor Zhao, amd-gfx; +Cc: emily.deng


On 2022-07-28 06:30, Victor Zhao wrote:
> To meet the requirement for multi container usecase which needs
> a quicker reset and not causing VRAM lost, adding the Mode2
> reset handler for sienna_cichlid.
>
> v2: move skip mode2 flag part separately
>
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/Makefile           |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c     |   7 +
>   drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c   | 297 ++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h   |  32 ++
>   .../pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h  |   4 +-
>   drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  |   3 +-
>   .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   |  54 ++++
>   7 files changed, 395 insertions(+), 4 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
> index c7d0cd15b5ef..7030ac2d7d2c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -75,7 +75,7 @@ amdgpu-y += \
>   	vi.o mxgpu_vi.o nbio_v6_1.o soc15.o emu_soc.o mxgpu_ai.o nbio_v7_0.o vega10_reg_init.o \
>   	vega20_reg_init.o nbio_v7_4.o nbio_v2_3.o nv.o arct_reg_init.o mxgpu_nv.o \
>   	nbio_v7_2.o hdp_v4_0.o hdp_v5_0.o aldebaran_reg_init.o aldebaran.o soc21.o \
> -	nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o lsdma_v6_0.o
> +	sienna_cichlid.o nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o lsdma_v6_0.o
>   
>   # add DF block
>   amdgpu-y += \
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 32c86a0b145c..f778466bb9db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -23,6 +23,7 @@
>   
>   #include "amdgpu_reset.h"
>   #include "aldebaran.h"
> +#include "sienna_cichlid.h"
>   
>   int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl,
>   			     struct amdgpu_reset_handler *handler)
> @@ -40,6 +41,9 @@ int amdgpu_reset_init(struct amdgpu_device *adev)
>   	case IP_VERSION(13, 0, 2):
>   		ret = aldebaran_reset_init(adev);
>   		break;
> +	case IP_VERSION(11, 0, 7):
> +		ret = sienna_cichlid_reset_init(adev);
> +		break;
>   	default:
>   		break;
>   	}
> @@ -55,6 +59,9 @@ int amdgpu_reset_fini(struct amdgpu_device *adev)
>   	case IP_VERSION(13, 0, 2):
>   		ret = aldebaran_reset_fini(adev);
>   		break;
> +	case IP_VERSION(11, 0, 7):
> +		ret = sienna_cichlid_reset_fini(adev);
> +		break;
>   	default:
>   		break;
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> new file mode 100644
> index 000000000000..0512960bed23
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> @@ -0,0 +1,297 @@
> +/*
> + * Copyright 2021 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#include "sienna_cichlid.h"
> +#include "amdgpu_reset.h"
> +#include "amdgpu_amdkfd.h"
> +#include "amdgpu_dpm.h"
> +#include "amdgpu_job.h"
> +#include "amdgpu_ring.h"
> +#include "amdgpu_ras.h"
> +#include "amdgpu_psp.h"
> +#include "amdgpu_xgmi.h"
> +
> +static struct amdgpu_reset_handler *
> +sienna_cichlid_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
> +			    struct amdgpu_reset_context *reset_context)
> +{
> +	struct amdgpu_reset_handler *handler;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
> +
> +	if (reset_context->method != AMD_RESET_METHOD_NONE) {
> +		list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +				     handler_list) {
> +			if (handler->reset_method == reset_context->method)
> +				return handler;
> +		}
> +	} else {
> +		list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +				     handler_list) {
> +			if (handler->reset_method == AMD_RESET_METHOD_MODE2 &&
> +			    adev->pm.fw_version >= 0x3a5500 &&
> +			    !amdgpu_sriov_vf(adev)) {
> +				reset_context->method = AMD_RESET_METHOD_MODE2;
> +				return handler;
> +			}
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +static int sienna_cichlid_mode2_suspend_ip(struct amdgpu_device *adev)
> +{
> +	int r, i;
> +
> +	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
> +	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
> +
> +	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +
> +		r = adev->ip_blocks[i].version->funcs->suspend(adev);
> +
> +		if (r) {
> +			dev_err(adev->dev,
> +				"suspend of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +		adev->ip_blocks[i].status.hw = false;
> +	}
> +
> +	return r;
> +}
> +
> +static int
> +sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
> +				  struct amdgpu_reset_context *reset_context)
> +{
> +	int r = 0;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
> +
> +	if (!amdgpu_sriov_vf(adev))
> +		r = sienna_cichlid_mode2_suspend_ip(adev);
> +
> +	return r;
> +}
> +
> +static void sienna_cichlid_async_reset(struct work_struct *work)
> +{
> +	struct amdgpu_reset_handler *handler;
> +	struct amdgpu_reset_control *reset_ctl =
> +		container_of(work, struct amdgpu_reset_control, reset_work);
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
> +
> +	list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +			     handler_list) {
> +		if (handler->reset_method == reset_ctl->active_reset) {
> +			dev_dbg(adev->dev, "Resetting device\n");
> +			handler->do_reset(adev);
> +			break;
> +		}
> +	}
> +}
> +
> +static int sienna_cichlid_mode2_reset(struct amdgpu_device *adev)
> +{
> +	/* disable BM */
> +	pci_clear_master(adev->pdev);
> +	adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev);
> +	return adev->asic_reset_res;
> +}


Why do you need to set adev->asic_reset_res here and
not just return back the result ?

Andrey


> +
> +static int
> +sienna_cichlid_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
> +			      struct amdgpu_reset_context *reset_context)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
> +	int r;
> +
> +	r = sienna_cichlid_mode2_reset(adev);
> +	if (r) {
> +		dev_err(adev->dev,
> +			"ASIC reset failed with error, %d ", r);
> +	}
> +	return r;
> +}
> +
> +static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device *adev)
> +{
> +	int i, r;
> +	struct psp_context *psp = &adev->psp;
> +
> +	r = psp_rlc_autoload_start(psp);
> +	if (r) {
> +		dev_err(adev->dev, "Failed to start rlc autoload\n");
> +		return r;
> +	}
> +
> +	/* Reinit GFXHUB */
> +	adev->gfxhub.funcs->init(adev);
> +	r = adev->gfxhub.funcs->gart_enable(adev);
> +	if (r) {
> +		dev_err(adev->dev, "GFXHUB gart reenable failed after reset\n");
> +		return r;
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
> +			r = adev->ip_blocks[i].version->funcs->resume(adev);
> +		if (r) {
> +			dev_err(adev->dev,
> +				"resume of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +
> +		adev->ip_blocks[i].status.hw = true;
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +		r = adev->ip_blocks[i].version->funcs->resume(adev);
> +		if (r) {
> +			dev_err(adev->dev,
> +				"resume of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +
> +		adev->ip_blocks[i].status.hw = true;
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +
> +		if (adev->ip_blocks[i].version->funcs->late_init) {
> +			r = adev->ip_blocks[i].version->funcs->late_init(
> +				(void *)adev);
> +			if (r) {
> +				dev_err(adev->dev,
> +					"late_init of IP block <%s> failed %d after reset\n",
> +					adev->ip_blocks[i].version->funcs->name,
> +					r);
> +				return r;
> +			}
> +		}
> +		adev->ip_blocks[i].status.late_initialized = true;
> +	}
> +
> +	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
> +	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
> +
> +	return r;
> +}
> +
> +static int
> +sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
> +				  struct amdgpu_reset_context *reset_context)
> +{
> +	int r;
> +	struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl->handle;
> +
> +	dev_info(tmp_adev->dev,
> +			"GPU reset succeeded, trying to resume\n");
> +	r = sienna_cichlid_mode2_restore_ip(tmp_adev);
> +	if (r)
> +		goto end;
> +
> +	/*
> +	* Add this ASIC as tracked as reset was already
> +	* complete successfully.
> +	*/
> +	amdgpu_register_gpu_instance(tmp_adev);
> +
> +	/* Resume RAS */
> +	amdgpu_ras_resume(tmp_adev);
> +
> +	amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
> +
> +	r = amdgpu_ib_ring_tests(tmp_adev);
> +	if (r) {
> +		dev_err(tmp_adev->dev,
> +			"ib ring test failed (%d).\n", r);
> +		r = -EAGAIN;
> +		tmp_adev->asic_reset_res = r;
> +		goto end;
> +	}
> +
> +end:
> +	if (r)
> +		return -EAGAIN;
> +	else
> +		return r;
> +}
> +
> +static struct amdgpu_reset_handler sienna_cichlid_mode2_handler = {
> +	.reset_method		= AMD_RESET_METHOD_MODE2,
> +	.prepare_env		= NULL,
> +	.prepare_hwcontext	= sienna_cichlid_mode2_prepare_hwcontext,
> +	.perform_reset		= sienna_cichlid_mode2_perform_reset,
> +	.restore_hwcontext	= sienna_cichlid_mode2_restore_hwcontext,
> +	.restore_env		= NULL,
> +	.do_reset		= sienna_cichlid_mode2_reset,
> +};
> +
> +int sienna_cichlid_reset_init(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_reset_control *reset_ctl;
> +
> +	reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);
> +	if (!reset_ctl)
> +		return -ENOMEM;
> +
> +	reset_ctl->handle = adev;
> +	reset_ctl->async_reset = sienna_cichlid_async_reset;
> +	reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
> +	reset_ctl->get_reset_handler = sienna_cichlid_get_reset_handler;
> +
> +	INIT_LIST_HEAD(&reset_ctl->reset_handlers);
> +	INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);
> +	/* Only mode2 is handled through reset control now */
> +	amdgpu_reset_add_handler(reset_ctl, &sienna_cichlid_mode2_handler);
> +
> +	adev->reset_cntl = reset_ctl;
> +
> +	return 0;
> +}
> +
> +int sienna_cichlid_reset_fini(struct amdgpu_device *adev)
> +{
> +	kfree(adev->reset_cntl);
> +	adev->reset_cntl = NULL;
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> new file mode 100644
> index 000000000000..5213b162dacd
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> @@ -0,0 +1,32 @@
> +/*
> + * Copyright 2021 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __SIENNA_CICHLID_H__
> +#define __SIENNA_CICHLID_H__
> +
> +#include "amdgpu.h"
> +
> +int sienna_cichlid_reset_init(struct amdgpu_device *adev);
> +int sienna_cichlid_reset_fini(struct amdgpu_device *adev);
> +
> +#endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> index d2e10a724560..82cf9e563065 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> @@ -137,7 +137,7 @@
>   #define PPSMC_MSG_DisallowGpo                    0x56
>   
>   #define PPSMC_MSG_Enable2ndUSB20Port             0x57
> -
> -#define PPSMC_Message_Count                      0x58
> +#define PPSMC_MSG_DriverMode2Reset               0x5D
> +#define PPSMC_Message_Count                      0x5E
>   
>   #endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> index 19084a4fcb2b..28f6a1eb6945 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> @@ -235,7 +235,8 @@
>   	__SMU_DUMMY_MAP(UnforceGfxVid),           \
>   	__SMU_DUMMY_MAP(HeavySBR),			\
>   	__SMU_DUMMY_MAP(SetBadHBMPagesRetiredFlagsPerChannel), \
> -	__SMU_DUMMY_MAP(EnableGfxImu),
> +	__SMU_DUMMY_MAP(EnableGfxImu), \
> +	__SMU_DUMMY_MAP(DriverMode2Reset),
>   
>   #undef __SMU_DUMMY_MAP
>   #define __SMU_DUMMY_MAP(type)	SMU_MSG_##type
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> index fa520d79ef67..a73d241bb64f 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> @@ -154,6 +154,7 @@ static struct cmn2asic_msg_mapping sienna_cichlid_message_map[SMU_MSG_MAX_COUNT]
>   	MSG_MAP(SetGpoFeaturePMask,		PPSMC_MSG_SetGpoFeaturePMask,          0),
>   	MSG_MAP(DisallowGpo,			PPSMC_MSG_DisallowGpo,                 0),
>   	MSG_MAP(Enable2ndUSB20Port,		PPSMC_MSG_Enable2ndUSB20Port,          0),
> +	MSG_MAP(DriverMode2Reset,		PPSMC_MSG_DriverMode2Reset,	       0),
>   };
>   
>   static struct cmn2asic_mapping sienna_cichlid_clk_map[SMU_CLK_COUNT] = {
> @@ -4254,6 +4255,57 @@ static int sienna_cichlid_stb_get_data_direct(struct smu_context *smu,
>   	return 0;
>   }
>   
> +static bool sienna_cichlid_is_mode2_reset_supported(struct smu_context *smu)
> +{
> +	return true;
> +}
> +
> +static int sienna_cichlid_mode2_reset(struct smu_context *smu)
> +{
> +	u32 smu_version;
> +	int ret = 0, index;
> +	struct amdgpu_device *adev = smu->adev;
> +	int timeout = 100;
> +
> +	smu_cmn_get_smc_version(smu, NULL, &smu_version);
> +
> +	index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
> +						SMU_MSG_DriverMode2Reset);
> +
> +	mutex_lock(&smu->message_lock);
> +
> +	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
> +					       SMU_RESET_MODE_2);
> +
> +	ret = smu_cmn_wait_for_response(smu);
> +	while (ret != 0 && timeout) {
> +		ret = smu_cmn_wait_for_response(smu);
> +		/* Wait a bit more time for getting ACK */
> +		if (ret != 0) {
> +			--timeout;
> +			usleep_range(500, 1000);
> +			continue;
> +		} else {
> +			break;
> +		}
> +	}
> +
> +	if (!timeout) {
> +		dev_err(adev->dev,
> +			"failed to send mode2 message \tparam: 0x%08x response %#x\n",
> +			SMU_RESET_MODE_2, ret);
> +		goto out;
> +	}
> +
> +	dev_info(smu->adev->dev, "restore config space...\n");
> +	/* Restore the config space saved during init */
> +	amdgpu_device_load_pci_state(adev->pdev);
> +out:
> +	mutex_unlock(&smu->message_lock);
> +
> +	return ret;
> +}
> +
>   static const struct pptable_funcs sienna_cichlid_ppt_funcs = {
>   	.get_allowed_feature_mask = sienna_cichlid_get_allowed_feature_mask,
>   	.set_default_dpm_table = sienna_cichlid_set_default_dpm_table,
> @@ -4348,6 +4400,8 @@ static const struct pptable_funcs sienna_cichlid_ppt_funcs = {
>   	.get_default_config_table_settings = sienna_cichlid_get_default_config_table_settings,
>   	.set_config_table = sienna_cichlid_set_config_table,
>   	.get_unique_id = sienna_cichlid_get_unique_id,
> +	.mode2_reset_is_support = sienna_cichlid_is_mode2_reset_supported,
> +	.mode2_reset = sienna_cichlid_mode2_reset,
>   };
>   
>   void sienna_cichlid_set_ppt_funcs(struct smu_context *smu)

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid
  2022-07-28 10:30 [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
                   ` (5 preceding siblings ...)
  2022-07-28 18:39 ` [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid Andrey Grodzovsky
@ 2022-07-29  6:05 ` Quan, Evan
  2022-07-29  9:05   ` Zhao, Victor
  6 siblings, 1 reply; 13+ messages in thread
From: Quan, Evan @ 2022-07-29  6:05 UTC (permalink / raw)
  To: Zhao, Victor, amd-gfx; +Cc: Deng, Emily, Zhao, Victor, Grodzovsky, Andrey

[AMD Official Use Only - General]



> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Victor Zhao
> Sent: Thursday, July 28, 2022 6:30 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deng, Emily <Emily.Deng@amd.com>; Zhao, Victor
> <Victor.Zhao@amd.com>; Grodzovsky, Andrey
> <Andrey.Grodzovsky@amd.com>
> Subject: [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid
> 
> To meet the requirement for multi container usecase which needs
> a quicker reset and not causing VRAM lost, adding the Mode2
> reset handler for sienna_cichlid.
> 
> v2: move skip mode2 flag part separately
> 
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/Makefile           |   2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c     |   7 +
>  drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c   | 297
> ++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h   |  32 ++
>  .../pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h  |   4 +-
>  drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  |   3 +-
>  .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   |  54 ++++
>  7 files changed, 395 insertions(+), 4 deletions(-)
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile
> b/drivers/gpu/drm/amd/amdgpu/Makefile
> index c7d0cd15b5ef..7030ac2d7d2c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -75,7 +75,7 @@ amdgpu-y += \
>  	vi.o mxgpu_vi.o nbio_v6_1.o soc15.o emu_soc.o mxgpu_ai.o
> nbio_v7_0.o vega10_reg_init.o \
>  	vega20_reg_init.o nbio_v7_4.o nbio_v2_3.o nv.o arct_reg_init.o
> mxgpu_nv.o \
>  	nbio_v7_2.o hdp_v4_0.o hdp_v5_0.o aldebaran_reg_init.o
> aldebaran.o soc21.o \
> -	nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o lsdma_v6_0.o
> +	sienna_cichlid.o nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o
> lsdma_v6_0.o
> 
>  # add DF block
>  amdgpu-y += \
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 32c86a0b145c..f778466bb9db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -23,6 +23,7 @@
> 
>  #include "amdgpu_reset.h"
>  #include "aldebaran.h"
> +#include "sienna_cichlid.h"
> 
>  int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl,
>  			     struct amdgpu_reset_handler *handler)
> @@ -40,6 +41,9 @@ int amdgpu_reset_init(struct amdgpu_device *adev)
>  	case IP_VERSION(13, 0, 2):
>  		ret = aldebaran_reset_init(adev);
>  		break;
> +	case IP_VERSION(11, 0, 7):
> +		ret = sienna_cichlid_reset_init(adev);
> +		break;
>  	default:
>  		break;
>  	}
> @@ -55,6 +59,9 @@ int amdgpu_reset_fini(struct amdgpu_device *adev)
>  	case IP_VERSION(13, 0, 2):
>  		ret = aldebaran_reset_fini(adev);
>  		break;
> +	case IP_VERSION(11, 0, 7):
> +		ret = sienna_cichlid_reset_fini(adev);
> +		break;
>  	default:
>  		break;
>  	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> new file mode 100644
> index 000000000000..0512960bed23
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> @@ -0,0 +1,297 @@
> +/*
> + * Copyright 2021 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the
> "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
> THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#include "sienna_cichlid.h"
> +#include "amdgpu_reset.h"
> +#include "amdgpu_amdkfd.h"
> +#include "amdgpu_dpm.h"
> +#include "amdgpu_job.h"
> +#include "amdgpu_ring.h"
> +#include "amdgpu_ras.h"
> +#include "amdgpu_psp.h"
> +#include "amdgpu_xgmi.h"
> +
> +static struct amdgpu_reset_handler *
> +sienna_cichlid_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
> +			    struct amdgpu_reset_context *reset_context)
> +{
> +	struct amdgpu_reset_handler *handler;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl-
> >handle;
> +
> +	if (reset_context->method != AMD_RESET_METHOD_NONE) {
> +		list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +				     handler_list) {
> +			if (handler->reset_method == reset_context-
> >method)
> +				return handler;
> +		}
> +	} else {
> +		list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +				     handler_list) {
> +			if (handler->reset_method ==
> AMD_RESET_METHOD_MODE2 &&
> +			    adev->pm.fw_version >= 0x3a5500 &&
> +			    !amdgpu_sriov_vf(adev)) {
> +				reset_context->method =
> AMD_RESET_METHOD_MODE2;
> +				return handler;
> +			}
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +static int sienna_cichlid_mode2_suspend_ip(struct amdgpu_device *adev)
> +{
> +	int r, i;
> +
> +	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
> +	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
> +
> +	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +
> +		r = adev->ip_blocks[i].version->funcs->suspend(adev);
> +
> +		if (r) {
> +			dev_err(adev->dev,
> +				"suspend of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +		adev->ip_blocks[i].status.hw = false;
> +	}
> +
> +	return r;
> +}
> +
> +static int
> +sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control
> *reset_ctl,
> +				  struct amdgpu_reset_context
> *reset_context)
> +{
> +	int r = 0;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl-
> >handle;
> +
> +	if (!amdgpu_sriov_vf(adev))
> +		r = sienna_cichlid_mode2_suspend_ip(adev);
> +
> +	return r;
> +}
> +
> +static void sienna_cichlid_async_reset(struct work_struct *work)
> +{
> +	struct amdgpu_reset_handler *handler;
> +	struct amdgpu_reset_control *reset_ctl =
> +		container_of(work, struct amdgpu_reset_control,
> reset_work);
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl-
> >handle;
> +
> +	list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +			     handler_list) {
> +		if (handler->reset_method == reset_ctl->active_reset) {
> +			dev_dbg(adev->dev, "Resetting device\n");
> +			handler->do_reset(adev);
> +			break;
> +		}
> +	}
> +}
> +
> +static int sienna_cichlid_mode2_reset(struct amdgpu_device *adev)
> +{
> +	/* disable BM */
> +	pci_clear_master(adev->pdev);
> +	adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev);
> +	return adev->asic_reset_res;
> +}
> +
> +static int
> +sienna_cichlid_mode2_perform_reset(struct amdgpu_reset_control
> *reset_ctl,
> +			      struct amdgpu_reset_context *reset_context)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl-
> >handle;
> +	int r;
> +
> +	r = sienna_cichlid_mode2_reset(adev);
> +	if (r) {
> +		dev_err(adev->dev,
> +			"ASIC reset failed with error, %d ", r);
> +	}
> +	return r;
> +}
> +
> +static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device *adev)
> +{
> +	int i, r;
> +	struct psp_context *psp = &adev->psp;
> +
> +	r = psp_rlc_autoload_start(psp);
> +	if (r) {
> +		dev_err(adev->dev, "Failed to start rlc autoload\n");
> +		return r;
> +	}
> +
> +	/* Reinit GFXHUB */
> +	adev->gfxhub.funcs->init(adev);
> +	r = adev->gfxhub.funcs->gart_enable(adev);
> +	if (r) {
> +		dev_err(adev->dev, "GFXHUB gart reenable failed after
> reset\n");
> +		return r;
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (adev->ip_blocks[i].version->type ==
> AMD_IP_BLOCK_TYPE_IH)
> +			r = adev->ip_blocks[i].version->funcs-
> >resume(adev);
> +		if (r) {
> +			dev_err(adev->dev,
> +				"resume of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +
> +		adev->ip_blocks[i].status.hw = true;
[Quan, Evan] It seems above operation will set the hw.status to true for all IPs. Although actually the resuming only performed on IH.
That seems an issue.

BR
Evan
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +		r = adev->ip_blocks[i].version->funcs->resume(adev);
> +		if (r) {
> +			dev_err(adev->dev,
> +				"resume of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +
> +		adev->ip_blocks[i].status.hw = true;
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +
> +		if (adev->ip_blocks[i].version->funcs->late_init) {
> +			r = adev->ip_blocks[i].version->funcs->late_init(
> +				(void *)adev);
> +			if (r) {
> +				dev_err(adev->dev,
> +					"late_init of IP block <%s> failed %d
> after reset\n",
> +					adev->ip_blocks[i].version->funcs-
> >name,
> +					r);
> +				return r;
> +			}
> +		}
> +		adev->ip_blocks[i].status.late_initialized = true;
> +	}
> +
> +	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
> +	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
> +
> +	return r;
> +}
> +
> +static int
> +sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control
> *reset_ctl,
> +				  struct amdgpu_reset_context
> *reset_context)
> +{
> +	int r;
> +	struct amdgpu_device *tmp_adev = (struct amdgpu_device
> *)reset_ctl->handle;
> +
> +	dev_info(tmp_adev->dev,
> +			"GPU reset succeeded, trying to resume\n");
> +	r = sienna_cichlid_mode2_restore_ip(tmp_adev);
> +	if (r)
> +		goto end;
> +
> +	/*
> +	* Add this ASIC as tracked as reset was already
> +	* complete successfully.
> +	*/
> +	amdgpu_register_gpu_instance(tmp_adev);
> +
> +	/* Resume RAS */
> +	amdgpu_ras_resume(tmp_adev);
> +
> +	amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
> +
> +	r = amdgpu_ib_ring_tests(tmp_adev);
> +	if (r) {
> +		dev_err(tmp_adev->dev,
> +			"ib ring test failed (%d).\n", r);
> +		r = -EAGAIN;
> +		tmp_adev->asic_reset_res = r;
> +		goto end;
> +	}
> +
> +end:
> +	if (r)
> +		return -EAGAIN;
> +	else
> +		return r;
> +}
> +
> +static struct amdgpu_reset_handler sienna_cichlid_mode2_handler = {
> +	.reset_method		= AMD_RESET_METHOD_MODE2,
> +	.prepare_env		= NULL,
> +	.prepare_hwcontext	= sienna_cichlid_mode2_prepare_hwcontext,
> +	.perform_reset		=
> sienna_cichlid_mode2_perform_reset,
> +	.restore_hwcontext	= sienna_cichlid_mode2_restore_hwcontext,
> +	.restore_env		= NULL,
> +	.do_reset		= sienna_cichlid_mode2_reset,
> +};
> +
> +int sienna_cichlid_reset_init(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_reset_control *reset_ctl;
> +
> +	reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);
> +	if (!reset_ctl)
> +		return -ENOMEM;
> +
> +	reset_ctl->handle = adev;
> +	reset_ctl->async_reset = sienna_cichlid_async_reset;
> +	reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
> +	reset_ctl->get_reset_handler = sienna_cichlid_get_reset_handler;
> +
> +	INIT_LIST_HEAD(&reset_ctl->reset_handlers);
> +	INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);
> +	/* Only mode2 is handled through reset control now */
> +	amdgpu_reset_add_handler(reset_ctl,
> &sienna_cichlid_mode2_handler);
> +
> +	adev->reset_cntl = reset_ctl;
> +
> +	return 0;
> +}
> +
> +int sienna_cichlid_reset_fini(struct amdgpu_device *adev)
> +{
> +	kfree(adev->reset_cntl);
> +	adev->reset_cntl = NULL;
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> new file mode 100644
> index 000000000000..5213b162dacd
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> @@ -0,0 +1,32 @@
> +/*
> + * Copyright 2021 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the
> "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
> THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __SIENNA_CICHLID_H__
> +#define __SIENNA_CICHLID_H__
> +
> +#include "amdgpu.h"
> +
> +int sienna_cichlid_reset_init(struct amdgpu_device *adev);
> +int sienna_cichlid_reset_fini(struct amdgpu_device *adev);
> +
> +#endif
> diff --git
> a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> index d2e10a724560..82cf9e563065 100644
> ---
> a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> +++
> b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> @@ -137,7 +137,7 @@
>  #define PPSMC_MSG_DisallowGpo                    0x56
> 
>  #define PPSMC_MSG_Enable2ndUSB20Port             0x57
> -
> -#define PPSMC_Message_Count                      0x58
> +#define PPSMC_MSG_DriverMode2Reset               0x5D
> +#define PPSMC_Message_Count                      0x5E
> 
>  #endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> index 19084a4fcb2b..28f6a1eb6945 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> @@ -235,7 +235,8 @@
>  	__SMU_DUMMY_MAP(UnforceGfxVid),           \
>  	__SMU_DUMMY_MAP(HeavySBR),			\
>  	__SMU_DUMMY_MAP(SetBadHBMPagesRetiredFlagsPerChannel), \
> -	__SMU_DUMMY_MAP(EnableGfxImu),
> +	__SMU_DUMMY_MAP(EnableGfxImu), \
> +	__SMU_DUMMY_MAP(DriverMode2Reset),
> 
>  #undef __SMU_DUMMY_MAP
>  #define __SMU_DUMMY_MAP(type)	SMU_MSG_##type
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> index fa520d79ef67..a73d241bb64f 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> @@ -154,6 +154,7 @@ static struct cmn2asic_msg_mapping
> sienna_cichlid_message_map[SMU_MSG_MAX_COUNT]
>  	MSG_MAP(SetGpoFeaturePMask,
> 	PPSMC_MSG_SetGpoFeaturePMask,          0),
>  	MSG_MAP(DisallowGpo,
> 	PPSMC_MSG_DisallowGpo,                 0),
>  	MSG_MAP(Enable2ndUSB20Port,
> 	PPSMC_MSG_Enable2ndUSB20Port,          0),
> +	MSG_MAP(DriverMode2Reset,
> 	PPSMC_MSG_DriverMode2Reset,	       0),
>  };
> 
>  static struct cmn2asic_mapping sienna_cichlid_clk_map[SMU_CLK_COUNT]
> = {
> @@ -4254,6 +4255,57 @@ static int
> sienna_cichlid_stb_get_data_direct(struct smu_context *smu,
>  	return 0;
>  }
> 
> +static bool sienna_cichlid_is_mode2_reset_supported(struct smu_context
> *smu)
> +{
> +	return true;
> +}
> +
> +static int sienna_cichlid_mode2_reset(struct smu_context *smu)
> +{
> +	u32 smu_version;
> +	int ret = 0, index;
> +	struct amdgpu_device *adev = smu->adev;
> +	int timeout = 100;
> +
> +	smu_cmn_get_smc_version(smu, NULL, &smu_version);
> +
> +	index = smu_cmn_to_asic_specific_index(smu,
> CMN2ASIC_MAPPING_MSG,
> +
> 	SMU_MSG_DriverMode2Reset);
> +
> +	mutex_lock(&smu->message_lock);
> +
> +	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
> +					       SMU_RESET_MODE_2);
> +
> +	ret = smu_cmn_wait_for_response(smu);
> +	while (ret != 0 && timeout) {
> +		ret = smu_cmn_wait_for_response(smu);
> +		/* Wait a bit more time for getting ACK */
> +		if (ret != 0) {
> +			--timeout;
> +			usleep_range(500, 1000);
> +			continue;
> +		} else {
> +			break;
> +		}
> +	}
> +
> +	if (!timeout) {
> +		dev_err(adev->dev,
> +			"failed to send mode2 message \tparam: 0x%08x
> response %#x\n",
> +			SMU_RESET_MODE_2, ret);
> +		goto out;
> +	}
> +
> +	dev_info(smu->adev->dev, "restore config space...\n");
> +	/* Restore the config space saved during init */
> +	amdgpu_device_load_pci_state(adev->pdev);
> +out:
> +	mutex_unlock(&smu->message_lock);
> +
> +	return ret;
> +}
> +
>  static const struct pptable_funcs sienna_cichlid_ppt_funcs = {
>  	.get_allowed_feature_mask =
> sienna_cichlid_get_allowed_feature_mask,
>  	.set_default_dpm_table = sienna_cichlid_set_default_dpm_table,
> @@ -4348,6 +4400,8 @@ static const struct pptable_funcs
> sienna_cichlid_ppt_funcs = {
>  	.get_default_config_table_settings =
> sienna_cichlid_get_default_config_table_settings,
>  	.set_config_table = sienna_cichlid_set_config_table,
>  	.get_unique_id = sienna_cichlid_get_unique_id,
> +	.mode2_reset_is_support =
> sienna_cichlid_is_mode2_reset_supported,
> +	.mode2_reset = sienna_cichlid_mode2_reset,
>  };
> 
>  void sienna_cichlid_set_ppt_funcs(struct smu_context *smu)
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 6/6] drm/amdgpu: reduce reset time
  2022-07-28 10:30 ` [PATCH v2 6/6] drm/amdgpu: reduce reset time Victor Zhao
  2022-07-28 18:37   ` Andrey Grodzovsky
@ 2022-07-29  6:11   ` Lazar, Lijo
  2022-07-29  9:27     ` Zhao, Victor
  1 sibling, 1 reply; 13+ messages in thread
From: Lazar, Lijo @ 2022-07-29  6:11 UTC (permalink / raw)
  To: Victor Zhao, amd-gfx; +Cc: emily.deng, Andrey.Grodzovsky



On 7/28/2022 4:00 PM, Victor Zhao wrote:
> In multi container use case, reset time is important, so skip ring
> tests and cp halt wait during ip suspending for reset as they are
> going to fail and cost more time on reset
> 
> v2: add a hang flag to indicate the reset comes from a job timeout,
> skip ring test and cp halt wait in this case
> 
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c    | 11 +++++++++--
>   5 files changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 222d3d7ea076..c735a17c6afb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -27,6 +27,7 @@
>   #include "amdgpu_gfx.h"
>   #include "amdgpu_rlc.h"
>   #include "amdgpu_ras.h"
> +#include "amdgpu_reset.h"
>   
>   /* delay 0.1 second to enable gfx off feature */
>   #define GFX_OFF_DELAY_ENABLE         msecs_to_jiffies(100)
> @@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
>   		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
>   					   RESET_QUEUES, 0, 0);
>   
> -	if (adev->gfx.kiq.ring.sched.ready)
> +	if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && adev->reset_domain->hang))
>   		r = amdgpu_ring_test_helper(kiq_ring);

On a system with multiple GPUs interconnected, this will affect other 
GPUs as well on which job was not really running. I guess your usecase 
here is device specific.

Thanks,
Lijo

>   	spin_unlock(&adev->gfx.kiq.ring_lock);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 6c3e7290153f..bb40880a557f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	}
>   
>   	memset(&ti, 0, sizeof(struct amdgpu_task_info));
> +	adev->reset_domain->hang = true;
>   
>   	if (amdgpu_gpu_recovery &&
>   	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
> @@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	}
>   
>   exit:
> +	adev->reset_domain->hang = false;
>   	drm_dev_exit(idx);
>   	return DRM_GPU_SCHED_STAT_NOMINAL;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 9da5ead50c90..b828fe773f50 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -155,6 +155,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>   	atomic_set(&reset_domain->in_gpu_reset, 0);
>   	atomic_set(&reset_domain->reset_res, 0);
>   	init_rwsem(&reset_domain->sem);
> +	reset_domain->hang = false;
>   
>   	return reset_domain;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index cc4b2eeb24cf..29e324add552 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -84,6 +84,7 @@ struct amdgpu_reset_domain {
>   	struct rw_semaphore sem;
>   	atomic_t in_gpu_reset;
>   	atomic_t reset_res;
> +	bool hang;
>   };
>   
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index fafbad3cf08d..a384e04d916c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -29,6 +29,7 @@
>   #include "amdgpu.h"
>   #include "amdgpu_gfx.h"
>   #include "amdgpu_psp.h"
> +#include "amdgpu_reset.h"
>   #include "nv.h"
>   #include "nvd.h"
>   
> @@ -5971,6 +5972,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
>   		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
>   	}
>   
> +	if ((amdgpu_in_reset(adev) && adev->reset_domain->hang) && !enable)
> +		return 0;
> +
>   	for (i = 0; i < adev->usec_timeout; i++) {
>   		if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
>   			break;
> @@ -7569,8 +7573,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
>   	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>   		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
>   					   PREEMPT_QUEUES, 0, 0);
> -
> -	return amdgpu_ring_test_helper(kiq_ring);
> +	if (!(amdgpu_in_reset(adev) && adev->reset_domain->hang))
> +		return amdgpu_ring_test_helper(kiq_ring);
> +	else
> +		return 0;
>   }
>   #endif
>   
> @@ -7610,6 +7616,7 @@ static int gfx_v10_0_hw_fini(void *handle)
>   
>   		return 0;
>   	}
> +
>   	gfx_v10_0_cp_enable(adev, false);
>   	gfx_v10_0_enable_gui_idle_interrupt(adev, false);
>   
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid
  2022-07-29  6:05 ` Quan, Evan
@ 2022-07-29  9:05   ` Zhao, Victor
  0 siblings, 0 replies; 13+ messages in thread
From: Zhao, Victor @ 2022-07-29  9:05 UTC (permalink / raw)
  To: Quan, Evan, amd-gfx; +Cc: Deng, Emily, Grodzovsky, Andrey

[AMD Official Use Only - General]

Thanks for catching.


Thanks,
Victor



-----Original Message-----
From: Quan, Evan <Evan.Quan@amd.com> 
Sent: Friday, July 29, 2022 2:06 PM
To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deng, Emily <Emily.Deng@amd.com>; Zhao, Victor <Victor.Zhao@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: RE: [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid

[AMD Official Use Only - General]



> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of 
> Victor Zhao
> Sent: Thursday, July 28, 2022 6:30 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deng, Emily <Emily.Deng@amd.com>; Zhao, Victor 
> <Victor.Zhao@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Subject: [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid
> 
> To meet the requirement for multi container usecase which needs a 
> quicker reset and not causing VRAM lost, adding the Mode2 reset 
> handler for sienna_cichlid.
> 
> v2: move skip mode2 flag part separately
> 
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/Makefile           |   2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c     |   7 +
>  drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c   | 297
> ++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h   |  32 ++
>  .../pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h  |   4 +-
>  drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  |   3 +-
>  .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   |  54 ++++
>  7 files changed, 395 insertions(+), 4 deletions(-)  create mode 
> 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile
> b/drivers/gpu/drm/amd/amdgpu/Makefile
> index c7d0cd15b5ef..7030ac2d7d2c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -75,7 +75,7 @@ amdgpu-y += \
>  	vi.o mxgpu_vi.o nbio_v6_1.o soc15.o emu_soc.o mxgpu_ai.o nbio_v7_0.o 
> vega10_reg_init.o \
>  	vega20_reg_init.o nbio_v7_4.o nbio_v2_3.o nv.o arct_reg_init.o 
> mxgpu_nv.o \
>  	nbio_v7_2.o hdp_v4_0.o hdp_v5_0.o aldebaran_reg_init.o aldebaran.o 
> soc21.o \
> -	nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o lsdma_v6_0.o
> +	sienna_cichlid.o nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o
> lsdma_v6_0.o
> 
>  # add DF block
>  amdgpu-y += \
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 32c86a0b145c..f778466bb9db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -23,6 +23,7 @@
> 
>  #include "amdgpu_reset.h"
>  #include "aldebaran.h"
> +#include "sienna_cichlid.h"
> 
>  int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl,
>  			     struct amdgpu_reset_handler *handler) @@ -40,6 +41,9 @@ int 
> amdgpu_reset_init(struct amdgpu_device *adev)
>  	case IP_VERSION(13, 0, 2):
>  		ret = aldebaran_reset_init(adev);
>  		break;
> +	case IP_VERSION(11, 0, 7):
> +		ret = sienna_cichlid_reset_init(adev);
> +		break;
>  	default:
>  		break;
>  	}
> @@ -55,6 +59,9 @@ int amdgpu_reset_fini(struct amdgpu_device *adev)
>  	case IP_VERSION(13, 0, 2):
>  		ret = aldebaran_reset_fini(adev);
>  		break;
> +	case IP_VERSION(11, 0, 7):
> +		ret = sienna_cichlid_reset_fini(adev);
> +		break;
>  	default:
>  		break;
>  	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> new file mode 100644
> index 000000000000..0512960bed23
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> @@ -0,0 +1,297 @@
> +/*
> + * Copyright 2021 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person 
> +obtaining a
> + * copy of this software and associated documentation files (the
> "Software"),
> + * to deal in the Software without restriction, including without 
> + limitation
> + * the rights to use, copy, modify, merge, publish, distribute, 
> + sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom 
> + the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be 
> + included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
> THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#include "sienna_cichlid.h"
> +#include "amdgpu_reset.h"
> +#include "amdgpu_amdkfd.h"
> +#include "amdgpu_dpm.h"
> +#include "amdgpu_job.h"
> +#include "amdgpu_ring.h"
> +#include "amdgpu_ras.h"
> +#include "amdgpu_psp.h"
> +#include "amdgpu_xgmi.h"
> +
> +static struct amdgpu_reset_handler *
> +sienna_cichlid_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
> +			    struct amdgpu_reset_context *reset_context) {
> +	struct amdgpu_reset_handler *handler;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl-
> >handle;
> +
> +	if (reset_context->method != AMD_RESET_METHOD_NONE) {
> +		list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +				     handler_list) {
> +			if (handler->reset_method == reset_context-
> >method)
> +				return handler;
> +		}
> +	} else {
> +		list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +				     handler_list) {
> +			if (handler->reset_method ==
> AMD_RESET_METHOD_MODE2 &&
> +			    adev->pm.fw_version >= 0x3a5500 &&
> +			    !amdgpu_sriov_vf(adev)) {
> +				reset_context->method =
> AMD_RESET_METHOD_MODE2;
> +				return handler;
> +			}
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +static int sienna_cichlid_mode2_suspend_ip(struct amdgpu_device 
> +*adev) {
> +	int r, i;
> +
> +	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
> +	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
> +
> +	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +
> +		r = adev->ip_blocks[i].version->funcs->suspend(adev);
> +
> +		if (r) {
> +			dev_err(adev->dev,
> +				"suspend of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +		adev->ip_blocks[i].status.hw = false;
> +	}
> +
> +	return r;
> +}
> +
> +static int
> +sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control
> *reset_ctl,
> +				  struct amdgpu_reset_context
> *reset_context)
> +{
> +	int r = 0;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl-
> >handle;
> +
> +	if (!amdgpu_sriov_vf(adev))
> +		r = sienna_cichlid_mode2_suspend_ip(adev);
> +
> +	return r;
> +}
> +
> +static void sienna_cichlid_async_reset(struct work_struct *work) {
> +	struct amdgpu_reset_handler *handler;
> +	struct amdgpu_reset_control *reset_ctl =
> +		container_of(work, struct amdgpu_reset_control,
> reset_work);
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl-
> >handle;
> +
> +	list_for_each_entry(handler, &reset_ctl->reset_handlers,
> +			     handler_list) {
> +		if (handler->reset_method == reset_ctl->active_reset) {
> +			dev_dbg(adev->dev, "Resetting device\n");
> +			handler->do_reset(adev);
> +			break;
> +		}
> +	}
> +}
> +
> +static int sienna_cichlid_mode2_reset(struct amdgpu_device *adev) {
> +	/* disable BM */
> +	pci_clear_master(adev->pdev);
> +	adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev);
> +	return adev->asic_reset_res;
> +}
> +
> +static int
> +sienna_cichlid_mode2_perform_reset(struct amdgpu_reset_control
> *reset_ctl,
> +			      struct amdgpu_reset_context *reset_context) {
> +	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl-
> >handle;
> +	int r;
> +
> +	r = sienna_cichlid_mode2_reset(adev);
> +	if (r) {
> +		dev_err(adev->dev,
> +			"ASIC reset failed with error, %d ", r);
> +	}
> +	return r;
> +}
> +
> +static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device 
> +*adev) {
> +	int i, r;
> +	struct psp_context *psp = &adev->psp;
> +
> +	r = psp_rlc_autoload_start(psp);
> +	if (r) {
> +		dev_err(adev->dev, "Failed to start rlc autoload\n");
> +		return r;
> +	}
> +
> +	/* Reinit GFXHUB */
> +	adev->gfxhub.funcs->init(adev);
> +	r = adev->gfxhub.funcs->gart_enable(adev);
> +	if (r) {
> +		dev_err(adev->dev, "GFXHUB gart reenable failed after
> reset\n");
> +		return r;
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (adev->ip_blocks[i].version->type ==
> AMD_IP_BLOCK_TYPE_IH)
> +			r = adev->ip_blocks[i].version->funcs-
> >resume(adev);
> +		if (r) {
> +			dev_err(adev->dev,
> +				"resume of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +
> +		adev->ip_blocks[i].status.hw = true;
[Quan, Evan] It seems above operation will set the hw.status to true for all IPs. Although actually the resuming only performed on IH.
That seems an issue.

BR
Evan
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +		r = adev->ip_blocks[i].version->funcs->resume(adev);
> +		if (r) {
> +			dev_err(adev->dev,
> +				"resume of IP block <%s> failed %d\n",
> +				adev->ip_blocks[i].version->funcs->name, r);
> +			return r;
> +		}
> +
> +		adev->ip_blocks[i].status.hw = true;
> +	}
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (!(adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_GFX ||
> +		      adev->ip_blocks[i].version->type ==
> +			      AMD_IP_BLOCK_TYPE_SDMA))
> +			continue;
> +
> +		if (adev->ip_blocks[i].version->funcs->late_init) {
> +			r = adev->ip_blocks[i].version->funcs->late_init(
> +				(void *)adev);
> +			if (r) {
> +				dev_err(adev->dev,
> +					"late_init of IP block <%s> failed %d
> after reset\n",
> +					adev->ip_blocks[i].version->funcs-
> >name,
> +					r);
> +				return r;
> +			}
> +		}
> +		adev->ip_blocks[i].status.late_initialized = true;
> +	}
> +
> +	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
> +	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
> +
> +	return r;
> +}
> +
> +static int
> +sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control
> *reset_ctl,
> +				  struct amdgpu_reset_context
> *reset_context)
> +{
> +	int r;
> +	struct amdgpu_device *tmp_adev = (struct amdgpu_device
> *)reset_ctl->handle;
> +
> +	dev_info(tmp_adev->dev,
> +			"GPU reset succeeded, trying to resume\n");
> +	r = sienna_cichlid_mode2_restore_ip(tmp_adev);
> +	if (r)
> +		goto end;
> +
> +	/*
> +	* Add this ASIC as tracked as reset was already
> +	* complete successfully.
> +	*/
> +	amdgpu_register_gpu_instance(tmp_adev);
> +
> +	/* Resume RAS */
> +	amdgpu_ras_resume(tmp_adev);
> +
> +	amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
> +
> +	r = amdgpu_ib_ring_tests(tmp_adev);
> +	if (r) {
> +		dev_err(tmp_adev->dev,
> +			"ib ring test failed (%d).\n", r);
> +		r = -EAGAIN;
> +		tmp_adev->asic_reset_res = r;
> +		goto end;
> +	}
> +
> +end:
> +	if (r)
> +		return -EAGAIN;
> +	else
> +		return r;
> +}
> +
> +static struct amdgpu_reset_handler sienna_cichlid_mode2_handler = {
> +	.reset_method		= AMD_RESET_METHOD_MODE2,
> +	.prepare_env		= NULL,
> +	.prepare_hwcontext	= sienna_cichlid_mode2_prepare_hwcontext,
> +	.perform_reset		=
> sienna_cichlid_mode2_perform_reset,
> +	.restore_hwcontext	= sienna_cichlid_mode2_restore_hwcontext,
> +	.restore_env		= NULL,
> +	.do_reset		= sienna_cichlid_mode2_reset,
> +};
> +
> +int sienna_cichlid_reset_init(struct amdgpu_device *adev) {
> +	struct amdgpu_reset_control *reset_ctl;
> +
> +	reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);
> +	if (!reset_ctl)
> +		return -ENOMEM;
> +
> +	reset_ctl->handle = adev;
> +	reset_ctl->async_reset = sienna_cichlid_async_reset;
> +	reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
> +	reset_ctl->get_reset_handler = sienna_cichlid_get_reset_handler;
> +
> +	INIT_LIST_HEAD(&reset_ctl->reset_handlers);
> +	INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);
> +	/* Only mode2 is handled through reset control now */
> +	amdgpu_reset_add_handler(reset_ctl,
> &sienna_cichlid_mode2_handler);
> +
> +	adev->reset_cntl = reset_ctl;
> +
> +	return 0;
> +}
> +
> +int sienna_cichlid_reset_fini(struct amdgpu_device *adev) {
> +	kfree(adev->reset_cntl);
> +	adev->reset_cntl = NULL;
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> new file mode 100644
> index 000000000000..5213b162dacd
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h
> @@ -0,0 +1,32 @@
> +/*
> + * Copyright 2021 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person 
> +obtaining a
> + * copy of this software and associated documentation files (the
> "Software"),
> + * to deal in the Software without restriction, including without 
> + limitation
> + * the rights to use, copy, modify, merge, publish, distribute, 
> + sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom 
> + the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be 
> + included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
> THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __SIENNA_CICHLID_H__
> +#define __SIENNA_CICHLID_H__
> +
> +#include "amdgpu.h"
> +
> +int sienna_cichlid_reset_init(struct amdgpu_device *adev); int 
> +sienna_cichlid_reset_fini(struct amdgpu_device *adev);
> +
> +#endif
> diff --git
> a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> index d2e10a724560..82cf9e563065 100644
> ---
> a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> +++
> b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h
> @@ -137,7 +137,7 @@
>  #define PPSMC_MSG_DisallowGpo                    0x56
> 
>  #define PPSMC_MSG_Enable2ndUSB20Port             0x57
> -
> -#define PPSMC_Message_Count                      0x58
> +#define PPSMC_MSG_DriverMode2Reset               0x5D
> +#define PPSMC_Message_Count                      0x5E
> 
>  #endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> index 19084a4fcb2b..28f6a1eb6945 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> @@ -235,7 +235,8 @@
>  	__SMU_DUMMY_MAP(UnforceGfxVid),           \
>  	__SMU_DUMMY_MAP(HeavySBR),			\
>  	__SMU_DUMMY_MAP(SetBadHBMPagesRetiredFlagsPerChannel), \
> -	__SMU_DUMMY_MAP(EnableGfxImu),
> +	__SMU_DUMMY_MAP(EnableGfxImu), \
> +	__SMU_DUMMY_MAP(DriverMode2Reset),
> 
>  #undef __SMU_DUMMY_MAP
>  #define __SMU_DUMMY_MAP(type)	SMU_MSG_##type
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> index fa520d79ef67..a73d241bb64f 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> @@ -154,6 +154,7 @@ static struct cmn2asic_msg_mapping 
> sienna_cichlid_message_map[SMU_MSG_MAX_COUNT]
>  	MSG_MAP(SetGpoFeaturePMask,
> 	PPSMC_MSG_SetGpoFeaturePMask,          0),
>  	MSG_MAP(DisallowGpo,
> 	PPSMC_MSG_DisallowGpo,                 0),
>  	MSG_MAP(Enable2ndUSB20Port,
> 	PPSMC_MSG_Enable2ndUSB20Port,          0),
> +	MSG_MAP(DriverMode2Reset,
> 	PPSMC_MSG_DriverMode2Reset,	       0),
>  };
> 
>  static struct cmn2asic_mapping sienna_cichlid_clk_map[SMU_CLK_COUNT]
> = {
> @@ -4254,6 +4255,57 @@ static int
> sienna_cichlid_stb_get_data_direct(struct smu_context *smu,
>  	return 0;
>  }
> 
> +static bool sienna_cichlid_is_mode2_reset_supported(struct 
> +smu_context
> *smu)
> +{
> +	return true;
> +}
> +
> +static int sienna_cichlid_mode2_reset(struct smu_context *smu) {
> +	u32 smu_version;
> +	int ret = 0, index;
> +	struct amdgpu_device *adev = smu->adev;
> +	int timeout = 100;
> +
> +	smu_cmn_get_smc_version(smu, NULL, &smu_version);
> +
> +	index = smu_cmn_to_asic_specific_index(smu,
> CMN2ASIC_MAPPING_MSG,
> +
> 	SMU_MSG_DriverMode2Reset);
> +
> +	mutex_lock(&smu->message_lock);
> +
> +	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
> +					       SMU_RESET_MODE_2);
> +
> +	ret = smu_cmn_wait_for_response(smu);
> +	while (ret != 0 && timeout) {
> +		ret = smu_cmn_wait_for_response(smu);
> +		/* Wait a bit more time for getting ACK */
> +		if (ret != 0) {
> +			--timeout;
> +			usleep_range(500, 1000);
> +			continue;
> +		} else {
> +			break;
> +		}
> +	}
> +
> +	if (!timeout) {
> +		dev_err(adev->dev,
> +			"failed to send mode2 message \tparam: 0x%08x
> response %#x\n",
> +			SMU_RESET_MODE_2, ret);
> +		goto out;
> +	}
> +
> +	dev_info(smu->adev->dev, "restore config space...\n");
> +	/* Restore the config space saved during init */
> +	amdgpu_device_load_pci_state(adev->pdev);
> +out:
> +	mutex_unlock(&smu->message_lock);
> +
> +	return ret;
> +}
> +
>  static const struct pptable_funcs sienna_cichlid_ppt_funcs = {
>  	.get_allowed_feature_mask =
> sienna_cichlid_get_allowed_feature_mask,
>  	.set_default_dpm_table = sienna_cichlid_set_default_dpm_table,
> @@ -4348,6 +4400,8 @@ static const struct pptable_funcs 
> sienna_cichlid_ppt_funcs = {
>  	.get_default_config_table_settings = 
> sienna_cichlid_get_default_config_table_settings,
>  	.set_config_table = sienna_cichlid_set_config_table,
>  	.get_unique_id = sienna_cichlid_get_unique_id,
> +	.mode2_reset_is_support =
> sienna_cichlid_is_mode2_reset_supported,
> +	.mode2_reset = sienna_cichlid_mode2_reset,
>  };
> 
>  void sienna_cichlid_set_ppt_funcs(struct smu_context *smu)
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH v2 6/6] drm/amdgpu: reduce reset time
  2022-07-29  6:11   ` Lazar, Lijo
@ 2022-07-29  9:27     ` Zhao, Victor
  2022-07-29  9:39       ` Lazar, Lijo
  0 siblings, 1 reply; 13+ messages in thread
From: Zhao, Victor @ 2022-07-29  9:27 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx; +Cc: Deng, Emily, Grodzovsky, Andrey

[AMD Official Use Only - General]

Hi Lijo,

Can you be more specific why other gpus can be affected? I don’t have a xgmi system on my side. I thought reset_domain is per device.


Thanks,
Victor



-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@amd.com> 
Sent: Friday, July 29, 2022 2:11 PM
To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deng, Emily <Emily.Deng@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: Re: [PATCH v2 6/6] drm/amdgpu: reduce reset time



On 7/28/2022 4:00 PM, Victor Zhao wrote:
> In multi container use case, reset time is important, so skip ring 
> tests and cp halt wait during ip suspending for reset as they are 
> going to fail and cost more time on reset
> 
> v2: add a hang flag to indicate the reset comes from a job timeout, 
> skip ring test and cp halt wait in this case
> 
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c    | 11 +++++++++--
>   5 files changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 222d3d7ea076..c735a17c6afb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -27,6 +27,7 @@
>   #include "amdgpu_gfx.h"
>   #include "amdgpu_rlc.h"
>   #include "amdgpu_ras.h"
> +#include "amdgpu_reset.h"
>   
>   /* delay 0.1 second to enable gfx off feature */
>   #define GFX_OFF_DELAY_ENABLE         msecs_to_jiffies(100)
> @@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
>   		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
>   					   RESET_QUEUES, 0, 0);
>   
> -	if (adev->gfx.kiq.ring.sched.ready)
> +	if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && 
> +adev->reset_domain->hang))
>   		r = amdgpu_ring_test_helper(kiq_ring);

On a system with multiple GPUs interconnected, this will affect other GPUs as well on which job was not really running. I guess your usecase here is device specific.

Thanks,
Lijo

>   	spin_unlock(&adev->gfx.kiq.ring_lock);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 6c3e7290153f..bb40880a557f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	}
>   
>   	memset(&ti, 0, sizeof(struct amdgpu_task_info));
> +	adev->reset_domain->hang = true;
>   
>   	if (amdgpu_gpu_recovery &&
>   	    amdgpu_ring_soft_recovery(ring, job->vmid, 
> s_job->s_fence->parent)) { @@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	}
>   
>   exit:
> +	adev->reset_domain->hang = false;
>   	drm_dev_exit(idx);
>   	return DRM_GPU_SCHED_STAT_NOMINAL;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 9da5ead50c90..b828fe773f50 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -155,6 +155,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>   	atomic_set(&reset_domain->in_gpu_reset, 0);
>   	atomic_set(&reset_domain->reset_res, 0);
>   	init_rwsem(&reset_domain->sem);
> +	reset_domain->hang = false;
>   
>   	return reset_domain;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index cc4b2eeb24cf..29e324add552 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -84,6 +84,7 @@ struct amdgpu_reset_domain {
>   	struct rw_semaphore sem;
>   	atomic_t in_gpu_reset;
>   	atomic_t reset_res;
> +	bool hang;
>   };
>   
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index fafbad3cf08d..a384e04d916c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -29,6 +29,7 @@
>   #include "amdgpu.h"
>   #include "amdgpu_gfx.h"
>   #include "amdgpu_psp.h"
> +#include "amdgpu_reset.h"
>   #include "nv.h"
>   #include "nvd.h"
>   
> @@ -5971,6 +5972,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
>   		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
>   	}
>   
> +	if ((amdgpu_in_reset(adev) && adev->reset_domain->hang) && !enable)
> +		return 0;
> +
>   	for (i = 0; i < adev->usec_timeout; i++) {
>   		if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
>   			break;
> @@ -7569,8 +7573,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
>   	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>   		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
>   					   PREEMPT_QUEUES, 0, 0);
> -
> -	return amdgpu_ring_test_helper(kiq_ring);
> +	if (!(amdgpu_in_reset(adev) && adev->reset_domain->hang))
> +		return amdgpu_ring_test_helper(kiq_ring);
> +	else
> +		return 0;
>   }
>   #endif
>   
> @@ -7610,6 +7616,7 @@ static int gfx_v10_0_hw_fini(void *handle)
>   
>   		return 0;
>   	}
> +
>   	gfx_v10_0_cp_enable(adev, false);
>   	gfx_v10_0_enable_gui_idle_interrupt(adev, false);
>   
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH v2 6/6] drm/amdgpu: reduce reset time
  2022-07-29  9:27     ` Zhao, Victor
@ 2022-07-29  9:39       ` Lazar, Lijo
  0 siblings, 0 replies; 13+ messages in thread
From: Lazar, Lijo @ 2022-07-29  9:39 UTC (permalink / raw)
  To: Zhao, Victor, amd-gfx; +Cc: Deng, Emily, Grodzovsky, Andrey

[AMD Official Use Only - General]

XGMI connected GPUs share the same reset domain software object. If the devices are not connected with each other, there is one domain per device.

Thanks,
Lijo

-----Original Message-----
From: Zhao, Victor <Victor.Zhao@amd.com> 
Sent: Friday, July 29, 2022 2:58 PM
To: Lazar, Lijo <Lijo.Lazar@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deng, Emily <Emily.Deng@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: RE: [PATCH v2 6/6] drm/amdgpu: reduce reset time

[AMD Official Use Only - General]

Hi Lijo,

Can you be more specific why other gpus can be affected? I don’t have a xgmi system on my side. I thought reset_domain is per device.


Thanks,
Victor



-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@amd.com> 
Sent: Friday, July 29, 2022 2:11 PM
To: Zhao, Victor <Victor.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deng, Emily <Emily.Deng@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: Re: [PATCH v2 6/6] drm/amdgpu: reduce reset time



On 7/28/2022 4:00 PM, Victor Zhao wrote:
> In multi container use case, reset time is important, so skip ring 
> tests and cp halt wait during ip suspending for reset as they are 
> going to fail and cost more time on reset
> 
> v2: add a hang flag to indicate the reset comes from a job timeout, 
> skip ring test and cp halt wait in this case
> 
> Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c    | 11 +++++++++--
>   5 files changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 222d3d7ea076..c735a17c6afb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -27,6 +27,7 @@
>   #include "amdgpu_gfx.h"
>   #include "amdgpu_rlc.h"
>   #include "amdgpu_ras.h"
> +#include "amdgpu_reset.h"
>   
>   /* delay 0.1 second to enable gfx off feature */
>   #define GFX_OFF_DELAY_ENABLE         msecs_to_jiffies(100)
> @@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
>   		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
>   					   RESET_QUEUES, 0, 0);
>   
> -	if (adev->gfx.kiq.ring.sched.ready)
> +	if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && 
> +adev->reset_domain->hang))
>   		r = amdgpu_ring_test_helper(kiq_ring);

On a system with multiple GPUs interconnected, this will affect other GPUs as well on which job was not really running. I guess your usecase here is device specific.

Thanks,
Lijo

>   	spin_unlock(&adev->gfx.kiq.ring_lock);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 6c3e7290153f..bb40880a557f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	}
>   
>   	memset(&ti, 0, sizeof(struct amdgpu_task_info));
> +	adev->reset_domain->hang = true;
>   
>   	if (amdgpu_gpu_recovery &&
>   	    amdgpu_ring_soft_recovery(ring, job->vmid, 
> s_job->s_fence->parent)) { @@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	}
>   
>   exit:
> +	adev->reset_domain->hang = false;
>   	drm_dev_exit(idx);
>   	return DRM_GPU_SCHED_STAT_NOMINAL;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 9da5ead50c90..b828fe773f50 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -155,6 +155,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>   	atomic_set(&reset_domain->in_gpu_reset, 0);
>   	atomic_set(&reset_domain->reset_res, 0);
>   	init_rwsem(&reset_domain->sem);
> +	reset_domain->hang = false;
>   
>   	return reset_domain;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index cc4b2eeb24cf..29e324add552 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -84,6 +84,7 @@ struct amdgpu_reset_domain {
>   	struct rw_semaphore sem;
>   	atomic_t in_gpu_reset;
>   	atomic_t reset_res;
> +	bool hang;
>   };
>   
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index fafbad3cf08d..a384e04d916c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -29,6 +29,7 @@
>   #include "amdgpu.h"
>   #include "amdgpu_gfx.h"
>   #include "amdgpu_psp.h"
> +#include "amdgpu_reset.h"
>   #include "nv.h"
>   #include "nvd.h"
>   
> @@ -5971,6 +5972,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
>   		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
>   	}
>   
> +	if ((amdgpu_in_reset(adev) && adev->reset_domain->hang) && !enable)
> +		return 0;
> +
>   	for (i = 0; i < adev->usec_timeout; i++) {
>   		if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
>   			break;
> @@ -7569,8 +7573,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
>   	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>   		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
>   					   PREEMPT_QUEUES, 0, 0);
> -
> -	return amdgpu_ring_test_helper(kiq_ring);
> +	if (!(amdgpu_in_reset(adev) && adev->reset_domain->hang))
> +		return amdgpu_ring_test_helper(kiq_ring);
> +	else
> +		return 0;
>   }
>   #endif
>   
> @@ -7610,6 +7616,7 @@ static int gfx_v10_0_hw_fini(void *handle)
>   
>   		return 0;
>   	}
> +
>   	gfx_v10_0_cp_enable(adev, false);
>   	gfx_v10_0_enable_gui_idle_interrupt(adev, false);
>   
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-07-29  9:40 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-28 10:30 [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid Victor Zhao
2022-07-28 10:30 ` [PATCH v2 2/6] drm/amdgpu: let mode2 reset fallback to default when failure Victor Zhao
2022-07-28 10:30 ` [PATCH v2 3/6] drm/amdgpu: add debugfs amdgpu_reset_level Victor Zhao
2022-07-28 10:30 ` [PATCH v2 4/6] drm/amdgpu: save and restore gc hub regs Victor Zhao
2022-07-28 10:30 ` [PATCH v2 5/6] drm/amdgpu: revert context to stop engine before mode2 reset Victor Zhao
2022-07-28 10:30 ` [PATCH v2 6/6] drm/amdgpu: reduce reset time Victor Zhao
2022-07-28 18:37   ` Andrey Grodzovsky
2022-07-29  6:11   ` Lazar, Lijo
2022-07-29  9:27     ` Zhao, Victor
2022-07-29  9:39       ` Lazar, Lijo
2022-07-28 18:39 ` [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid Andrey Grodzovsky
2022-07-29  6:05 ` Quan, Evan
2022-07-29  9:05   ` Zhao, Victor

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.