All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/10] drm/amdgpu: remove ras global recovery handling from ras_controller_int handler
@ 2019-11-27  9:15 ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: guchun.chen-5C7GfCeVMHo, tao.zhou1-5C7GfCeVMHo, Le Ma,
	alexander.deucher-5C7GfCeVMHo, dennis.li-5C7GfCeVMHo,
	hawking.zhang-5C7GfCeVMHo

From: Le Ma <Le.Ma@amd.com>

v2: add notification when ras controller interrupt generates

Change-Id: Ic03e42e9d1c4dab1fa7f4817c191a16e485b48a9
Signed-off-by: Le Ma <Le.Ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 0db458f..25231d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -324,7 +324,12 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
 						RAS_CNTLR_INTERRUPT_CLEAR, 1);
 		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
 
-		amdgpu_ras_global_ras_isr(adev);
+		DRM_WARN("RAS controller interrupt triggered by NBIF error\n");
+
+		/* ras_controller_int is dedicated for nbif ras error,
+		 * not the global interrupt for sync flood
+		 */
+		amdgpu_ras_reset_gpu(adev, true);
 	}
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 01/10] drm/amdgpu: remove ras global recovery handling from ras_controller_int handler
@ 2019-11-27  9:15 ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx
  Cc: guchun.chen, tao.zhou1, Le Ma, alexander.deucher, dennis.li,
	hawking.zhang

From: Le Ma <Le.Ma@amd.com>

v2: add notification when ras controller interrupt generates

Change-Id: Ic03e42e9d1c4dab1fa7f4817c191a16e485b48a9
Signed-off-by: Le Ma <Le.Ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 0db458f..25231d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -324,7 +324,12 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
 						RAS_CNTLR_INTERRUPT_CLEAR, 1);
 		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
 
-		amdgpu_ras_global_ras_isr(adev);
+		DRM_WARN("RAS controller interrupt triggered by NBIF error\n");
+
+		/* ras_controller_int is dedicated for nbif ras error,
+		 * not the global interrupt for sync flood
+		 */
+		amdgpu_ras_reset_gpu(adev, true);
 	}
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 02/10] drm/amdgpu: export amdgpu_ras_find_obj to use externally
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: guchun.chen-5C7GfCeVMHo, tao.zhou1-5C7GfCeVMHo, Le Ma,
	alexander.deucher-5C7GfCeVMHo, dennis.li-5C7GfCeVMHo,
	hawking.zhang-5C7GfCeVMHo

Change it to external interface.

Change-Id: I2ab61f149c84a05a6f883a4c7415ea8012ec03a6
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 +++
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1593564..04394c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -198,9 +198,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 	return 0;
 }
 
-static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
-		struct ras_common_if *head);
-
 /**
  * DOC: AMDGPU RAS debugfs control interface
  *
@@ -445,7 +442,7 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
 }
 
 /* return an obj equal to head, or the first when head is NULL */
-static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
+struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
 		struct ras_common_if *head)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index f80fd34..a2c1ac1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -611,6 +611,9 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
 		struct ras_dispatch_if *info);
 
+struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
+		struct ras_common_if *head);
+
 extern atomic_t amdgpu_ras_in_intr;
 
 static inline bool amdgpu_ras_intr_triggered(void)
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 02/10] drm/amdgpu: export amdgpu_ras_find_obj to use externally
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx
  Cc: guchun.chen, tao.zhou1, Le Ma, alexander.deucher, dennis.li,
	hawking.zhang

Change it to external interface.

Change-Id: I2ab61f149c84a05a6f883a4c7415ea8012ec03a6
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 +++
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1593564..04394c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -198,9 +198,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 	return 0;
 }
 
-static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
-		struct ras_common_if *head);
-
 /**
  * DOC: AMDGPU RAS debugfs control interface
  *
@@ -445,7 +442,7 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
 }
 
 /* return an obj equal to head, or the first when head is NULL */
-static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
+struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
 		struct ras_common_if *head)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index f80fd34..a2c1ac1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -611,6 +611,9 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
 		struct ras_dispatch_if *info);
 
+struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
+		struct ras_common_if *head);
+
 extern atomic_t amdgpu_ras_in_intr;
 
 static inline bool amdgpu_ras_intr_triggered(void)
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 03/10] drm/amdgpu: clear ras controller status registers when interrupt occurs
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: guchun.chen-5C7GfCeVMHo, tao.zhou1-5C7GfCeVMHo, Le Ma,
	alexander.deucher-5C7GfCeVMHo, dennis.li-5C7GfCeVMHo,
	hawking.zhang-5C7GfCeVMHo

Fix the issue that the RAS controller interrupt can no longer be triggered after
a one-time NBIF uncorrectable error. The error count is stored in the NBIF RAS
object for query.

Change-Id: Iba482c169fdff3e9c390072c0289a622a522133c
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 25231d6..9a3a65a 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -52,6 +52,9 @@
 #define BIF_MMSCH1_DOORBELL_RANGE__OFFSET_MASK          0x00000FFCL
 #define BIF_MMSCH1_DOORBELL_RANGE__SIZE_MASK            0x001F0000L
 
+static void nbio_v7_4_query_ras_error_count(struct amdgpu_device *adev,
+					void *ras_error_status);
+
 static void nbio_v7_4_remap_hdp_registers(struct amdgpu_device *adev)
 {
 	WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL,
@@ -314,6 +317,7 @@ static void nbio_v7_4_init_registers(struct amdgpu_device *adev)
 static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device *adev)
 {
 	uint32_t bif_doorbell_intr_cntl;
+	struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
 
 	bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL);
 	if (REG_GET_FIELD(bif_doorbell_intr_cntl,
@@ -324,6 +328,12 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
 						RAS_CNTLR_INTERRUPT_CLEAR, 1);
 		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
 
+		/*
+		 * clear error status after ras_controller_intr according to
+		 * hw team and count ue number for query
+		 */
+		nbio_v7_4_query_ras_error_count(adev, &obj->err_data);
+
 		DRM_WARN("RAS controller interrupt triggered by NBIF error\n");
 
 		/* ras_controller_int is dedicated for nbif ras error,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 03/10] drm/amdgpu: clear ras controller status registers when interrupt occurs
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx
  Cc: guchun.chen, tao.zhou1, Le Ma, alexander.deucher, dennis.li,
	hawking.zhang

Fix the issue that the RAS controller interrupt can no longer be triggered after
a one-time NBIF uncorrectable error. The error count is stored in the NBIF RAS
object for query.

Change-Id: Iba482c169fdff3e9c390072c0289a622a522133c
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 25231d6..9a3a65a 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -52,6 +52,9 @@
 #define BIF_MMSCH1_DOORBELL_RANGE__OFFSET_MASK          0x00000FFCL
 #define BIF_MMSCH1_DOORBELL_RANGE__SIZE_MASK            0x001F0000L
 
+static void nbio_v7_4_query_ras_error_count(struct amdgpu_device *adev,
+					void *ras_error_status);
+
 static void nbio_v7_4_remap_hdp_registers(struct amdgpu_device *adev)
 {
 	WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL,
@@ -314,6 +317,7 @@ static void nbio_v7_4_init_registers(struct amdgpu_device *adev)
 static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device *adev)
 {
 	uint32_t bif_doorbell_intr_cntl;
+	struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
 
 	bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL);
 	if (REG_GET_FIELD(bif_doorbell_intr_cntl,
@@ -324,6 +328,12 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
 						RAS_CNTLR_INTERRUPT_CLEAR, 1);
 		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
 
+		/*
+		 * clear error status after ras_controller_intr according to
+		 * hw team and count ue number for query
+		 */
+		nbio_v7_4_query_ras_error_count(adev, &obj->err_data);
+
 		DRM_WARN("RAS controller interrupt triggered by NBIF error\n");
 
 		/* ras_controller_int is dedicated for nbif ras error,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: guchun.chen-5C7GfCeVMHo, tao.zhou1-5C7GfCeVMHo, Le Ma,
	alexander.deucher-5C7GfCeVMHo, dennis.li-5C7GfCeVMHo,
	hawking.zhang-5C7GfCeVMHo

This operation is needed during BACO entry/exit for RAS recovery.

Change-Id: I535c7231693f3138a8e3d5acd55672e2ac68232f
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b1408c5..bd387bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4308,10 +4308,14 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
 int amdgpu_device_baco_enter(struct drm_device *dev)
 {
 	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
 	if (!amdgpu_device_supports_baco(adev->ddev))
 		return -ENOTSUPP;
 
+	if (ras && ras->supported)
+		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
+
 	if (is_support_sw_smu(adev)) {
 		struct smu_context *smu = &adev->smu;
 		int ret;
@@ -4319,8 +4323,6 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
 		ret = smu_baco_enter(smu);
 		if (ret)
 			return ret;
-
-		return 0;
 	} else {
 		void *pp_handle = adev->powerplay.pp_handle;
 		const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
@@ -4331,14 +4333,15 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
 		/* enter BACO state */
 		if (pp_funcs->set_asic_baco_state(pp_handle, 1))
 			return -EIO;
-
-		return 0;
 	}
+
+	return 0;
 }
 
 int amdgpu_device_baco_exit(struct drm_device *dev)
 {
 	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
 	if (!amdgpu_device_supports_baco(adev->ddev))
 		return -ENOTSUPP;
@@ -4351,7 +4354,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
 		if (ret)
 			return ret;
 
-		return 0;
 	} else {
 		void *pp_handle = adev->powerplay.pp_handle;
 		const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
@@ -4362,7 +4364,10 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
 		/* exit BACO state */
 		if (pp_funcs->set_asic_baco_state(pp_handle, 0))
 			return -EIO;
-
-		return 0;
 	}
+
+	if (ras && ras->supported)
+		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
+
+	return 0;
 }
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx
  Cc: guchun.chen, tao.zhou1, Le Ma, alexander.deucher, dennis.li,
	hawking.zhang

This operation is needed during BACO entry/exit for RAS recovery.

Change-Id: I535c7231693f3138a8e3d5acd55672e2ac68232f
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b1408c5..bd387bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4308,10 +4308,14 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
 int amdgpu_device_baco_enter(struct drm_device *dev)
 {
 	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
 	if (!amdgpu_device_supports_baco(adev->ddev))
 		return -ENOTSUPP;
 
+	if (ras && ras->supported)
+		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
+
 	if (is_support_sw_smu(adev)) {
 		struct smu_context *smu = &adev->smu;
 		int ret;
@@ -4319,8 +4323,6 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
 		ret = smu_baco_enter(smu);
 		if (ret)
 			return ret;
-
-		return 0;
 	} else {
 		void *pp_handle = adev->powerplay.pp_handle;
 		const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
@@ -4331,14 +4333,15 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
 		/* enter BACO state */
 		if (pp_funcs->set_asic_baco_state(pp_handle, 1))
 			return -EIO;
-
-		return 0;
 	}
+
+	return 0;
 }
 
 int amdgpu_device_baco_exit(struct drm_device *dev)
 {
 	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
 	if (!amdgpu_device_supports_baco(adev->ddev))
 		return -ENOTSUPP;
@@ -4351,7 +4354,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
 		if (ret)
 			return ret;
 
-		return 0;
 	} else {
 		void *pp_handle = adev->powerplay.pp_handle;
 		const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
@@ -4362,7 +4364,10 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
 		/* exit BACO state */
 		if (pp_funcs->set_asic_baco_state(pp_handle, 0))
 			return -EIO;
-
-		return 0;
 	}
+
+	if (ras && ras->supported)
+		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
+
+	return 0;
 }
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: guchun.chen-5C7GfCeVMHo, tao.zhou1-5C7GfCeVMHo, Le Ma,
	alexander.deucher-5C7GfCeVMHo, dennis.li-5C7GfCeVMHo,
	hawking.zhang-5C7GfCeVMHo

Avoid changing the default reset behavior for production cards by checking that
amdgpu_ras_enable equals 2. Only sufficiently new SMU firmware can support BACO
for the XGMI/RAS case.

Change-Id: I07c3e6862be03e068745c73db8ea71f428ecba6b
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 951327f..6202333 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -577,7 +577,9 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
 			struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
 			struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-			if (hive || (ras && ras->supported))
+			if ((hive || (ras && ras->supported)) &&
+			    (amdgpu_ras_enable != 2 ||
+			    adev->pm.fw_version <= 0x283400))
 				baco_reset = false;
 		}
 		break;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx
  Cc: guchun.chen, tao.zhou1, Le Ma, alexander.deucher, dennis.li,
	hawking.zhang

Avoid changing the default reset behavior for production cards by checking that
amdgpu_ras_enable equals 2. Only sufficiently new SMU firmware can support BACO
for the XGMI/RAS case.

Change-Id: I07c3e6862be03e068745c73db8ea71f428ecba6b
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 951327f..6202333 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -577,7 +577,9 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
 			struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
 			struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-			if (hive || (ras && ras->supported))
+			if ((hive || (ras && ras->supported)) &&
+			    (amdgpu_ras_enable != 2 ||
+			    adev->pm.fw_version <= 0x283400))
 				baco_reset = false;
 		}
 		break;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: guchun.chen-5C7GfCeVMHo, tao.zhou1-5C7GfCeVMHo, Le Ma,
	alexander.deucher-5C7GfCeVMHo, dennis.li-5C7GfCeVMHo,
	hawking.zhang-5C7GfCeVMHo

Currently each XGMI node's reset work queue does not run in parallel, because the
same work item bound to the same CPU runs in sequence. So change to bind the
xgmi_reset_work items to different CPUs.

XGMI requires all nodes to enter BACO within very close proximity of each other
before any node exits BACO. So schedule the xgmi_reset_work wq twice, for BACO
entry and exit respectively.

The default reset code path and methods do not change for vega20 production:
  - baco reset without xgmi/ras
  - psp reset with xgmi/ras

To enable BACO for the XGMI/RAS case, both conditions below are needed:
  - amdgpu_ras_enable=2
  - baco-supported smu firmware

The case where PSP reset and BACO reset coexist within an XGMI hive is not taken
into consideration.

Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ++++++++++++++++++++++++++----
 2 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d120fe5..08929e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -998,6 +998,8 @@ struct amdgpu_device {
 	int				pstate;
 	/* enable runtime pm on the device */
 	bool                            runpm;
+
+	bool				in_baco;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bd387bb..71abfe9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2654,7 +2654,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 	struct amdgpu_device *adev =
 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
 
-	adev->asic_reset_res =  amdgpu_asic_reset(adev);
+	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
+		adev->asic_reset_res = (adev->in_baco == false) ?
+				amdgpu_device_baco_enter(adev->ddev) :
+				amdgpu_device_baco_exit(adev->ddev);
+	else
+		adev->asic_reset_res = amdgpu_asic_reset(adev);
+
 	if (adev->asic_reset_res)
 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 			 adev->asic_reset_res, adev->ddev->unique);
@@ -3796,6 +3802,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
 	int r = 0;
+	int cpu = smp_processor_id();
 
 	/*
 	 * ASIC reset has to be done on all HGMI hive nodes ASAP
@@ -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 	 */
 	if (need_full_reset) {
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-			/* For XGMI run all resets in parallel to speed up the process */
+			/*
+			 * For XGMI run all resets in parallel to speed up the
+			 * process by scheduling the highpri wq on different
+			 * cpus. For XGMI with baco reset, all nodes must enter
+			 * baco within close proximity before anyone exit.
+			 */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+				if (!queue_work_on(cpu, system_highpri_wq,
+						   &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
+				cpu = cpumask_next(cpu, cpu_online_mask);
 			} else
 				r = amdgpu_asic_reset(tmp_adev);
-
-			if (r) {
-				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
-					 r, tmp_adev->ddev->unique);
+			if (r)
 				break;
-			}
 		}
 
-		/* For XGMI wait for all PSP resets to complete before proceed */
+		/* For XGMI wait for all work to complete before proceed */
 		if (!r) {
 			list_for_each_entry(tmp_adev, device_list_handle,
 					    gmc.xgmi.head) {
@@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 					r = tmp_adev->asic_reset_res;
 					if (r)
 						break;
+					if(AMD_RESET_METHOD_BACO ==
+					   amdgpu_asic_reset_method(tmp_adev))
+						tmp_adev->in_baco = true;
 				}
 			}
 		}
-	}
 
+		/*
+		 * For XGMI with baco reset, need exit baco phase by scheduling
+		 * xgmi_reset_work one more time. PSP reset skips this phase.
+		 * Not assume the situation that PSP reset and baco reset
+		 * coexist within an XGMI hive.
+		 */
+
+		if (!r) {
+			cpu = smp_processor_id();
+			list_for_each_entry(tmp_adev, device_list_handle,
+					    gmc.xgmi.head) {
+				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
+				    && AMD_RESET_METHOD_BACO ==
+				    amdgpu_asic_reset_method(tmp_adev)) {
+					if (!queue_work_on(cpu,
+						system_highpri_wq,
+						&tmp_adev->xgmi_reset_work))
+						r = -EALREADY;
+					if (r)
+						break;
+					cpu = cpumask_next(cpu, cpu_online_mask);
+				}
+			}
+		}
+
+		if (!r) {
+			list_for_each_entry(tmp_adev, device_list_handle,
+					    gmc.xgmi.head) {
+				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
+				    && AMD_RESET_METHOD_BACO ==
+				    amdgpu_asic_reset_method(tmp_adev)) {
+					flush_work(&tmp_adev->xgmi_reset_work);
+					r = tmp_adev->asic_reset_res;
+					if (r)
+						break;
+					tmp_adev->in_baco = false;
+				}
+			}
+		}
+
+		if (r) {
+			DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+				 r, tmp_adev->ddev->unique);
+			goto end;
+		}
+	}
 
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (need_full_reset) {
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx
  Cc: guchun.chen, tao.zhou1, Le Ma, alexander.deucher, dennis.li,
	hawking.zhang

Currently each XGMI node's reset work queue does not run in parallel, because the
same work item bound to the same CPU runs in sequence. So change to bind the
xgmi_reset_work items to different CPUs.

XGMI requires all nodes to enter BACO within very close proximity of each other
before any node exits BACO. So schedule the xgmi_reset_work wq twice, for BACO
entry and exit respectively.

The default reset code path and methods do not change for vega20 production:
  - baco reset without xgmi/ras
  - psp reset with xgmi/ras

To enable BACO for the XGMI/RAS case, both conditions below are needed:
  - amdgpu_ras_enable=2
  - baco-supported smu firmware

The case where PSP reset and BACO reset coexist within an XGMI hive is not taken
into consideration.

Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ++++++++++++++++++++++++++----
 2 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d120fe5..08929e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -998,6 +998,8 @@ struct amdgpu_device {
 	int				pstate;
 	/* enable runtime pm on the device */
 	bool                            runpm;
+
+	bool				in_baco;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bd387bb..71abfe9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2654,7 +2654,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 	struct amdgpu_device *adev =
 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
 
-	adev->asic_reset_res =  amdgpu_asic_reset(adev);
+	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
+		adev->asic_reset_res = (adev->in_baco == false) ?
+				amdgpu_device_baco_enter(adev->ddev) :
+				amdgpu_device_baco_exit(adev->ddev);
+	else
+		adev->asic_reset_res = amdgpu_asic_reset(adev);
+
 	if (adev->asic_reset_res)
 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 			 adev->asic_reset_res, adev->ddev->unique);
@@ -3796,6 +3802,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
 	int r = 0;
+	int cpu = smp_processor_id();
 
 	/*
 	 * ASIC reset has to be done on all HGMI hive nodes ASAP
@@ -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 	 */
 	if (need_full_reset) {
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-			/* For XGMI run all resets in parallel to speed up the process */
+			/*
+			 * For XGMI run all resets in parallel to speed up the
+			 * process by scheduling the highpri wq on different
+			 * cpus. For XGMI with baco reset, all nodes must enter
+			 * baco within close proximity before anyone exit.
+			 */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+				if (!queue_work_on(cpu, system_highpri_wq,
+						   &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
+				cpu = cpumask_next(cpu, cpu_online_mask);
 			} else
 				r = amdgpu_asic_reset(tmp_adev);
-
-			if (r) {
-				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
-					 r, tmp_adev->ddev->unique);
+			if (r)
 				break;
-			}
 		}
 
-		/* For XGMI wait for all PSP resets to complete before proceed */
+		/* For XGMI wait for all work to complete before proceed */
 		if (!r) {
 			list_for_each_entry(tmp_adev, device_list_handle,
 					    gmc.xgmi.head) {
@@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 					r = tmp_adev->asic_reset_res;
 					if (r)
 						break;
+					if(AMD_RESET_METHOD_BACO ==
+					   amdgpu_asic_reset_method(tmp_adev))
+						tmp_adev->in_baco = true;
 				}
 			}
 		}
-	}
 
+		/*
+		 * For XGMI with baco reset, need exit baco phase by scheduling
+		 * xgmi_reset_work one more time. PSP reset skips this phase.
+		 * Not assume the situation that PSP reset and baco reset
+		 * coexist within an XGMI hive.
+		 */
+
+		if (!r) {
+			cpu = smp_processor_id();
+			list_for_each_entry(tmp_adev, device_list_handle,
+					    gmc.xgmi.head) {
+				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
+				    && AMD_RESET_METHOD_BACO ==
+				    amdgpu_asic_reset_method(tmp_adev)) {
+					if (!queue_work_on(cpu,
+						system_highpri_wq,
+						&tmp_adev->xgmi_reset_work))
+						r = -EALREADY;
+					if (r)
+						break;
+					cpu = cpumask_next(cpu, cpu_online_mask);
+				}
+			}
+		}
+
+		if (!r) {
+			list_for_each_entry(tmp_adev, device_list_handle,
+					    gmc.xgmi.head) {
+				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
+				    && AMD_RESET_METHOD_BACO ==
+				    amdgpu_asic_reset_method(tmp_adev)) {
+					flush_work(&tmp_adev->xgmi_reset_work);
+					r = tmp_adev->asic_reset_res;
+					if (r)
+						break;
+					tmp_adev->in_baco = false;
+				}
+			}
+		}
+
+		if (r) {
+			DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+				 r, tmp_adev->ddev->unique);
+			goto end;
+		}
+	}
 
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (need_full_reset) {
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 08/10] drm/amdgpu: support full gpu reset workflow when ras err_event_athub occurs
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: guchun.chen-5C7GfCeVMHo, tao.zhou1-5C7GfCeVMHo, Le Ma,
	alexander.deucher-5C7GfCeVMHo, dennis.li-5C7GfCeVMHo,
	hawking.zhang-5C7GfCeVMHo

This athub fatal error can be recovered by baco without a system-level reboot,
so add a mode to use baco for the recovery. This does not affect the default
psp reset situations for now.

Change-Id: Ib17f2a39254ff6b0473a785752adfdfea79d0e0d
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 71abfe9..53e9590 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4021,12 +4021,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	struct amdgpu_device *tmp_adev = NULL;
 	int i, r = 0;
 	bool in_ras_intr = amdgpu_ras_intr_triggered();
+	bool use_baco =
+		(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
+		true : false;
 
 	/*
 	 * Flush RAM to disk so that after reboot
 	 * the user can read log and see why the system rebooted.
 	 */
-	if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
+	if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) {
 
 		DRM_WARN("Emergency reboot.");
 
@@ -4037,7 +4040,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	need_full_reset = job_signaled = false;
 	INIT_LIST_HEAD(&device_list);
 
-	dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");
+	dev_info(adev->dev, "GPU %s begin!\n",
+		(in_ras_intr && !use_baco) ? "jobs stop":"reset");
 
 	cancel_delayed_work_sync(&adev->delayed_init_work);
 
@@ -4104,7 +4108,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		amdgpu_unregister_gpu_instance(tmp_adev);
 
 		/* disable ras on ALL IPs */
-		if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev))
+		if (!(in_ras_intr && !use_baco) &&
+		      amdgpu_device_ip_need_full_reset(tmp_adev))
 			amdgpu_ras_suspend(tmp_adev);
 
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
@@ -4115,13 +4120,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
 
-			if (in_ras_intr)
+			if (in_ras_intr && !use_baco)
 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
 		}
 	}
 
 
-	if (in_ras_intr)
+	if (in_ras_intr && !use_baco)
 		goto skip_sched_resume;
 
 	/*
@@ -4214,7 +4219,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 skip_sched_resume:
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		/*unlock kfd: SRIOV would do it separately */
-		if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev))
+		if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
 	                amdgpu_amdkfd_post_reset(tmp_adev);
 		amdgpu_device_unlock_adev(tmp_adev);
 	}
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 08/10] drm/amdgpu: support full gpu reset workflow when ras err_event_athub occurs
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx
  Cc: guchun.chen, tao.zhou1, Le Ma, alexander.deucher, dennis.li,
	hawking.zhang

This athub fatal error can be recovered by baco without a system-level reboot,
so add a mode to use baco for the recovery. This does not affect the default
psp reset situations for now.

Change-Id: Ib17f2a39254ff6b0473a785752adfdfea79d0e0d
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 71abfe9..53e9590 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4021,12 +4021,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	struct amdgpu_device *tmp_adev = NULL;
 	int i, r = 0;
 	bool in_ras_intr = amdgpu_ras_intr_triggered();
+	bool use_baco =
+		(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
+		true : false;
 
 	/*
 	 * Flush RAM to disk so that after reboot
 	 * the user can read log and see why the system rebooted.
 	 */
-	if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
+	if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) {
 
 		DRM_WARN("Emergency reboot.");
 
@@ -4037,7 +4040,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	need_full_reset = job_signaled = false;
 	INIT_LIST_HEAD(&device_list);
 
-	dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");
+	dev_info(adev->dev, "GPU %s begin!\n",
+		(in_ras_intr && !use_baco) ? "jobs stop":"reset");
 
 	cancel_delayed_work_sync(&adev->delayed_init_work);
 
@@ -4104,7 +4108,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		amdgpu_unregister_gpu_instance(tmp_adev);
 
 		/* disable ras on ALL IPs */
-		if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev))
+		if (!(in_ras_intr && !use_baco) &&
+		      amdgpu_device_ip_need_full_reset(tmp_adev))
 			amdgpu_ras_suspend(tmp_adev);
 
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
@@ -4115,13 +4120,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
 
-			if (in_ras_intr)
+			if (in_ras_intr && !use_baco)
 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
 		}
 	}
 
 
-	if (in_ras_intr)
+	if (in_ras_intr && !use_baco)
 		goto skip_sched_resume;
 
 	/*
@@ -4214,7 +4219,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 skip_sched_resume:
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		/*unlock kfd: SRIOV would do it separately */
-		if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev))
+		if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
 	                amdgpu_amdkfd_post_reset(tmp_adev);
 		amdgpu_device_unlock_adev(tmp_adev);
 	}
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 09/10] drm/amdgpu: clear err_event_athub flag after reset exit
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: guchun.chen-5C7GfCeVMHo, tao.zhou1-5C7GfCeVMHo, Le Ma,
	alexander.deucher-5C7GfCeVMHo, dennis.li-5C7GfCeVMHo,
	hawking.zhang-5C7GfCeVMHo

Otherwise the next err_event_athub error cannot trigger a gpu reset. The following
resume sequence will not be affected by this flag.

v2: create function to clear amdgpu_ras_in_intr for modularity of ras driver

Change-Id: I5cd293f30f23876bf2a1860681bcb50f47713ecd
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 53e9590..8387b44 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3890,6 +3890,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 		}
 	}
 
+	if (!r && amdgpu_ras_intr_triggered())
+		amdgpu_ras_intr_cleared();
+
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (need_full_reset) {
 			/* post card */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index a2c1ac1..d4ade47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -621,6 +621,11 @@ static inline bool amdgpu_ras_intr_triggered(void)
 	return !!atomic_read(&amdgpu_ras_in_intr);
 }
 
+static inline void amdgpu_ras_intr_cleared(void)
+{
+	atomic_set(&amdgpu_ras_in_intr, 0);
+}
+
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
 
 #endif
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 09/10] drm/amdgpu: clear err_event_athub flag after reset exit
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx
  Cc: guchun.chen, tao.zhou1, Le Ma, alexander.deucher, dennis.li,
	hawking.zhang

Otherwise the next err_event_athub error cannot trigger a gpu reset. The following
resume sequence will not be affected by this flag.

v2: create function to clear amdgpu_ras_in_intr for modularity of ras driver

Change-Id: I5cd293f30f23876bf2a1860681bcb50f47713ecd
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 53e9590..8387b44 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3890,6 +3890,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 		}
 	}
 
+	if (!r && amdgpu_ras_intr_triggered())
+		amdgpu_ras_intr_cleared();
+
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (need_full_reset) {
 			/* post card */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index a2c1ac1..d4ade47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -621,6 +621,11 @@ static inline bool amdgpu_ras_intr_triggered(void)
 	return !!atomic_read(&amdgpu_ras_in_intr);
 }
 
+static inline void amdgpu_ras_intr_cleared(void)
+{
+	atomic_set(&amdgpu_ras_in_intr, 0);
+}
+
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
 
 #endif
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 10/10] drm/amdgpu: reduce redundant uvd context lost warning message
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: guchun.chen-5C7GfCeVMHo, tao.zhou1-5C7GfCeVMHo, Le Ma,
	alexander.deucher-5C7GfCeVMHo, dennis.li-5C7GfCeVMHo,
	hawking.zhang-5C7GfCeVMHo

Move the print out of the uvd instance loop in amdgpu_uvd_suspend

Change-Id: Ifad997debd84763e1b55d668e144b729598f115e
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index e324bfe..ac7c7795 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
@@ -377,12 +377,15 @@ int amdgpu_uvd_suspend(struct amdgpu_device *adev)
 
 		/* re-write 0 since err_event_athub will corrupt VCPU buffer */
 		if (amdgpu_ras_intr_triggered()) {
-			DRM_WARN("UVD VCPU state may lost due to RAS ERREVENT_ATHUB_INTERRUPT\n");
 			memset(adev->uvd.inst[j].saved_bo, 0, size);
 		} else {
 			memcpy_fromio(adev->uvd.inst[j].saved_bo, ptr, size);
 		}
 	}
+
+	if (amdgpu_ras_intr_triggered()) {
+		DRM_WARN("UVD VCPU state may lost due to RAS ERREVENT_ATHUB_INTERRUPT\n");
+
 	return 0;
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [PATCH 10/10] drm/amdgpu: reduce redundant uvd context lost warning message
@ 2019-11-27  9:15     ` Le Ma
  0 siblings, 0 replies; 57+ messages in thread
From: Le Ma @ 2019-11-27  9:15 UTC (permalink / raw)
  To: amd-gfx
  Cc: guchun.chen, tao.zhou1, Le Ma, alexander.deucher, dennis.li,
	hawking.zhang

Move the print out of the uvd instance loop in amdgpu_uvd_suspend

Change-Id: Ifad997debd84763e1b55d668e144b729598f115e
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index e324bfe..ac7c7795 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
@@ -377,12 +377,15 @@ int amdgpu_uvd_suspend(struct amdgpu_device *adev)
 
 		/* re-write 0 since err_event_athub will corrupt VCPU buffer */
 		if (amdgpu_ras_intr_triggered()) {
-			DRM_WARN("UVD VCPU state may lost due to RAS ERREVENT_ATHUB_INTERRUPT\n");
 			memset(adev->uvd.inst[j].saved_bo, 0, size);
 		} else {
 			memcpy_fromio(adev->uvd.inst[j].saved_bo, ptr, size);
 		}
 	}
+
+	if (amdgpu_ras_intr_triggered()) {
+		DRM_WARN("UVD VCPU state may lost due to RAS ERREVENT_ATHUB_INTERRUPT\n");
+
 	return 0;
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* RE: [PATCH 10/10] drm/amdgpu: reduce redundant uvd context lost warning message
@ 2019-11-27  9:49         ` Chen, Guchun
  0 siblings, 0 replies; 57+ messages in thread
From: Chen, Guchun @ 2019-11-27  9:49 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Ma, Le, Zhou1, Tao, Li, Dennis, Zhang, Hawking

[AMD Official Use Only - Internal Distribution Only]




-----Original Message-----
From: Le Ma <le.ma@amd.com> 
Sent: Wednesday, November 27, 2019 5:15 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: [PATCH 10/10] drm/amdgpu: reduce redundant uvd context lost warning message

Move the print out of uvd instance loop in amdgpu_uvd_suspend

Change-Id: Ifad997debd84763e1b55d668e144b729598f115e
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index e324bfe..ac7c7795 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
@@ -377,12 +377,15 @@ int amdgpu_uvd_suspend(struct amdgpu_device *adev)
 
 		/* re-write 0 since err_event_athub will corrupt VCPU buffer */
 		if (amdgpu_ras_intr_triggered()) {
-			DRM_WARN("UVD VCPU state may lost due to RAS ERREVENT_ATHUB_INTERRUPT\n");
 			memset(adev->uvd.inst[j].saved_bo, 0, size);
 		} else {
 			memcpy_fromio(adev->uvd.inst[j].saved_bo, ptr, size);
 		}
 	}
+
+	if (amdgpu_ras_intr_triggered()) {
+		DRM_WARN("UVD VCPU state may lost due to RAS ERREVENT_ATHUB_INTERRUPT\n");
+
[Guchun]the "{" after the if condition needs to be removed?

 	return 0;
 }
 
-- 
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* RE: [PATCH 10/10] drm/amdgpu: reduce redundant uvd context lost warning message
@ 2019-11-27  9:49         ` Chen, Guchun
  0 siblings, 0 replies; 57+ messages in thread
From: Chen, Guchun @ 2019-11-27  9:49 UTC (permalink / raw)
  To: Ma, Le, amd-gfx
  Cc: Deucher, Alexander, Ma, Le, Zhou1, Tao, Li, Dennis, Zhang, Hawking

[AMD Official Use Only - Internal Distribution Only]




-----Original Message-----
From: Le Ma <le.ma@amd.com> 
Sent: Wednesday, November 27, 2019 5:15 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: [PATCH 10/10] drm/amdgpu: reduce redundant uvd context lost warning message

Move the print out of uvd instance loop in amdgpu_uvd_suspend

Change-Id: Ifad997debd84763e1b55d668e144b729598f115e
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index e324bfe..ac7c7795 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
@@ -377,12 +377,15 @@ int amdgpu_uvd_suspend(struct amdgpu_device *adev)
 
 		/* re-write 0 since err_event_athub will corrupt VCPU buffer */
 		if (amdgpu_ras_intr_triggered()) {
-			DRM_WARN("UVD VCPU state may lost due to RAS ERREVENT_ATHUB_INTERRUPT\n");
 			memset(adev->uvd.inst[j].saved_bo, 0, size);
 		} else {
 			memcpy_fromio(adev->uvd.inst[j].saved_bo, ptr, size);
 		}
 	}
+
+	if (amdgpu_ras_intr_triggered()) {
+		DRM_WARN("UVD VCPU state may lost due to RAS ERREVENT_ATHUB_INTERRUPT\n");
+
[Guchun]the "{" after the if condition needs to be removed?

 	return 0;
 }
 
-- 
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* RE: [PATCH 10/10] drm/amdgpu: reduce redundant uvd context lost warning message
@ 2019-11-27  9:54             ` Ma, Le
  0 siblings, 0 replies; 57+ messages in thread
From: Ma, Le @ 2019-11-27  9:54 UTC (permalink / raw)
  To: Chen, Guchun, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Zhou1, Tao, Li, Dennis, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 3184 bytes --]





-----Original Message-----
From: Chen, Guchun <Guchun.Chen-5C7GfCeVMHo@public.gmane.org>
Sent: Wednesday, November 27, 2019 5:50 PM
To: Ma, Le <Le.Ma-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: Zhang, Hawking <Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>; Zhou1, Tao <Tao.Zhou1-5C7GfCeVMHo@public.gmane.org>; Li, Dennis <Dennis.Li-5C7GfCeVMHo@public.gmane.org>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma-5C7GfCeVMHo@public.gmane.org>
Subject: RE: [PATCH 10/10] drm/amdgpu: reduce redundant uvd context lost warning message



[AMD Official Use Only - Internal Distribution Only]









-----Original Message-----

From: Le Ma <le.ma-5C7GfCeVMHo@public.gmane.org<mailto:le.ma-5C7GfCeVMHo@public.gmane.org>>

Sent: Wednesday, November 27, 2019 5:15 PM

To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>

Cc: Zhang, Hawking <Hawking.Zhang-5C7GfCeVMHo@public.gmane.org<mailto:Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>>; Chen, Guchun <Guchun.Chen-5C7GfCeVMHo@public.gmane.org<mailto:Guchun.Chen-5C7GfCeVMHo@public.gmane.org>>; Zhou1, Tao <Tao.Zhou1-5C7GfCeVMHo@public.gmane.org<mailto:Tao.Zhou1-5C7GfCeVMHo@public.gmane.org>>; Li, Dennis <Dennis.Li-5C7GfCeVMHo@public.gmane.org<mailto:Dennis.Li-5C7GfCeVMHo@public.gmane.org>>; Deucher, Alexander <Alexander.Deucher-5C7GfCeVMHo@public.gmane.org<mailto:Alexander.Deucher-5C7GfCeVMHo@public.gmane.org>>; Ma, Le <Le.Ma-5C7GfCeVMHo@public.gmane.org<mailto:Le.Ma@amd.com>>

Subject: [PATCH 10/10] drm/amdgpu: reduce redundant uvd context lost warning message



Move the print out of uvd instance loop in amdgpu_uvd_suspend



Change-Id: Ifad997debd84763e1b55d668e144b729598f115e

Signed-off-by: Le Ma <le.ma-5C7GfCeVMHo@public.gmane.org<mailto:le.ma-5C7GfCeVMHo@public.gmane.org>>

---

drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 5 ++++-

1 file changed, 4 insertions(+), 1 deletion(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c

index e324bfe..ac7c7795 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c

@@ -377,12 +377,15 @@ int amdgpu_uvd_suspend(struct amdgpu_device *adev)

                        /* re-write 0 since err_event_athub will corrupt VCPU buffer */

                       if (amdgpu_ras_intr_triggered()) {

-                                   DRM_WARN("UVD VCPU state may lost due to RAS ERREVENT_ATHUB_INTERRUPT\n");

                                   memset(adev->uvd.inst[j].saved_bo, 0, size);

                       } else {

                                   memcpy_fromio(adev->uvd.inst[j].saved_bo, ptr, size);

                       }

           }

+

+          if (amdgpu_ras_intr_triggered()) {

+                      DRM_WARN("UVD VCPU state may lost due to RAS ERREVENT_ATHUB_INTERRUPT\n");

+

[Guchun]the "{" after the if condition needs to be removed?

[Le] Yes, sent it too quickly and made a mistake here.

           return 0;

}

--

2.7.4

[-- Attachment #1.2: Type: text/html, Size: 9086 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 10/10] drm/amdgpu: reduce redundant uvd context lost warning message
@ 2019-11-27  9:54             ` Ma, Le
  0 siblings, 0 replies; 57+ messages in thread
From: Ma, Le @ 2019-11-27  9:54 UTC (permalink / raw)
  To: Chen, Guchun, amd-gfx
  Cc: Deucher, Alexander, Zhou1, Tao, Li, Dennis, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 2659 bytes --]





-----Original Message-----
From: Chen, Guchun <Guchun.Chen@amd.com>
Sent: Wednesday, November 27, 2019 5:50 PM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: RE: [PATCH 10/10] drm/amdgpu: reduce redundant uvd context lost warning message



[AMD Official Use Only - Internal Distribution Only]









-----Original Message-----

From: Le Ma <le.ma@amd.com<mailto:le.ma@amd.com>>

Sent: Wednesday, November 27, 2019 5:15 PM

To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>

Cc: Zhang, Hawking <Hawking.Zhang@amd.com<mailto:Hawking.Zhang@amd.com>>; Chen, Guchun <Guchun.Chen@amd.com<mailto:Guchun.Chen@amd.com>>; Zhou1, Tao <Tao.Zhou1@amd.com<mailto:Tao.Zhou1@amd.com>>; Li, Dennis <Dennis.Li@amd.com<mailto:Dennis.Li@amd.com>>; Deucher, Alexander <Alexander.Deucher@amd.com<mailto:Alexander.Deucher@amd.com>>; Ma, Le <Le.Ma@amd.com<mailto:Le.Ma@amd.com>>

Subject: [PATCH 10/10] drm/amdgpu: reduce redundant uvd context lost warning message



Move the print out of uvd instance loop in amdgpu_uvd_suspend



Change-Id: Ifad997debd84763e1b55d668e144b729598f115e

Signed-off-by: Le Ma <le.ma@amd.com<mailto:le.ma@amd.com>>

---

drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 5 ++++-

1 file changed, 4 insertions(+), 1 deletion(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c

index e324bfe..ac7c7795 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c

@@ -377,12 +377,15 @@ int amdgpu_uvd_suspend(struct amdgpu_device *adev)

                        /* re-write 0 since err_event_athub will corrupt VCPU buffer */

                       if (amdgpu_ras_intr_triggered()) {

-                                   DRM_WARN("UVD VCPU state may lost due to RAS ERREVENT_ATHUB_INTERRUPT\n");

                                   memset(adev->uvd.inst[j].saved_bo, 0, size);

                       } else {

                                   memcpy_fromio(adev->uvd.inst[j].saved_bo, ptr, size);

                       }

           }

+

+          if (amdgpu_ras_intr_triggered()) {

+                      DRM_WARN("UVD VCPU state may lost due to RAS ERREVENT_ATHUB_INTERRUPT\n");

+

[Guchun]the "{" after the if condition needs to be removed?

[Le] Yes, sent it too quickly and made a mistake here.

           return 0;

}

--

2.7.4

[-- Attachment #1.2: Type: text/html, Size: 8556 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case
@ 2019-11-27 11:28         ` Zhang, Hawking
  0 siblings, 0 replies; 57+ messages in thread
From: Zhang, Hawking @ 2019-11-27 11:28 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Ma, Le, Zhou1, Tao, Li, Dennis, Chen, Guchun

[AMD Public Use]

After thinking about it a bit, I think we can just rely on the PMFW version to decide whether to go with RAS recovery or legacy fatal_error handling for the platforms that support RAS. Leveraging amdgpu_ras_enable as a temporary solution seems unnecessary. Even if baco RAS recovery is not stable, the result is the same as with legacy fatal_error handling: the user has to reboot the node manually. 

So the new soc reset use cases are:
XGMI (without RAS): use PSP mode1 based chain reset, 
RAS enabled (with PMFW 40.52 and onwards): use BACO based RAS recovery,
RAS enabled (with PMFW prior to 40.52): use legacy fatal_error handling.
 
Anything else?

Regards,
Hawking
-----Original Message-----
From: Le Ma <le.ma@amd.com> 
Sent: 2019年11月27日 17:15
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case

Avoid changing the default reset behavior for production cards by checking whether amdgpu_ras_enable equals 2. Only a new enough smu ucode can support baco for the xgmi/ras case.

Change-Id: I07c3e6862be03e068745c73db8ea71f428ecba6b
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 951327f..6202333 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -577,7 +577,9 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
 			struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
 			struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-			if (hive || (ras && ras->supported))
+			if ((hive || (ras && ras->supported)) &&
+			    (amdgpu_ras_enable != 2 ||
+			    adev->pm.fw_version <= 0x283400))
 				baco_reset = false;
 		}
 		break;
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* RE: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case
@ 2019-11-27 11:28         ` Zhang, Hawking
  0 siblings, 0 replies; 57+ messages in thread
From: Zhang, Hawking @ 2019-11-27 11:28 UTC (permalink / raw)
  To: Ma, Le, amd-gfx
  Cc: Deucher, Alexander, Ma, Le, Zhou1, Tao, Li, Dennis, Chen, Guchun

[AMD Public Use]

After thinking about it a bit, I think we can just rely on the PMFW version to decide whether to go with RAS recovery or legacy fatal_error handling for the platforms that support RAS. Leveraging amdgpu_ras_enable as a temporary solution seems unnecessary. Even if baco RAS recovery is not stable, the result is the same as with legacy fatal_error handling: the user has to reboot the node manually. 

So the new soc reset use cases are:
XGMI (without RAS): use PSP mode1 based chain reset, 
RAS enabled (with PMFW 40.52 and onwards): use BACO based RAS recovery,
RAS enabled (with PMFW prior to 40.52): use legacy fatal_error handling.
 
Anything else?

Regards,
Hawking
-----Original Message-----
From: Le Ma <le.ma@amd.com> 
Sent: 2019年11月27日 17:15
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case

Avoid changing the default reset behavior for production cards by checking whether amdgpu_ras_enable equals 2. Only a new enough smu ucode can support baco for the xgmi/ras case.

Change-Id: I07c3e6862be03e068745c73db8ea71f428ecba6b
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 951327f..6202333 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -577,7 +577,9 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
 			struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
 			struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-			if (hive || (ras && ras->supported))
+			if ((hive || (ras && ras->supported)) &&
+			    (amdgpu_ras_enable != 2 ||
+			    adev->pm.fw_version <= 0x283400))
 				baco_reset = false;
 		}
 		break;
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* RE: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case
@ 2019-11-27 11:38         ` Zhang, Hawking
  0 siblings, 0 replies; 57+ messages in thread
From: Zhang, Hawking @ 2019-11-27 11:38 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Ma, Le, Zhou1, Tao, Li, Dennis, Chen, Guchun

[AMD Public Use]

And It is still necessary to put all the condition check in a function. I mean a function that decide to go ras recovery or legacy fatal_error handling. The PMFW version that support RAS recovery will be different among ASICs. Current version check only works for VG20. In fact, once ras->supported is set and proper PMFW is detected, RAS recovery will be the best choice no matter it is sGPU or mGPU.

Regards,
Hawking

-----Original Message-----
From: Le Ma <le.ma@amd.com> 
Sent: 2019年11月27日 17:15
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case

Avoid to change default reset behavior for production card by checking amdgpu_ras_enable equal to 2. And only new enough smu ucode can support baco for xgmi/ras case.

Change-Id: I07c3e6862be03e068745c73db8ea71f428ecba6b
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 951327f..6202333 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -577,7 +577,9 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
 			struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
 			struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-			if (hive || (ras && ras->supported))
+			if ((hive || (ras && ras->supported)) &&
+			    (amdgpu_ras_enable != 2 ||
+			    adev->pm.fw_version <= 0x283400))
 				baco_reset = false;
 		}
 		break;
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* RE: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case
@ 2019-11-27 11:38         ` Zhang, Hawking
  0 siblings, 0 replies; 57+ messages in thread
From: Zhang, Hawking @ 2019-11-27 11:38 UTC (permalink / raw)
  To: Ma, Le, amd-gfx
  Cc: Deucher, Alexander, Ma, Le, Zhou1, Tao, Li, Dennis, Chen, Guchun

[AMD Public Use]

And It is still necessary to put all the condition check in a function. I mean a function that decide to go ras recovery or legacy fatal_error handling. The PMFW version that support RAS recovery will be different among ASICs. Current version check only works for VG20. In fact, once ras->supported is set and proper PMFW is detected, RAS recovery will be the best choice no matter it is sGPU or mGPU.

Regards,
Hawking

-----Original Message-----
From: Le Ma <le.ma@amd.com> 
Sent: 2019年11月27日 17:15
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case

Avoid to change default reset behavior for production card by checking amdgpu_ras_enable equal to 2. And only new enough smu ucode can support baco for xgmi/ras case.

Change-Id: I07c3e6862be03e068745c73db8ea71f428ecba6b
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 951327f..6202333 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -577,7 +577,9 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
 			struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
 			struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-			if (hive || (ras && ras->supported))
+			if ((hive || (ras && ras->supported)) &&
+			    (amdgpu_ras_enable != 2 ||
+			    adev->pm.fw_version <= 0x283400))
 				baco_reset = false;
 		}
 		break;
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* RE: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper
@ 2019-11-27 12:04         ` Zhang, Hawking
  0 siblings, 0 replies; 57+ messages in thread
From: Zhang, Hawking @ 2019-11-27 12:04 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Ma, Le, Zhou1, Tao, Li, Dennis, Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 4125 bytes --]

[AMD Official Use Only - Internal Distribution Only]


Please check my comments inline



Regards,
Hawking



-----Original Message-----
From: Le Ma <le.ma@amd.com>
Sent: 2019年11月27日 17:15
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper



This operation is needed when baco entry/exit for ras recovery



Change-Id: I535c7231693f3138a8e3d5acd55672e2ac68232f

Signed-off-by: Le Ma <le.ma@amd.com<mailto:le.ma@amd.com>>

---

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++-------

1 file changed, 12 insertions(+), 7 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index b1408c5..bd387bb 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

@@ -4308,10 +4308,14 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)  int amdgpu_device_baco_enter(struct drm_device *dev)  {

               struct amdgpu_device *adev = dev->dev_private;

+             struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);



                if (!amdgpu_device_supports_baco(adev->ddev))

                               return -ENOTSUPP;



+             if (ras && ras->supported)

+                             adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

+

               if (is_support_sw_smu(adev)) {

                               struct smu_context *smu = &adev->smu;

                               int ret;

@@ -4319,8 +4323,6 @@ int amdgpu_device_baco_enter(struct drm_device *dev)

                               ret = smu_baco_enter(smu);

                               if (ret)

                                               return ret;

-

-                              return 0;

               } else {

                               void *pp_handle = adev->powerplay.pp_handle;

                               const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; @@ -4331,14 +4333,15 @@ int amdgpu_device_baco_enter(struct drm_device *dev)

                               /* enter BACO state */

                               if (pp_funcs->set_asic_baco_state(pp_handle, 1))

                                               return -EIO;

-

-                              return 0;

               }

+

+             return 0;

}



 int amdgpu_device_baco_exit(struct drm_device *dev)  {

               struct amdgpu_device *adev = dev->dev_private;

+             struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);



                if (!amdgpu_device_supports_baco(adev->ddev))

                               return -ENOTSUPP;

@@ -4351,7 +4354,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev)

                               if (ret)

                                               return ret;



-                              return 0;

               } else {

                               void *pp_handle = adev->powerplay.pp_handle;

                               const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; @@ -4362,7 +4364,10 @@ int amdgpu_device_baco_exit(struct drm_device *dev)

                               /* exit BACO state */

                               if (pp_funcs->set_asic_baco_state(pp_handle, 0))

                                               return -EIO;

-

-                              return 0;

               }

+

+             if (ras && ras->supported)

+                             adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

+





[Hawking] Shouldn't be enabled doorbell interrupt after exit baco? Or do I miss something?



+             return 0;

}

--

2.7.4



[-- Attachment #1.2: Type: text/html, Size: 16122 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper
@ 2019-11-27 12:04         ` Zhang, Hawking
  0 siblings, 0 replies; 57+ messages in thread
From: Zhang, Hawking @ 2019-11-27 12:04 UTC (permalink / raw)
  To: Ma, Le, amd-gfx
  Cc: Deucher, Alexander, Ma, Le, Zhou1, Tao, Li, Dennis, Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 4125 bytes --]

[AMD Official Use Only - Internal Distribution Only]


Please check my comments inline



Regards,
Hawking



-----Original Message-----
From: Le Ma <le.ma@amd.com>
Sent: 2019年11月27日 17:15
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper



This operation is needed when baco entry/exit for ras recovery



Change-Id: I535c7231693f3138a8e3d5acd55672e2ac68232f

Signed-off-by: Le Ma <le.ma@amd.com<mailto:le.ma@amd.com>>

---

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++-------

1 file changed, 12 insertions(+), 7 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index b1408c5..bd387bb 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

@@ -4308,10 +4308,14 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)  int amdgpu_device_baco_enter(struct drm_device *dev)  {

               struct amdgpu_device *adev = dev->dev_private;

+             struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);



                if (!amdgpu_device_supports_baco(adev->ddev))

                               return -ENOTSUPP;



+             if (ras && ras->supported)

+                             adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

+

               if (is_support_sw_smu(adev)) {

                               struct smu_context *smu = &adev->smu;

                               int ret;

@@ -4319,8 +4323,6 @@ int amdgpu_device_baco_enter(struct drm_device *dev)

                               ret = smu_baco_enter(smu);

                               if (ret)

                                               return ret;

-

-                              return 0;

               } else {

                               void *pp_handle = adev->powerplay.pp_handle;

                               const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; @@ -4331,14 +4333,15 @@ int amdgpu_device_baco_enter(struct drm_device *dev)

                               /* enter BACO state */

                               if (pp_funcs->set_asic_baco_state(pp_handle, 1))

                                               return -EIO;

-

-                              return 0;

               }

+

+             return 0;

}



 int amdgpu_device_baco_exit(struct drm_device *dev)  {

               struct amdgpu_device *adev = dev->dev_private;

+             struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);



                if (!amdgpu_device_supports_baco(adev->ddev))

                               return -ENOTSUPP;

@@ -4351,7 +4354,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev)

                               if (ret)

                                               return ret;



-                              return 0;

               } else {

                               void *pp_handle = adev->powerplay.pp_handle;

                               const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; @@ -4362,7 +4364,10 @@ int amdgpu_device_baco_exit(struct drm_device *dev)

                               /* exit BACO state */

                               if (pp_funcs->set_asic_baco_state(pp_handle, 0))

                                               return -EIO;

-

-                              return 0;

               }

+

+             if (ras && ras->supported)

+                             adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

+





[Hawking] Shouldn't be enabled doorbell interrupt after exit baco? Or do I miss something?



+             return 0;

}

--

2.7.4



[-- Attachment #1.2: Type: text/html, Size: 15925 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper
@ 2019-11-27 12:14             ` Ma, Le
  0 siblings, 0 replies; 57+ messages in thread
From: Ma, Le @ 2019-11-27 12:14 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Zhou1, Tao, Li, Dennis, Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 5357 bytes --]



From: Zhang, Hawking <Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
Sent: Wednesday, November 27, 2019 8:04 PM
To: Ma, Le <Le.Ma-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: Chen, Guchun <Guchun.Chen-5C7GfCeVMHo@public.gmane.org>; Zhou1, Tao <Tao.Zhou1-5C7GfCeVMHo@public.gmane.org>; Li, Dennis <Dennis.Li-5C7GfCeVMHo@public.gmane.org>; Deucher, Alexander <Alexander.Deucher-5C7GfCeVMHo@public.gmane.org>; Ma, Le <Le.Ma-5C7GfCeVMHo@public.gmane.org>
Subject: RE: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper


Please check my comments inline



Regards,
Hawking



-----Original Message-----
From: Le Ma <le.ma-5C7GfCeVMHo@public.gmane.org<mailto:le.ma-5C7GfCeVMHo@public.gmane.org>>
Sent: 2019年11月27日 17:15
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Cc: Zhang, Hawking <Hawking.Zhang-5C7GfCeVMHo@public.gmane.org<mailto:Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>>; Chen, Guchun <Guchun.Chen-5C7GfCeVMHo@public.gmane.org<mailto:Guchun.Chen-5C7GfCeVMHo@public.gmane.org>>; Zhou1, Tao <Tao.Zhou1-5C7GfCeVMHo@public.gmane.org<mailto:Tao.Zhou1-5C7GfCeVMHo@public.gmane.org>>; Li, Dennis <Dennis.Li-5C7GfCeVMHo@public.gmane.org<mailto:Dennis.Li-5C7GfCeVMHo@public.gmane.org>>; Deucher, Alexander <Alexander.Deucher-5C7GfCeVMHo@public.gmane.org<mailto:Alexander.Deucher-5C7GfCeVMHo@public.gmane.org>>; Ma, Le <Le.Ma-5C7GfCeVMHo@public.gmane.org<mailto:Le.Ma@amd.com>>
Subject: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper



This operation is needed when baco entry/exit for ras recovery



Change-Id: I535c7231693f3138a8e3d5acd55672e2ac68232f

Signed-off-by: Le Ma <le.ma-5C7GfCeVMHo@public.gmane.org<mailto:le.ma-5C7GfCeVMHo@public.gmane.org>>

---

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++-------

1 file changed, 12 insertions(+), 7 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index b1408c5..bd387bb 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

@@ -4308,10 +4308,14 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)  int amdgpu_device_baco_enter(struct drm_device *dev)  {

               struct amdgpu_device *adev = dev->dev_private;

+             struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);



                if (!amdgpu_device_supports_baco(adev->ddev))

                               return -ENOTSUPP;



+             if (ras && ras->supported)

+                             adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

+

               if (is_support_sw_smu(adev)) {

                               struct smu_context *smu = &adev->smu;

                               int ret;

@@ -4319,8 +4323,6 @@ int amdgpu_device_baco_enter(struct drm_device *dev)

                               ret = smu_baco_enter(smu);

                               if (ret)

                                               return ret;

-

-                              return 0;

               } else {

                               void *pp_handle = adev->powerplay.pp_handle;

                               const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; @@ -4331,14 +4333,15 @@ int amdgpu_device_baco_enter(struct drm_device *dev)

                               /* enter BACO state */

                               if (pp_funcs->set_asic_baco_state(pp_handle, 1))

                                               return -EIO;

-

-                              return 0;

               }

+

+             return 0;

}



 int amdgpu_device_baco_exit(struct drm_device *dev)  {

               struct amdgpu_device *adev = dev->dev_private;

+             struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);



                if (!amdgpu_device_supports_baco(adev->ddev))

                               return -ENOTSUPP;

@@ -4351,7 +4354,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev)

                               if (ret)

                                               return ret;



-                              return 0;

               } else {

                               void *pp_handle = adev->powerplay.pp_handle;

                               const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; @@ -4362,7 +4364,10 @@ int amdgpu_device_baco_exit(struct drm_device *dev)

                               /* exit BACO state */

                               if (pp_funcs->set_asic_baco_state(pp_handle, 0))

                                               return -EIO;

-

-                              return 0;

               }

+

+             if (ras && ras->supported)

+                             adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

+





[Hawking] Shouldn't be enabled doorbell interrupt after exit baco? Or do I miss something?



[Le]: Yes, the argument should be true. I made a typo here.



+             return 0;

}

--

2.7.4



[-- Attachment #1.2: Type: text/html, Size: 17657 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper
@ 2019-11-27 12:14             ` Ma, Le
  0 siblings, 0 replies; 57+ messages in thread
From: Ma, Le @ 2019-11-27 12:14 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx
  Cc: Deucher, Alexander, Zhou1, Tao, Li, Dennis, Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 4811 bytes --]



From: Zhang, Hawking <Hawking.Zhang@amd.com>
Sent: Wednesday, November 27, 2019 8:04 PM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: RE: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper


Please check my comments inline



Regards,
Hawking



-----Original Message-----
From: Le Ma <le.ma@amd.com<mailto:le.ma@amd.com>>
Sent: 2019年11月27日 17:15
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com<mailto:Hawking.Zhang@amd.com>>; Chen, Guchun <Guchun.Chen@amd.com<mailto:Guchun.Chen@amd.com>>; Zhou1, Tao <Tao.Zhou1@amd.com<mailto:Tao.Zhou1@amd.com>>; Li, Dennis <Dennis.Li@amd.com<mailto:Dennis.Li@amd.com>>; Deucher, Alexander <Alexander.Deucher@amd.com<mailto:Alexander.Deucher@amd.com>>; Ma, Le <Le.Ma@amd.com<mailto:Le.Ma@amd.com>>
Subject: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper



This operation is needed when baco entry/exit for ras recovery



Change-Id: I535c7231693f3138a8e3d5acd55672e2ac68232f

Signed-off-by: Le Ma <le.ma@amd.com<mailto:le.ma@amd.com>>

---

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++-------

1 file changed, 12 insertions(+), 7 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index b1408c5..bd387bb 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

@@ -4308,10 +4308,14 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)  int amdgpu_device_baco_enter(struct drm_device *dev)  {

               struct amdgpu_device *adev = dev->dev_private;

+             struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);



                if (!amdgpu_device_supports_baco(adev->ddev))

                               return -ENOTSUPP;



+             if (ras && ras->supported)

+                             adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

+

               if (is_support_sw_smu(adev)) {

                               struct smu_context *smu = &adev->smu;

                               int ret;

@@ -4319,8 +4323,6 @@ int amdgpu_device_baco_enter(struct drm_device *dev)

                               ret = smu_baco_enter(smu);

                               if (ret)

                                               return ret;

-

-                              return 0;

               } else {

                               void *pp_handle = adev->powerplay.pp_handle;

                               const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; @@ -4331,14 +4333,15 @@ int amdgpu_device_baco_enter(struct drm_device *dev)

                               /* enter BACO state */

                               if (pp_funcs->set_asic_baco_state(pp_handle, 1))

                                               return -EIO;

-

-                              return 0;

               }

+

+             return 0;

}



 int amdgpu_device_baco_exit(struct drm_device *dev)  {

               struct amdgpu_device *adev = dev->dev_private;

+             struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);



                if (!amdgpu_device_supports_baco(adev->ddev))

                               return -ENOTSUPP;

@@ -4351,7 +4354,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev)

                               if (ret)

                                               return ret;



-                              return 0;

               } else {

                               void *pp_handle = adev->powerplay.pp_handle;

                               const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; @@ -4362,7 +4364,10 @@ int amdgpu_device_baco_exit(struct drm_device *dev)

                               /* exit BACO state */

                               if (pp_funcs->set_asic_baco_state(pp_handle, 0))

                                               return -EIO;

-

-                              return 0;

               }

+

+             if (ras && ras->supported)

+                             adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

+





[Hawking] Shouldn't be enabled doorbell interrupt after exit baco? Or do I miss something?



[Le]: Yes, the argument should be true. I made a typo here.



+             return 0;

}

--

2.7.4



[-- Attachment #1.2: Type: text/html, Size: 17154 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case
@ 2019-11-27 12:35             ` Ma, Le
  0 siblings, 0 replies; 57+ messages in thread
From: Ma, Le @ 2019-11-27 12:35 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Zhou1, Tao, Li, Dennis, Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 4511 bytes --]

Agree with your thoughts that we drop amdgpu_ras_enable=2 condition. The only concern in my side is that besides fatal_error, another result may happen that atombios_init timeout on xgmi by baco (not sure psp mode1 reset causes this as well).



Assuming no amdgpu_ras_enable=2 check, if PMFW > 40.52,  the use cases as my understanding includes:

  1.  sGPU without RAS:
     *   new: baco
     *   old: baco
  2.  sGPU with RAS:
     *   new: baco
     *   old: psp mode1 chain reset and legacy fatal_error handling
  3.  XGMI with RAS:
     *   new: baco
     *   old: psp mode1 chain reset and legacy fatal_error handling
  4.  XGMI without RAS:
     *   new: baco
     *   old: psp mode1 chain reset



That is to say, all uses cases go on baco road when PMFW > 40.52.



Regards,

Ma Le



-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
Sent: Wednesday, November 27, 2019 7:28 PM
To: Ma, Le <Le.Ma-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: Chen, Guchun <Guchun.Chen-5C7GfCeVMHo@public.gmane.org>; Zhou1, Tao <Tao.Zhou1-5C7GfCeVMHo@public.gmane.org>; Li, Dennis <Dennis.Li-5C7GfCeVMHo@public.gmane.org>; Deucher, Alexander <Alexander.Deucher-5C7GfCeVMHo@public.gmane.org>; Ma, Le <Le.Ma-5C7GfCeVMHo@public.gmane.org>
Subject: RE: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case



[AMD Public Use]



After thinking it a bit, I think we can just rely on PMFW version to decide to go RAS recovery or legacy fatal_error handling for the platforms that support RAS. Leveraging amdgpu_ras_enable as a temporary solution seems not necessary? Even baco ras recovery not stable, it is the same result as legacy fatal_error handling that user has to reboot the node manually.



So the new soc reset use cases are:

XGMI (without RAS): use PSP mode1 based chain reset, RAS enabled (with PMFW 40.52 and onwards): use BACO based RAS recovery, RAS enabled (with PMFW prior to 40.52): use legacy fatal_error handling.

Anything else?



Regards,

Hawking

-----Original Message-----

From: Le Ma <le.ma-5C7GfCeVMHo@public.gmane.org<mailto:le.ma-5C7GfCeVMHo@public.gmane.org>>

Sent: 2019年11月27日 17:15

To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>

Cc: Zhang, Hawking <Hawking.Zhang-5C7GfCeVMHo@public.gmane.org<mailto:Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>>; Chen, Guchun <Guchun.Chen-5C7GfCeVMHo@public.gmane.org<mailto:Guchun.Chen-5C7GfCeVMHo@public.gmane.org>>; Zhou1, Tao <Tao.Zhou1-5C7GfCeVMHo@public.gmane.org<mailto:Tao.Zhou1-5C7GfCeVMHo@public.gmane.org>>; Li, Dennis <Dennis.Li-5C7GfCeVMHo@public.gmane.org<mailto:Dennis.Li-5C7GfCeVMHo@public.gmane.org>>; Deucher, Alexander <Alexander.Deucher-5C7GfCeVMHo@public.gmane.org<mailto:Alexander.Deucher-5C7GfCeVMHo@public.gmane.org>>; Ma, Le <Le.Ma-5C7GfCeVMHo@public.gmane.org<mailto:Le.Ma@amd.com>>

Subject: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case



Avoid to change default reset behavior for production card by checking amdgpu_ras_enable equal to 2. And only new enough smu ucode can support baco for xgmi/ras case.



Change-Id: I07c3e6862be03e068745c73db8ea71f428ecba6b

Signed-off-by: Le Ma <le.ma-5C7GfCeVMHo@public.gmane.org<mailto:le.ma-5C7GfCeVMHo@public.gmane.org>>

---

drivers/gpu/drm/amd/amdgpu/soc15.c | 4 +++-

1 file changed, 3 insertions(+), 1 deletion(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c

index 951327f..6202333 100644

--- a/drivers/gpu/drm/amd/amdgpu/soc15.c

+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c

@@ -577,7 +577,9 @@ soc15_asic_reset_method(struct amdgpu_device *adev)

                                   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);

                                   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

-                                   if (hive || (ras && ras->supported))

+                                  if ((hive || (ras && ras->supported)) &&

+                                      (amdgpu_ras_enable != 2 ||

+                                      adev->pm.fw_version <= 0x283400))

                                               baco_reset = false;

                       }

                       break;

--

2.7.4

[-- Attachment #1.2: Type: text/html, Size: 16311 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case
@ 2019-11-27 12:35             ` Ma, Le
  0 siblings, 0 replies; 57+ messages in thread
From: Ma, Le @ 2019-11-27 12:35 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx
  Cc: Deucher, Alexander, Zhou1, Tao, Li, Dennis, Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 3965 bytes --]

Agree with your thoughts that we drop the amdgpu_ras_enable=2 condition. The only concern on my side is that besides fatal_error, another issue may occur where atombios_init times out on XGMI due to BACO (not sure whether PSP mode1 reset causes this as well).



Assuming no amdgpu_ras_enable=2 check, if PMFW > 40.52,  the use cases as my understanding includes:

  1.  sGPU without RAS:
     *   new: baco
     *   old: baco
  2.  sGPU with RAS:
     *   new: baco
     *   old: psp mode1 chain reset and legacy fatal_error handling

  3.  XGMI with RAS:
     *   new: baco
     *   old: psp mode1 chain reset and legacy fatal_error handling
  4.  XGMI without RAS:
     *   new: baco
     *   old: psp mode1 chain reset



That is to say, all uses cases go on baco road when PMFW > 40.52.



Regards,

Ma Le



-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang@amd.com>
Sent: Wednesday, November 27, 2019 7:28 PM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: RE: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case



[AMD Public Use]



After thinking it a bit, I think we can just rely on PMFW version to decide to go RAS recovery or legacy fatal_error handling for the platforms that support RAS. Leveraging amdgpu_ras_enable as a temporary solution seems not necessary? Even baco ras recovery not stable, it is the same result as legacy fatal_error handling that user has to reboot the node manually.



So the new soc reset use cases are:

XGMI (without RAS): use PSP mode1 based chain reset, RAS enabled (with PMFW 40.52 and onwards): use BACO based RAS recovery, RAS enabled (with PMFW prior to 40.52): use legacy fatal_error handling.

Anything else?



Regards,

Hawking

-----Original Message-----

From: Le Ma <le.ma@amd.com<mailto:le.ma@amd.com>>

Sent: 2019年11月27日 17:15

To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>

Cc: Zhang, Hawking <Hawking.Zhang@amd.com<mailto:Hawking.Zhang@amd.com>>; Chen, Guchun <Guchun.Chen@amd.com<mailto:Guchun.Chen@amd.com>>; Zhou1, Tao <Tao.Zhou1@amd.com<mailto:Tao.Zhou1@amd.com>>; Li, Dennis <Dennis.Li@amd.com<mailto:Dennis.Li@amd.com>>; Deucher, Alexander <Alexander.Deucher@amd.com<mailto:Alexander.Deucher@amd.com>>; Ma, Le <Le.Ma@amd.com<mailto:Le.Ma@amd.com>>

Subject: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case



Avoid to change default reset behavior for production card by checking amdgpu_ras_enable equal to 2. And only new enough smu ucode can support baco for xgmi/ras case.



Change-Id: I07c3e6862be03e068745c73db8ea71f428ecba6b

Signed-off-by: Le Ma <le.ma@amd.com<mailto:le.ma@amd.com>>

---

drivers/gpu/drm/amd/amdgpu/soc15.c | 4 +++-

1 file changed, 3 insertions(+), 1 deletion(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c

index 951327f..6202333 100644

--- a/drivers/gpu/drm/amd/amdgpu/soc15.c

+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c

@@ -577,7 +577,9 @@ soc15_asic_reset_method(struct amdgpu_device *adev)

                                   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);

                                   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

-                                   if (hive || (ras && ras->supported))

+                                  if ((hive || (ras && ras->supported)) &&

+                                      (amdgpu_ras_enable != 2 ||

+                                      adev->pm.fw_version <= 0x283400))

                                               baco_reset = false;

                       }

                       break;

--

2.7.4

[-- Attachment #1.2: Type: text/html, Size: 15781 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case
@ 2019-11-27 14:00             ` Ma, Le
  0 siblings, 0 replies; 57+ messages in thread
From: Ma, Le @ 2019-11-27 14:00 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Zhou1, Tao, Li, Dennis, Chen, Guchun

Hi Hawking,

Please check this v2 patch which is just sent out. And as discussed, we decide to still leverage the current reset_method() function with functionality/change scale/code maintainability balanced. Thanks.

Regards,
Ma Le

-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang@amd.com> 
Sent: Wednesday, November 27, 2019 7:39 PM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: RE: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case

[AMD Public Use]

And It is still necessary to put all the condition check in a function. I mean a function that decide to go ras recovery or legacy fatal_error handling. The PMFW version that support RAS recovery will be different among ASICs. Current version check only works for VG20. In fact, once ras->supported is set and proper PMFW is detected, RAS recovery will be the best choice no matter it is sGPU or mGPU.

Regards,
Hawking

-----Original Message-----
From: Le Ma <le.ma@amd.com> 
Sent: 2019年11月27日 17:15
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case

Avoid to change default reset behavior for production card by checking amdgpu_ras_enable equal to 2. And only new enough smu ucode can support baco for xgmi/ras case.

Change-Id: I07c3e6862be03e068745c73db8ea71f428ecba6b
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 951327f..6202333 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -577,7 +577,9 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
 			struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
 			struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-			if (hive || (ras && ras->supported))
+			if ((hive || (ras && ras->supported)) &&
+			    (amdgpu_ras_enable != 2 ||
+			    adev->pm.fw_version <= 0x283400))
 				baco_reset = false;
 		}
 		break;
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* RE: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case
@ 2019-11-27 14:00             ` Ma, Le
  0 siblings, 0 replies; 57+ messages in thread
From: Ma, Le @ 2019-11-27 14:00 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx
  Cc: Deucher, Alexander, Zhou1, Tao, Li, Dennis, Chen, Guchun

Hi Hawking,

Please check this v2 patch which is just sent out. And as discussed, we decide to still leverage the current reset_method() function with functionality/change scale/code maintainability balanced. Thanks.

Regards,
Ma Le

-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang@amd.com> 
Sent: Wednesday, November 27, 2019 7:39 PM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: RE: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case

[AMD Public Use]

And It is still necessary to put all the condition check in a function. I mean a function that decide to go ras recovery or legacy fatal_error handling. The PMFW version that support RAS recovery will be different among ASICs. Current version check only works for VG20. In fact, once ras->supported is set and proper PMFW is detected, RAS recovery will be the best choice no matter it is sGPU or mGPU.

Regards,
Hawking

-----Original Message-----
From: Le Ma <le.ma@amd.com> 
Sent: 2019年11月27日 17:15
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case

Avoid to change default reset behavior for production card by checking amdgpu_ras_enable equal to 2. And only new enough smu ucode can support baco for xgmi/ras case.

Change-Id: I07c3e6862be03e068745c73db8ea71f428ecba6b
Signed-off-by: Le Ma <le.ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 951327f..6202333 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -577,7 +577,9 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
 			struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
 			struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-			if (hive || (ras && ras->supported))
+			if ((hive || (ras && ras->supported)) &&
+			    (amdgpu_ras_enable != 2 ||
+			    adev->pm.fw_version <= 0x283400))
 				baco_reset = false;
 		}
 		break;
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
@ 2019-11-27 15:46         ` Andrey Grodzovsky
  0 siblings, 0 replies; 57+ messages in thread
From: Andrey Grodzovsky @ 2019-11-27 15:46 UTC (permalink / raw)
  To: Le Ma, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: alexander.deucher-5C7GfCeVMHo, tao.zhou1-5C7GfCeVMHo,
	dennis.li-5C7GfCeVMHo, guchun.chen-5C7GfCeVMHo,
	hawking.zhang-5C7GfCeVMHo


On 11/27/19 4:15 AM, Le Ma wrote:
> Currently each XGMI node reset wq does not run in parrallel because same work
> item bound to same cpu runs in sequence. So change to bound the xgmi_reset_work
> item to different cpus.

It's not the same work item, see more below


>
> XGMI requires all nodes enter into baco within very close proximity before
> any node exit baco. So schedule the xgmi_reset_work wq twice for enter/exit
> baco respectively.
>
> The default reset code path and methods do not change for vega20 production:
>    - baco reset without xgmi/ras
>    - psp reset with xgmi/ras
>
> To enable baco for XGMI/RAS case, both 2 conditions below are needed:
>    - amdgpu_ras_enable=2
>    - baco-supported smu firmware
>
> The case that PSP reset and baco reset coexist within an XGMI hive is not in
> the consideration.
>
> Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532
> Signed-off-by: Le Ma <le.ma@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ++++++++++++++++++++++++++----
>   2 files changed, 70 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d120fe5..08929e6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -998,6 +998,8 @@ struct amdgpu_device {
>   	int				pstate;
>   	/* enable runtime pm on the device */
>   	bool                            runpm;
> +
> +	bool				in_baco;
>   };
>   
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index bd387bb..71abfe9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2654,7 +2654,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>   	struct amdgpu_device *adev =
>   		container_of(__work, struct amdgpu_device, xgmi_reset_work);
>   
> -	adev->asic_reset_res =  amdgpu_asic_reset(adev);
> +	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
> +		adev->asic_reset_res = (adev->in_baco == false) ?
> +				amdgpu_device_baco_enter(adev->ddev) :
> +				amdgpu_device_baco_exit(adev->ddev);
> +	else
> +		adev->asic_reset_res = amdgpu_asic_reset(adev);
> +
>   	if (adev->asic_reset_res)
>   		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>   			 adev->asic_reset_res, adev->ddev->unique);
> @@ -3796,6 +3802,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   	struct amdgpu_device *tmp_adev = NULL;
>   	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
>   	int r = 0;
> +	int cpu = smp_processor_id();
>   
>   	/*
>   	 * ASIC reset has to be done on all HGMI hive nodes ASAP
> @@ -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   	 */
>   	if (need_full_reset) {
>   		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> -			/* For XGMI run all resets in parallel to speed up the process */
> +			/*
> +			 * For XGMI run all resets in parallel to speed up the
> +			 * process by scheduling the highpri wq on different
> +			 * cpus. For XGMI with baco reset, all nodes must enter
> +			 * baco within close proximity before anyone exit.
> +			 */
>   			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
> -				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))


Note that tmp_adev->xgmi_reset_work (the work item) is per device in 
XGMI hive and not the same work item. So I don't see why you need to 
explicitly queue them on different CPUs, they should run in parallel 
already.

Andrey



> +				if (!queue_work_on(cpu, system_highpri_wq,
> +						   &tmp_adev->xgmi_reset_work))
>   					r = -EALREADY;
> +				cpu = cpumask_next(cpu, cpu_online_mask);
>   			} else
>   				r = amdgpu_asic_reset(tmp_adev);
> -
> -			if (r) {
> -				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
> -					 r, tmp_adev->ddev->unique);
> +			if (r)
>   				break;
> -			}
>   		}
>   
> -		/* For XGMI wait for all PSP resets to complete before proceed */
> +		/* For XGMI wait for all work to complete before proceed */
>   		if (!r) {
>   			list_for_each_entry(tmp_adev, device_list_handle,
>   					    gmc.xgmi.head) {
> @@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   					r = tmp_adev->asic_reset_res;
>   					if (r)
>   						break;
> +					if(AMD_RESET_METHOD_BACO ==
> +					   amdgpu_asic_reset_method(tmp_adev))
> +						tmp_adev->in_baco = true;
>   				}
>   			}
>   		}
> -	}
>   
> +		/*
> +		 * For XGMI with baco reset, need exit baco phase by scheduling
> +		 * xgmi_reset_work one more time. PSP reset skips this phase.
> +		 * Not assume the situation that PSP reset and baco reset
> +		 * coexist within an XGMI hive.
> +		 */
> +
> +		if (!r) {
> +			cpu = smp_processor_id();
> +			list_for_each_entry(tmp_adev, device_list_handle,
> +					    gmc.xgmi.head) {
> +				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
> +				    && AMD_RESET_METHOD_BACO ==
> +				    amdgpu_asic_reset_method(tmp_adev)) {
> +					if (!queue_work_on(cpu,
> +						system_highpri_wq,
> +						&tmp_adev->xgmi_reset_work))
> +						r = -EALREADY;
> +					if (r)
> +						break;
> +					cpu = cpumask_next(cpu, cpu_online_mask);
> +				}
> +			}
> +		}
> +
> +		if (!r) {
> +			list_for_each_entry(tmp_adev, device_list_handle,
> +					    gmc.xgmi.head) {
> +				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
> +				    && AMD_RESET_METHOD_BACO ==
> +				    amdgpu_asic_reset_method(tmp_adev)) {
> +					flush_work(&tmp_adev->xgmi_reset_work);
> +					r = tmp_adev->asic_reset_res;
> +					if (r)
> +						break;
> +					tmp_adev->in_baco = false;
> +				}
> +			}
> +		}
> +
> +		if (r) {
> +			DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
> +				 r, tmp_adev->ddev->unique);
> +			goto end;
> +		}
> +	}
>   
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>   		if (need_full_reset) {
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
@ 2019-11-27 15:46         ` Andrey Grodzovsky
  0 siblings, 0 replies; 57+ messages in thread
From: Andrey Grodzovsky @ 2019-11-27 15:46 UTC (permalink / raw)
  To: Le Ma, amd-gfx
  Cc: alexander.deucher, tao.zhou1, dennis.li, guchun.chen, hawking.zhang


On 11/27/19 4:15 AM, Le Ma wrote:
> Currently each XGMI node reset wq does not run in parrallel because same work
> item bound to same cpu runs in sequence. So change to bound the xgmi_reset_work
> item to different cpus.

It's not the same work item, see more below


>
> XGMI requires all nodes enter into baco within very close proximity before
> any node exit baco. So schedule the xgmi_reset_work wq twice for enter/exit
> baco respectively.
>
> The default reset code path and methods do not change for vega20 production:
>    - baco reset without xgmi/ras
>    - psp reset with xgmi/ras
>
> To enable baco for XGMI/RAS case, both 2 conditions below are needed:
>    - amdgpu_ras_enable=2
>    - baco-supported smu firmware
>
> The case that PSP reset and baco reset coexist within an XGMI hive is not in
> the consideration.
>
> Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532
> Signed-off-by: Le Ma <le.ma@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ++++++++++++++++++++++++++----
>   2 files changed, 70 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d120fe5..08929e6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -998,6 +998,8 @@ struct amdgpu_device {
>   	int				pstate;
>   	/* enable runtime pm on the device */
>   	bool                            runpm;
> +
> +	bool				in_baco;
>   };
>   
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index bd387bb..71abfe9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2654,7 +2654,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>   	struct amdgpu_device *adev =
>   		container_of(__work, struct amdgpu_device, xgmi_reset_work);
>   
> -	adev->asic_reset_res =  amdgpu_asic_reset(adev);
> +	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
> +		adev->asic_reset_res = (adev->in_baco == false) ?
> +				amdgpu_device_baco_enter(adev->ddev) :
> +				amdgpu_device_baco_exit(adev->ddev);
> +	else
> +		adev->asic_reset_res = amdgpu_asic_reset(adev);
> +
>   	if (adev->asic_reset_res)
>   		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>   			 adev->asic_reset_res, adev->ddev->unique);
> @@ -3796,6 +3802,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   	struct amdgpu_device *tmp_adev = NULL;
>   	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
>   	int r = 0;
> +	int cpu = smp_processor_id();
>   
>   	/*
>   	 * ASIC reset has to be done on all HGMI hive nodes ASAP
> @@ -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   	 */
>   	if (need_full_reset) {
>   		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> -			/* For XGMI run all resets in parallel to speed up the process */
> +			/*
> +			 * For XGMI run all resets in parallel to speed up the
> +			 * process by scheduling the highpri wq on different
> +			 * cpus. For XGMI with baco reset, all nodes must enter
> +			 * baco within close proximity before anyone exit.
> +			 */
>   			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
> -				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))


Note that tmp_adev->xgmi_reset_work (the work item) is per device in 
XGMI hive and not the same work item. So I don't see why you need to 
explicitly queue them on different CPUs, they should run in parallel 
already.

Andrey



> +				if (!queue_work_on(cpu, system_highpri_wq,
> +						   &tmp_adev->xgmi_reset_work))
>   					r = -EALREADY;
> +				cpu = cpumask_next(cpu, cpu_online_mask);
>   			} else
>   				r = amdgpu_asic_reset(tmp_adev);
> -
> -			if (r) {
> -				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
> -					 r, tmp_adev->ddev->unique);
> +			if (r)
>   				break;
> -			}
>   		}
>   
> -		/* For XGMI wait for all PSP resets to complete before proceed */
> +		/* For XGMI wait for all work to complete before proceed */
>   		if (!r) {
>   			list_for_each_entry(tmp_adev, device_list_handle,
>   					    gmc.xgmi.head) {
> @@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   					r = tmp_adev->asic_reset_res;
>   					if (r)
>   						break;
> +					if(AMD_RESET_METHOD_BACO ==
> +					   amdgpu_asic_reset_method(tmp_adev))
> +						tmp_adev->in_baco = true;
>   				}
>   			}
>   		}
> -	}
>   
> +		/*
> +		 * For XGMI with baco reset, need exit baco phase by scheduling
> +		 * xgmi_reset_work one more time. PSP reset skips this phase.
> +		 * Not assume the situation that PSP reset and baco reset
> +		 * coexist within an XGMI hive.
> +		 */
> +
> +		if (!r) {
> +			cpu = smp_processor_id();
> +			list_for_each_entry(tmp_adev, device_list_handle,
> +					    gmc.xgmi.head) {
> +				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
> +				    && AMD_RESET_METHOD_BACO ==
> +				    amdgpu_asic_reset_method(tmp_adev)) {
> +					if (!queue_work_on(cpu,
> +						system_highpri_wq,
> +						&tmp_adev->xgmi_reset_work))
> +						r = -EALREADY;
> +					if (r)
> +						break;
> +					cpu = cpumask_next(cpu, cpu_online_mask);
> +				}
> +			}
> +		}
> +
> +		if (!r) {
> +			list_for_each_entry(tmp_adev, device_list_handle,
> +					    gmc.xgmi.head) {
> +				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
> +				    && AMD_RESET_METHOD_BACO ==
> +				    amdgpu_asic_reset_method(tmp_adev)) {
> +					flush_work(&tmp_adev->xgmi_reset_work);
> +					r = tmp_adev->asic_reset_res;
> +					if (r)
> +						break;
> +					tmp_adev->in_baco = false;
> +				}
> +			}
> +		}
> +
> +		if (r) {
> +			DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
> +				 r, tmp_adev->ddev->unique);
> +			goto end;
> +		}
> +	}
>   
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>   		if (need_full_reset) {
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 01/10] drm/amdgpu: remove ras global recovery handling from ras_controller_int handler
@ 2019-11-28  5:27     ` Zhang, Hawking
  0 siblings, 0 replies; 57+ messages in thread
From: Zhang, Hawking @ 2019-11-28  5:27 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Ma, Le, Zhou1, Tao, Li, Dennis, Chen, Guchun

[AMD Official Use Only - Internal Distribution Only]

With the v2 version for patch #6, #7 and the fix to enable doorbell int after BACO exit in Patch #5,

The series is 

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>

Regards,
Hawking
-----Original Message-----
From: Le Ma <le.ma@amd.com> 
Sent: 2019年11月27日 17:15
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: [PATCH 01/10] drm/amdgpu: remove ras global recovery handling from ras_controller_int handler

From: Le Ma <Le.Ma@amd.com>

v2: add notification when ras controller interrupt generates

Change-Id: Ic03e42e9d1c4dab1fa7f4817c191a16e485b48a9
Signed-off-by: Le Ma <Le.Ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 0db458f..25231d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -324,7 +324,12 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
 						RAS_CNTLR_INTERRUPT_CLEAR, 1);
 		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
 
-		amdgpu_ras_global_ras_isr(adev);
+		DRM_WARN("RAS controller interrupt triggered by NBIF error\n");
+
+		/* ras_controller_int is dedicated for nbif ras error,
+		 * not the global interrupt for sync flood
+		 */
+		amdgpu_ras_reset_gpu(adev, true);
 	}
 }
 
-- 
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* RE: [PATCH 01/10] drm/amdgpu: remove ras global recovery handling from ras_controller_int handler
@ 2019-11-28  5:27     ` Zhang, Hawking
  0 siblings, 0 replies; 57+ messages in thread
From: Zhang, Hawking @ 2019-11-28  5:27 UTC (permalink / raw)
  To: Ma, Le, amd-gfx
  Cc: Deucher, Alexander, Ma, Le, Zhou1, Tao, Li, Dennis, Chen, Guchun

[AMD Official Use Only - Internal Distribution Only]

With the v2 version for patch #6, #7 and the fix to enable doorbell int after BACO exit in Patch #5,

The series is 

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>

Regards,
Hawking
-----Original Message-----
From: Le Ma <le.ma@amd.com> 
Sent: 2019年11月27日 17:15
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
Subject: [PATCH 01/10] drm/amdgpu: remove ras global recovery handling from ras_controller_int handler

From: Le Ma <Le.Ma@amd.com>

v2: add notification when ras controller interrupt generates

Change-Id: Ic03e42e9d1c4dab1fa7f4817c191a16e485b48a9
Signed-off-by: Le Ma <Le.Ma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 0db458f..25231d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -324,7 +324,12 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
 						RAS_CNTLR_INTERRUPT_CLEAR, 1);
 		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
 
-		amdgpu_ras_global_ras_isr(adev);
+		DRM_WARN("RAS controller interrupt triggered by NBIF error\n");
+
+		/* ras_controller_int is dedicated for nbif ras error,
+		 * not the global interrupt for sync flood
+		 */
+		amdgpu_ras_reset_gpu(adev, true);
 	}
 }
 
-- 
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* RE: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper
@ 2019-11-28  6:50         ` Zhou1, Tao
  0 siblings, 0 replies; 57+ messages in thread
From: Zhou1, Tao @ 2019-11-28  6:50 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Ma, Le, Li, Dennis, Chen, Guchun, Zhang, Hawking



> -----Original Message-----
> From: Le Ma <le.ma@amd.com>
> Sent: 2019年11月27日 17:15
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun
> <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis
> <Dennis.Li@amd.com>; Deucher, Alexander
> <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
> Subject: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in
> baco entry/exit helper
> 
> This operation is needed when baco entry/exit for ras recovery
> 
> Change-Id: I535c7231693f3138a8e3d5acd55672e2ac68232f
> Signed-off-by: Le Ma <le.ma@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++-------
>  1 file changed, 12 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b1408c5..bd387bb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4308,10 +4308,14 @@ static void amdgpu_device_get_pcie_info(struct
> amdgpu_device *adev)  int amdgpu_device_baco_enter(struct drm_device
> *dev)  {
>  	struct amdgpu_device *adev = dev->dev_private;
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> 
>  	if (!amdgpu_device_supports_baco(adev->ddev))
>  		return -ENOTSUPP;
> 
> +	if (ras && ras->supported)
> +		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
> +

[Tao] The following code is observed several times, I think we can add a new interface to replace it, I'll do that when I have time.

struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
if (ras && ras->supported)

>  	if (is_support_sw_smu(adev)) {
>  		struct smu_context *smu = &adev->smu;
>  		int ret;
> @@ -4319,8 +4323,6 @@ int amdgpu_device_baco_enter(struct drm_device
> *dev)
>  		ret = smu_baco_enter(smu);
>  		if (ret)
>  			return ret;
> -
> -		return 0;
>  	} else {
>  		void *pp_handle = adev->powerplay.pp_handle;
>  		const struct amd_pm_funcs *pp_funcs = adev-
> >powerplay.pp_funcs; @@ -4331,14 +4333,15 @@ int
> amdgpu_device_baco_enter(struct drm_device *dev)
>  		/* enter BACO state */
>  		if (pp_funcs->set_asic_baco_state(pp_handle, 1))
>  			return -EIO;
> -
> -		return 0;
>  	}
> +
> +	return 0;
>  }
> 
>  int amdgpu_device_baco_exit(struct drm_device *dev)  {
>  	struct amdgpu_device *adev = dev->dev_private;
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> 
>  	if (!amdgpu_device_supports_baco(adev->ddev))
>  		return -ENOTSUPP;
> @@ -4351,7 +4354,6 @@ int amdgpu_device_baco_exit(struct drm_device
> *dev)
>  		if (ret)
>  			return ret;
> 
> -		return 0;
>  	} else {
>  		void *pp_handle = adev->powerplay.pp_handle;
>  		const struct amd_pm_funcs *pp_funcs = adev-
> >powerplay.pp_funcs; @@ -4362,7 +4364,10 @@ int
> amdgpu_device_baco_exit(struct drm_device *dev)
>  		/* exit BACO state */
>  		if (pp_funcs->set_asic_baco_state(pp_handle, 0))
>  			return -EIO;
> -
> -		return 0;
>  	}
> +
> +	if (ras && ras->supported)
> +		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
> +
> +	return 0;
>  }
> --
> 2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper
@ 2019-11-28  6:50         ` Zhou1, Tao
  0 siblings, 0 replies; 57+ messages in thread
From: Zhou1, Tao @ 2019-11-28  6:50 UTC (permalink / raw)
  To: Ma, Le, amd-gfx
  Cc: Deucher, Alexander, Ma, Le, Li, Dennis, Chen, Guchun, Zhang, Hawking



> -----Original Message-----
> From: Le Ma <le.ma@amd.com>
> Sent: 2019年11月27日 17:15
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Guchun
> <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Dennis
> <Dennis.Li@amd.com>; Deucher, Alexander
> <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>
> Subject: [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in
> baco entry/exit helper
> 
> This operation is needed when baco entry/exit for ras recovery
> 
> Change-Id: I535c7231693f3138a8e3d5acd55672e2ac68232f
> Signed-off-by: Le Ma <le.ma@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++-------
>  1 file changed, 12 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b1408c5..bd387bb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4308,10 +4308,14 @@ static void amdgpu_device_get_pcie_info(struct
> amdgpu_device *adev)  int amdgpu_device_baco_enter(struct drm_device
> *dev)  {
>  	struct amdgpu_device *adev = dev->dev_private;
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> 
>  	if (!amdgpu_device_supports_baco(adev->ddev))
>  		return -ENOTSUPP;
> 
> +	if (ras && ras->supported)
> +		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
> +

[Tao] The following code is observed several times, I think we can add a new interface to replace it, I'll do that when I have time.

struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
if (ras && ras->supported)

>  	if (is_support_sw_smu(adev)) {
>  		struct smu_context *smu = &adev->smu;
>  		int ret;
> @@ -4319,8 +4323,6 @@ int amdgpu_device_baco_enter(struct drm_device
> *dev)
>  		ret = smu_baco_enter(smu);
>  		if (ret)
>  			return ret;
> -
> -		return 0;
>  	} else {
>  		void *pp_handle = adev->powerplay.pp_handle;
>  		const struct amd_pm_funcs *pp_funcs = adev-
> >powerplay.pp_funcs; @@ -4331,14 +4333,15 @@ int
> amdgpu_device_baco_enter(struct drm_device *dev)
>  		/* enter BACO state */
>  		if (pp_funcs->set_asic_baco_state(pp_handle, 1))
>  			return -EIO;
> -
> -		return 0;
>  	}
> +
> +	return 0;
>  }
> 
>  int amdgpu_device_baco_exit(struct drm_device *dev)  {
>  	struct amdgpu_device *adev = dev->dev_private;
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> 
>  	if (!amdgpu_device_supports_baco(adev->ddev))
>  		return -ENOTSUPP;
> @@ -4351,7 +4354,6 @@ int amdgpu_device_baco_exit(struct drm_device
> *dev)
>  		if (ret)
>  			return ret;
> 
> -		return 0;
>  	} else {
>  		void *pp_handle = adev->powerplay.pp_handle;
>  		const struct amd_pm_funcs *pp_funcs = adev-
> >powerplay.pp_funcs; @@ -4362,7 +4364,10 @@ int
> amdgpu_device_baco_exit(struct drm_device *dev)
>  		/* exit BACO state */
>  		if (pp_funcs->set_asic_baco_state(pp_handle, 0))
>  			return -EIO;
> -
> -		return 0;
>  	}
> +
> +	if (ras && ras->supported)
> +		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
> +
> +	return 0;
>  }
> --
> 2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
@ 2019-11-28  9:00             ` Ma, Le
  0 siblings, 0 replies; 57+ messages in thread
From: Ma, Le @ 2019-11-28  9:00 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Zhou1, Tao, Li, Dennis, Chen, Guchun, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 12530 bytes --]





-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Sent: Wednesday, November 27, 2019 11:46 PM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI





On 11/27/19 4:15 AM, Le Ma wrote:

> Currently each XGMI node reset wq does not run in parallel because

> same work item bound to same cpu runs in sequence. So change to bind

> the xgmi_reset_work item to different cpus.



It's not the same work item, see more below





>

> XGMI requires all nodes enter into baco within very close proximity

> before any node exit baco. So schedule the xgmi_reset_work wq twice

> for enter/exit baco respectively.

>

> The default reset code path and methods do not change for vega20 production:

>    - baco reset without xgmi/ras

>    - psp reset with xgmi/ras

>

> To enable baco for XGMI/RAS case, both 2 conditions below are needed:

>    - amdgpu_ras_enable=2

>    - baco-supported smu firmware

>

> The case that PSP reset and baco reset coexist within an XGMI hive is

> not in the consideration.

>

> Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532

> Signed-off-by: Le Ma <le.ma@amd.com<mailto:le.ma@amd.com>>

> ---

>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +

>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ++++++++++++++++++++++++++----

>   2 files changed, 70 insertions(+), 10 deletions(-)

>

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> index d120fe5..08929e6 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> @@ -998,6 +998,8 @@ struct amdgpu_device {

>          int                                           pstate;

>          /* enable runtime pm on the device */

>          bool                            runpm;

> +

> +      bool                                        in_baco;

>   };

>

>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct

> ttm_bo_device *bdev) diff --git

> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> index bd387bb..71abfe9 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> @@ -2654,7 +2654,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

>          struct amdgpu_device *adev =

>                      container_of(__work, struct amdgpu_device, xgmi_reset_work);

>

> -       adev->asic_reset_res =  amdgpu_asic_reset(adev);

> +      if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

> +                  adev->asic_reset_res = (adev->in_baco == false) ?

> +                                          amdgpu_device_baco_enter(adev->ddev) :

> +                                          amdgpu_device_baco_exit(adev->ddev);

> +      else

> +                  adev->asic_reset_res = amdgpu_asic_reset(adev);

> +

>          if (adev->asic_reset_res)

>                      DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

>                                   adev->asic_reset_res, adev->ddev->unique); @@ -3796,6 +3802,7 @@

> static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>          struct amdgpu_device *tmp_adev = NULL;

>          bool need_full_reset = *need_full_reset_arg, vram_lost = false;

>          int r = 0;

> +      int cpu = smp_processor_id();

>

>          /*

>           * ASIC reset has to be done on all HGMI hive nodes ASAP @@

> -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>           */

>          if (need_full_reset) {

>                      list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

> -                               /* For XGMI run all resets in parallel to speed up the process */

> +                              /*

> +                              * For XGMI run all resets in parallel to speed up the

> +                              * process by scheduling the highpri wq on different

> +                              * cpus. For XGMI with baco reset, all nodes must enter

> +                              * baco within close proximity before anyone exit.

> +                              */

>                                  if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {

> -                                           if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))





Note that tmp_adev->xgmi_reset_work (the work item) is per device in XGMI hive and not the same work item. So I don't see why you need to explicitly queue them on different CPUs, they should run in parallel already.



Andrey



[Le]: It’s also beyond my understanding that the 2 node reset work items scheduled to the same cpu do not run in parallel. But from the experiment results on my side, the 2nd work item always runs after the 1st work item finishes. Based on this result, I changed to queue them on different CPUs to make sure cases with more XGMI nodes run in parallel, because baco requires all nodes to enter baco within very close proximity.



The experiment code is as following for your reference. When card0 worker running, card1 worker is not observed to run.



+atomic_t card0_in_baco = ATOMIC_INIT(0);

+atomic_t card1_in_baco = ATOMIC_INIT(0);

+

static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

{

        struct amdgpu_device *adev =

                container_of(__work, struct amdgpu_device, xgmi_reset_work);



+       printk("lema1: card 0x%x goes into reset wq\n", adev->pdev->bus->number);

+       if (adev->pdev->bus->number == 0x7) {

+               atomic_set(&card1_in_baco, 1);

+               printk("lema1: card1 in baco from card1 view\n");

+       }

+

        if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

               adev->asic_reset_res = (adev->in_baco == false) ?

                                amdgpu_device_baco_enter(adev->ddev) :

@@ -2664,6 +2673,23 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

        if (adev->asic_reset_res)

                DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

                         adev->asic_reset_res, adev->ddev->unique);

+

+       if (adev->pdev->bus->number == 0x4) {

+               atomic_set(&card0_in_baco, 1);

+               printk("lema1: card0 in baco from card0 view\n");

+

+               while (true)

+                       if (!!atomic_read(&card1_in_baco))

+                               break;

+               printk("lema1: card1 in baco from card0 view\n");

+       }

+

+       if (adev->pdev->bus->number == 0x7) {

+               while (true)

+                       if (!!atomic_read(&card0_in_baco))

+                               break;

+               printk("lema1: card0 in baco from card1 view\n");

+       }



> +                                          if (!queue_work_on(cpu, system_highpri_wq,

> +                                                                     &tmp_adev->xgmi_reset_work))

>                                                          r = -EALREADY;

> +                                          cpu = cpumask_next(cpu, cpu_online_mask);

>                                  } else

>                                              r = amdgpu_asic_reset(tmp_adev);

> -

> -                               if (r) {

> -                                           DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

> -                                                       r, tmp_adev->ddev->unique);

> +                              if (r)

>                                              break;

> -                               }

>                      }

>

> -                   /* For XGMI wait for all PSP resets to complete before proceed */

> +                  /* For XGMI wait for all work to complete before proceed */

>                      if (!r) {

>                                  list_for_each_entry(tmp_adev, device_list_handle,

>                                                              gmc.xgmi.head) {

> @@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>                                                          r = tmp_adev->asic_reset_res;

>                                                          if (r)

>                                                                      break;

> +                                                      if(AMD_RESET_METHOD_BACO ==

> +                                                         amdgpu_asic_reset_method(tmp_adev))

> +                                                                  tmp_adev->in_baco = true;

>                                              }

>                                  }

>                      }

> -       }

>

> +                  /*

> +                  * For XGMI with baco reset, need exit baco phase by scheduling

> +                  * xgmi_reset_work one more time. PSP reset skips this phase.

> +                  * Not assume the situation that PSP reset and baco reset

> +                  * coexist within an XGMI hive.

> +                  */

> +

> +                  if (!r) {

> +                              cpu = smp_processor_id();

> +                              list_for_each_entry(tmp_adev, device_list_handle,

> +                                                          gmc.xgmi.head) {

> +                                          if (tmp_adev->gmc.xgmi.num_physical_nodes > 1

> +                                              && AMD_RESET_METHOD_BACO ==

> +                                              amdgpu_asic_reset_method(tmp_adev)) {

> +                                                      if (!queue_work_on(cpu,

> +                                                                  system_highpri_wq,

> +                                                                  &tmp_adev->xgmi_reset_work))

> +                                                                  r = -EALREADY;

> +                                                      if (r)

> +                                                                  break;

> +                                                      cpu = cpumask_next(cpu, cpu_online_mask);

> +                                          }

> +                              }

> +                  }

> +

> +                  if (!r) {

> +                              list_for_each_entry(tmp_adev, device_list_handle,

> +                                                          gmc.xgmi.head) {

> +                                          if (tmp_adev->gmc.xgmi.num_physical_nodes > 1

> +                                              && AMD_RESET_METHOD_BACO ==

> +                                              amdgpu_asic_reset_method(tmp_adev)) {

> +                                                      flush_work(&tmp_adev->xgmi_reset_work);

> +                                                      r = tmp_adev->asic_reset_res;

> +                                                      if (r)

> +                                                                  break;

> +                                                      tmp_adev->in_baco = false;

> +                                          }

> +                              }

> +                  }

> +

> +                  if (r) {

> +                              DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

> +                                          r, tmp_adev->ddev->unique);

> +                              goto end;

> +                  }

> +      }

>

>          list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

>                      if (need_full_reset) {

[-- Attachment #1.2: Type: text/html, Size: 48322 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
@ 2019-11-28  9:00             ` Ma, Le
  0 siblings, 0 replies; 57+ messages in thread
From: Ma, Le @ 2019-11-28  9:00 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx
  Cc: Deucher, Alexander, Zhou1, Tao, Li, Dennis, Chen, Guchun, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 12530 bytes --]





-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Sent: Wednesday, November 27, 2019 11:46 PM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI





On 11/27/19 4:15 AM, Le Ma wrote:

> Currently each XGMI node reset wq does not run in parallel because

> same work item bound to same cpu runs in sequence. So change to bind

> the xgmi_reset_work item to different cpus.



It's not the same work item, see more below





>

> XGMI requires all nodes enter into baco within very close proximity

> before any node exit baco. So schedule the xgmi_reset_work wq twice

> for enter/exit baco respectively.

>

> The default reset code path and methods do not change for vega20 production:

>    - baco reset without xgmi/ras

>    - psp reset with xgmi/ras

>

> To enable baco for XGMI/RAS case, both 2 conditions below are needed:

>    - amdgpu_ras_enable=2

>    - baco-supported smu firmware

>

> The case that PSP reset and baco reset coexist within an XGMI hive is

> not in the consideration.

>

> Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532

> Signed-off-by: Le Ma <le.ma@amd.com<mailto:le.ma@amd.com>>

> ---

>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +

>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ++++++++++++++++++++++++++----

>   2 files changed, 70 insertions(+), 10 deletions(-)

>

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> index d120fe5..08929e6 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> @@ -998,6 +998,8 @@ struct amdgpu_device {

>          int                                           pstate;

>          /* enable runtime pm on the device */

>          bool                            runpm;

> +

> +      bool                                        in_baco;

>   };

>

>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct

> ttm_bo_device *bdev) diff --git

> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> index bd387bb..71abfe9 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> @@ -2654,7 +2654,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

>          struct amdgpu_device *adev =

>                      container_of(__work, struct amdgpu_device, xgmi_reset_work);

>

> -       adev->asic_reset_res =  amdgpu_asic_reset(adev);

> +      if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

> +                  adev->asic_reset_res = (adev->in_baco == false) ?

> +                                          amdgpu_device_baco_enter(adev->ddev) :

> +                                          amdgpu_device_baco_exit(adev->ddev);

> +      else

> +                  adev->asic_reset_res = amdgpu_asic_reset(adev);

> +

>          if (adev->asic_reset_res)

>                      DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

>                                   adev->asic_reset_res, adev->ddev->unique); @@ -3796,6 +3802,7 @@

> static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>          struct amdgpu_device *tmp_adev = NULL;

>          bool need_full_reset = *need_full_reset_arg, vram_lost = false;

>          int r = 0;

> +      int cpu = smp_processor_id();

>

>          /*

>           * ASIC reset has to be done on all HGMI hive nodes ASAP @@

> -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>           */

>          if (need_full_reset) {

>                      list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

> -                               /* For XGMI run all resets in parallel to speed up the process */

> +                              /*

> +                              * For XGMI run all resets in parallel to speed up the

> +                              * process by scheduling the highpri wq on different

> +                              * cpus. For XGMI with baco reset, all nodes must enter

> +                              * baco within close proximity before anyone exit.

> +                              */

>                                  if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {

> -                                           if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))





Note that tmp_adev->xgmi_reset_work (the work item) is per device in XGMI hive and not the same work item. So I don't see why you need to explicitly queue them on different CPUs, they should run in parallel already.



Andrey



[Le]: It’s also beyond my understanding that the 2 node reset work items scheduled to the same cpu do not run in parallel. But from the experiment results on my side, the 2nd work item always runs after the 1st work item finishes. Based on this result, I changed to queue them on different CPUs to make sure cases with more XGMI nodes run in parallel, because baco requires all nodes to enter baco within very close proximity.



The experiment code is as following for your reference. When card0 worker running, card1 worker is not observed to run.



+atomic_t card0_in_baco = ATOMIC_INIT(0);

+atomic_t card1_in_baco = ATOMIC_INIT(0);

+

static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

{

        struct amdgpu_device *adev =

                container_of(__work, struct amdgpu_device, xgmi_reset_work);



+       printk("lema1: card 0x%x goes into reset wq\n", adev->pdev->bus->number);

+       if (adev->pdev->bus->number == 0x7) {

+               atomic_set(&card1_in_baco, 1);

+               printk("lema1: card1 in baco from card1 view\n");

+       }

+

        if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

               adev->asic_reset_res = (adev->in_baco == false) ?

                                amdgpu_device_baco_enter(adev->ddev) :

@@ -2664,6 +2673,23 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

        if (adev->asic_reset_res)

                DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

                         adev->asic_reset_res, adev->ddev->unique);

+

+       if (adev->pdev->bus->number == 0x4) {

+               atomic_set(&card0_in_baco, 1);

+               printk("lema1: card0 in baco from card0 view\n");

+

+               while (true)

+                       if (!!atomic_read(&card1_in_baco))

+                               break;

+               printk("lema1: card1 in baco from card0 view\n");

+       }

+

+       if (adev->pdev->bus->number == 0x7) {

+               while (true)

+                       if (!!atomic_read(&card0_in_baco))

+                               break;

+               printk("lema1: card0 in baco from card1 view\n");

+       }



> +                                          if (!queue_work_on(cpu, system_highpri_wq,

> +                                                                     &tmp_adev->xgmi_reset_work))

>                                                          r = -EALREADY;

> +                                          cpu = cpumask_next(cpu, cpu_online_mask);

>                                  } else

>                                              r = amdgpu_asic_reset(tmp_adev);

> -

> -                               if (r) {

> -                                           DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

> -                                                       r, tmp_adev->ddev->unique);

> +                              if (r)

>                                              break;

> -                               }

>                      }

>

> -                   /* For XGMI wait for all PSP resets to complete before proceed */

> +                  /* For XGMI wait for all work to complete before proceed */

>                      if (!r) {

>                                  list_for_each_entry(tmp_adev, device_list_handle,

>                                                              gmc.xgmi.head) {

> @@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>                                                          r = tmp_adev->asic_reset_res;

>                                                          if (r)

>                                                                      break;

> +                                                      if(AMD_RESET_METHOD_BACO ==

> +                                                         amdgpu_asic_reset_method(tmp_adev))

> +                                                                  tmp_adev->in_baco = true;

>                                              }

>                                  }

>                      }

> -       }

>

> +                  /*

> +                  * For XGMI with baco reset, need exit baco phase by scheduling

> +                  * xgmi_reset_work one more time. PSP reset skips this phase.

> +                  * Not assume the situation that PSP reset and baco reset

> +                  * coexist within an XGMI hive.

> +                  */

> +

> +                  if (!r) {

> +                              cpu = smp_processor_id();

> +                              list_for_each_entry(tmp_adev, device_list_handle,

> +                                                          gmc.xgmi.head) {

> +                                          if (tmp_adev->gmc.xgmi.num_physical_nodes > 1

> +                                              && AMD_RESET_METHOD_BACO ==

> +                                              amdgpu_asic_reset_method(tmp_adev)) {

> +                                                      if (!queue_work_on(cpu,

> +                                                                  system_highpri_wq,

> +                                                                  &tmp_adev->xgmi_reset_work))

> +                                                                  r = -EALREADY;

> +                                                      if (r)

> +                                                                  break;

> +                                                      cpu = cpumask_next(cpu, cpu_online_mask);

> +                                          }

> +                              }

> +                  }

> +

> +                  if (!r) {

> +                              list_for_each_entry(tmp_adev, device_list_handle,

> +                                                          gmc.xgmi.head) {

> +                                          if (tmp_adev->gmc.xgmi.num_physical_nodes > 1

> +                                              && AMD_RESET_METHOD_BACO ==

> +                                              amdgpu_asic_reset_method(tmp_adev)) {

> +                                                      flush_work(&tmp_adev->xgmi_reset_work);

> +                                                      r = tmp_adev->asic_reset_res;

> +                                                      if (r)

> +                                                                  break;

> +                                                      tmp_adev->in_baco = false;

> +                                          }

> +                              }

> +                  }

> +

> +                  if (r) {

> +                              DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

> +                                          r, tmp_adev->ddev->unique);

> +                              goto end;

> +                  }

> +      }

>

>          list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

>                      if (need_full_reset) {

[-- Attachment #1.2: Type: text/html, Size: 48322 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-11-28  9:00             ` Ma, Le
  (?)
@ 2019-11-29 16:21             ` Andrey Grodzovsky
  2019-12-02 11:42               ` Ma, Le
  -1 siblings, 1 reply; 57+ messages in thread
From: Andrey Grodzovsky @ 2019-11-29 16:21 UTC (permalink / raw)
  To: Ma, Le, amd-gfx
  Cc: Deucher, Alexander, Zhou1, Tao, Li, Dennis, Chen, Guchun, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 13818 bytes --]


On 11/28/19 4:00 AM, Ma, Le wrote:
>
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Wednesday, November 27, 2019 11:46 PM
> To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; 
> Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset 
> support for XGMI
>
> On 11/27/19 4:15 AM, Le Ma wrote:
>
> > Currently each XGMI node reset wq does not run in parrallel because
>
> > same work item bound to same cpu runs in sequence. So change to bound
>
> > the xgmi_reset_work item to different cpus.
>
> It's not the same work item, see more bellow
>
> >
>
> > XGMI requires all nodes enter into baco within very close proximity
>
> > before any node exit baco. So schedule the xgmi_reset_work wq twice
>
> > for enter/exit baco respectively.
>
> >
>
> > The default reset code path and methods do not change for vega20 
> production:
>
> >    - baco reset without xgmi/ras
>
> >    - psp reset with xgmi/ras
>
> >
>
> > To enable baco for XGMI/RAS case, both 2 conditions below are needed:
>
> >    - amdgpu_ras_enable=2
>
> >    - baco-supported smu firmware
>
> >
>
> > The case that PSP reset and baco reset coexist within an XGMI hive is
>
> > not in the consideration.
>
> >
>
> > Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532
>
> > Signed-off-by: Le Ma <le.ma@amd.com <mailto:le.ma@amd.com>>
>
> > ---
>
> > drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +
>
> > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 
> ++++++++++++++++++++++++++----
>
> >   2 files changed, 70 insertions(+), 10 deletions(-)
>
> >
>
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
> > index d120fe5..08929e6 100644
>
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
> > @@ -998,6 +998,8 @@ struct amdgpu_device {
>
> > int                                           pstate;
>
> >          /* enable runtime pm on the device */
>
> > bool                            runpm;
>
> > +
>
> > + bool                                        in_baco;
>
> >   };
>
> >
>
> >   static inline struct amdgpu_device *amdgpu_ttm_adev(struct
>
> > ttm_bo_device *bdev) diff --git
>
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> > index bd387bb..71abfe9 100644
>
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> > @@ -2654,7 +2654,13 @@ static void 
> amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>
> >          struct amdgpu_device *adev =
>
> > container_of(__work, struct amdgpu_device, xgmi_reset_work);
>
> >
>
> > -       adev->asic_reset_res = amdgpu_asic_reset(adev);
>
> > +      if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>
> > + adev->asic_reset_res = (adev->in_baco == false) ?
>
> > +             amdgpu_device_baco_enter(adev->ddev) :
>
> > +             amdgpu_device_baco_exit(adev->ddev);
>
> > +      else
>
> > + adev->asic_reset_res = amdgpu_asic_reset(adev);
>
> > +
>
> >          if (adev->asic_reset_res)
>
> >                      DRM_WARN("ASIC reset failed with error, %d for 
> drm dev, %s",
>
> >  adev->asic_reset_res, adev->ddev->unique); @@ -3796,6 +3802,7 @@
>
> > static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>
> >          struct amdgpu_device *tmp_adev = NULL;
>
> >          bool need_full_reset = *need_full_reset_arg, vram_lost = false;
>
> >          int r = 0;
>
> > +      int cpu = smp_processor_id();
>
> >
>
> >          /*
>
> >           * ASIC reset has to be done on all HGMI hive nodes ASAP @@
>
> > -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct 
> amdgpu_hive_info *hive,
>
> >           */
>
> >          if (need_full_reset) {
>
> > list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>
> > -                               /* For XGMI run all resets in 
> parallel to speed up the process */
>
> > +                              /*
>
> > +                              * For XGMI run all resets in parallel 
> to speed up the
>
> > +                              * process by scheduling the highpri 
> wq on different
>
> > +                              * cpus. For XGMI with baco reset, all 
> nodes must enter
>
> > +                              * baco within close proximity before 
> anyone exit.
>
> > +                              */
>
> >                                  if 
> (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
>
> > -                                           if 
> (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
>
> Note that tmp_adev->xgmi_reset_work (the work item) is per device in 
> XGMI hive and not the same work item. So I don't see why you need to 
> explicitly queue them on different CPUs, they should run in parallel 
> already.
>
> Andrey
>
> [Le]: It’s also beyond my understanding that the 2 node reset work 
> items scheduled to same cpu does not run in parallel. But from the 
> experiment result in my side, the 2nd work item always run after 1st 
> work item finished. Based on this result, I changed to queue them on 
> different CPUs to make sure more XGMI nodes case to run in parallel, 
> because baco requires all nodes enter baco within very close proximity.
>
> The experiment code is as following for your reference. When card0 
> worker running, card1 worker is not observed to run.
>

The code below will only test that they don't run concurrently - but 
this doesn't mean they don't run on different CPUs and threads. I don't 
have an XGMI setup at hand to test this theory, but what if there is some 
locking dependency between them that serializes their execution? Can 
you just add a one-line print inside amdgpu_device_xgmi_reset_func that 
prints the CPU id, thread name/id and card number?

Andrey


> +atomic_t card0_in_baco = ATOMIC_INIT(0);
>
> +atomic_t card1_in_baco = ATOMIC_INIT(0);
>
> +
>
> static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>
> {
>
> struct amdgpu_device *adev =
>
> container_of(__work, struct amdgpu_device, xgmi_reset_work);
>
> + printk("lema1: card 0x%x goes into reset wq\n", 
> adev->pdev->bus->number);
>
> + if (adev->pdev->bus->number == 0x7) {
>
> + atomic_set(&card1_in_baco, 1);
>
> + printk("lema1: card1 in baco from card1 view\n");
>
> + }
>
> +
>
> if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>
>                adev->asic_reset_res = (adev->in_baco == false) ?
>
> amdgpu_device_baco_enter(adev->ddev) :
>
> @@ -2664,6 +2673,23 @@ static void 
> amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>
> if (adev->asic_reset_res)
>
> DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>
> adev->asic_reset_res, adev->ddev->unique);
>
> +
>
> + if (adev->pdev->bus->number == 0x4) {
>
> + atomic_set(&card0_in_baco, 1);
>
> +        printk("lema1: card0 in baco from card0 view\n");
>
> +
>
> + while (true)
>
> + if (!!atomic_read(&card1_in_baco))
>
> + break;
>
> + printk("lema1: card1 in baco from card0 view\n");
>
> +     }
>
> +
>
> + if (adev->pdev->bus->number == 0x7) {
>
> + while (true)
>
> + if (!!atomic_read(&card0_in_baco))
>
> + break;
>
> + printk("lema1: card0 in baco from card1 view\n");
>
> + }
>
> > +                                          if (!queue_work_on(cpu, 
> system_highpri_wq,
>
> > +    &tmp_adev->xgmi_reset_work))
>
> >                                                        r = -EALREADY;
>
> > +                                          cpu = cpumask_next(cpu, 
> cpu_online_mask);
>
> >                                  } else
>
> >                                            r = 
> amdgpu_asic_reset(tmp_adev);
>
> > -
>
> > -                               if (r) {
>
> > -                                           DRM_ERROR("ASIC reset 
> failed with error, %d for drm dev, %s",
>
> > -                                                       r, 
> tmp_adev->ddev->unique);
>
> > +                              if (r)
>
> >                                            break;
>
> > -                               }
>
> >                      }
>
> >
>
> > -                   /* For XGMI wait for all PSP resets to complete 
> before proceed */
>
> > +                  /* For XGMI wait for all work to complete before 
> proceed */
>
> >                      if (!r) {
>
> > list_for_each_entry(tmp_adev, device_list_handle,
>
> >     gmc.xgmi.head) {
>
> > @@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct 
> amdgpu_hive_info *hive,
>
> >                                                        r = 
> tmp_adev->asic_reset_res;
>
> >                                                        if (r)
>
> > break;
>
> > + if(AMD_RESET_METHOD_BACO ==
>
> > + amdgpu_asic_reset_method(tmp_adev))
>
> > + tmp_adev->in_baco = true;
>
> >                                            }
>
> >                                  }
>
> >                      }
>
> > -       }
>
> >
>
> > +                  /*
>
> > +                  * For XGMI with baco reset, need exit baco phase 
> by scheduling
>
> > +                  * xgmi_reset_work one more time. PSP reset skips 
> this phase.
>
> > +                  * Not assume the situation that PSP reset and 
> baco reset
>
> > +                  * coexist within an XGMI hive.
>
> > +                  */
>
> > +
>
> > +                  if (!r) {
>
> > +                              cpu = smp_processor_id();
>
> > + list_for_each_entry(tmp_adev, device_list_handle,
>
> > + gmc.xgmi.head) {
>
> > +                                          if 
> (tmp_adev->gmc.xgmi.num_physical_nodes > 1
>
> > +                                              && 
> AMD_RESET_METHOD_BACO ==
>
> > + amdgpu_asic_reset_method(tmp_adev)) {
>
> > +                                                      if 
> (!queue_work_on(cpu,
>
> > + system_highpri_wq,
>
> > +             &tmp_adev->xgmi_reset_work))
>
> > + r = -EALREADY;
>
> > +                                                      if (r)
>
> > + break;
>
> > +                                                      cpu = 
> cpumask_next(cpu, cpu_online_mask);
>
> > +                                          }
>
> > +                              }
>
> > +                  }
>
> > +
>
> > +                  if (!r) {
>
> > + list_for_each_entry(tmp_adev, device_list_handle,
>
> > + gmc.xgmi.head) {
>
> > +                                          if 
> (tmp_adev->gmc.xgmi.num_physical_nodes > 1
>
> > +                                              && 
> AMD_RESET_METHOD_BACO ==
>
> > + amdgpu_asic_reset_method(tmp_adev)) {
>
> > + flush_work(&tmp_adev->xgmi_reset_work);
>
> > +                                                      r = 
> tmp_adev->asic_reset_res;
>
> > +                                                      if (r)
>
> > + break;
>
> > + tmp_adev->in_baco = false;
>
> > +                                          }
>
> > +                              }
>
> > +                  }
>
> > +
>
> > +                  if (r) {
>
> > + DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
>
> > +                                          r, tmp_adev->ddev->unique);
>
> > +                              goto end;
>
> > +                  }
>
> > +      }
>
> >
>
> > list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>
> >                      if (need_full_reset) {
>

[-- Attachment #1.2: Type: text/html, Size: 53613 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-11-29 16:21             ` Andrey Grodzovsky
@ 2019-12-02 11:42               ` Ma, Le
  2019-12-02 22:05                 ` Andrey Grodzovsky
  0 siblings, 1 reply; 57+ messages in thread
From: Ma, Le @ 2019-12-02 11:42 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx
  Cc: Deucher, Alexander, Zhou1, Tao, Li, Dennis, Chen, Guchun, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 14094 bytes --]

[AMD Official Use Only - Internal Distribution Only]



From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Sent: Saturday, November 30, 2019 12:22 AM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI



On 11/28/19 4:00 AM, Ma, Le wrote:





-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com><mailto:Andrey.Grodzovsky@amd.com>
Sent: Wednesday, November 27, 2019 11:46 PM
To: Ma, Le <Le.Ma@amd.com><mailto:Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Chen, Guchun <Guchun.Chen@amd.com><mailto:Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com><mailto:Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com><mailto:Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI





On 11/27/19 4:15 AM, Le Ma wrote:

> Currently each XGMI node reset wq does not run in parrallel because

> same work item bound to same cpu runs in sequence. So change to bound

> the xgmi_reset_work item to different cpus.



It's not the same work item, see more bellow





>

> XGMI requires all nodes enter into baco within very close proximity

> before any node exit baco. So schedule the xgmi_reset_work wq twice

> for enter/exit baco respectively.

>

> The default reset code path and methods do not change for vega20 production:

>    - baco reset without xgmi/ras

>    - psp reset with xgmi/ras

>

> To enable baco for XGMI/RAS case, both 2 conditions below are needed:

>    - amdgpu_ras_enable=2

>    - baco-supported smu firmware

>

> The case that PSP reset and baco reset coexist within an XGMI hive is

> not in the consideration.

>

> Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532

> Signed-off-by: Le Ma <le.ma@amd.com<mailto:le.ma@amd.com>>

> ---

>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +

>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ++++++++++++++++++++++++++----

>   2 files changed, 70 insertions(+), 10 deletions(-)

>

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> index d120fe5..08929e6 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> @@ -998,6 +998,8 @@ struct amdgpu_device {

>          int                                           pstate;

>          /* enable runtime pm on the device */

>          bool                            runpm;

> +

> +      bool                                        in_baco;

>   };

>

>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct

> ttm_bo_device *bdev) diff --git

> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> index bd387bb..71abfe9 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> @@ -2654,7 +2654,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

>          struct amdgpu_device *adev =

>                      container_of(__work, struct amdgpu_device, xgmi_reset_work);

>

> -       adev->asic_reset_res =  amdgpu_asic_reset(adev);

> +      if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

> +                  adev->asic_reset_res = (adev->in_baco == false) ?

> +                                          amdgpu_device_baco_enter(adev->ddev) :

> +                                          amdgpu_device_baco_exit(adev->ddev);

> +      else

> +                  adev->asic_reset_res = amdgpu_asic_reset(adev);

> +

>          if (adev->asic_reset_res)

>                      DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

>                                   adev->asic_reset_res, adev->ddev->unique); @@ -3796,6 +3802,7 @@

> static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>          struct amdgpu_device *tmp_adev = NULL;

>          bool need_full_reset = *need_full_reset_arg, vram_lost = false;

>          int r = 0;

> +      int cpu = smp_processor_id();

>

>          /*

>           * ASIC reset has to be done on all HGMI hive nodes ASAP @@

> -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>           */

>          if (need_full_reset) {

>                      list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

> -                               /* For XGMI run all resets in parallel to speed up the process */

> +                              /*

> +                              * For XGMI run all resets in parallel to speed up the

> +                              * process by scheduling the highpri wq on different

> +                              * cpus. For XGMI with baco reset, all nodes must enter

> +                              * baco within close proximity before anyone exit.

> +                              */

>                                  if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {

> -                                           if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))





Note that tmp_adev->xgmi_reset_work (the work item) is per device in XGMI hive and not the same work item. So I don't see why you need to explicitly queue them on different CPUs, they should run in parallel already.



Andrey



[Le]: It's also beyond my understanding that the 2 node reset work items scheduled to the same CPU do not run in parallel. But from the experiment result on my side, the 2nd work item always runs after the 1st work item has finished. Based on this result, I changed to queue them on different CPUs to make sure cases with more XGMI nodes run in parallel, because baco requires all nodes to enter baco within very close proximity.



The experiment code is as following for your reference. When card0 worker running, card1 worker is not observed to run.



The code below will only test that they don't run concurrently - but this doesn't mean they don't run on different CPUs and threads. I don't have an XGMI setup at hand to test this theory, but what if there is some locking dependency between them that serializes their execution? Can you just add a one-line print inside amdgpu_device_xgmi_reset_func that prints the CPU id, thread name/id and card number?

Andrey

[Le]: I checked that if queue_work() is used directly several times, the same CPU thread will be used, and the per-CPU worker will execute the items one by one. Our goal here is to make xgmi_reset_func run concurrently for the XGMI BACO case. That's why I schedule them on different CPUs to run in parallel. And I can share the XGMI system with you if you'd like to verify further.



+atomic_t card0_in_baco = ATOMIC_INIT(0);

+atomic_t card1_in_baco = ATOMIC_INIT(0);

+

static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

{

        struct amdgpu_device *adev =

                container_of(__work, struct amdgpu_device, xgmi_reset_work);



+       printk("lema1: card 0x%x goes into reset wq\n", adev->pdev->bus->number);

+       if (adev->pdev->bus->number == 0x7) {

+               atomic_set(&card1_in_baco, 1);

+               printk("lema1: card1 in baco from card1 view\n");

+       }

+

        if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

               adev->asic_reset_res = (adev->in_baco == false) ?

                                amdgpu_device_baco_enter(adev->ddev) :

@@ -2664,6 +2673,23 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

        if (adev->asic_reset_res)

                DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

                         adev->asic_reset_res, adev->ddev->unique);

+

+       if (adev->pdev->bus->number == 0x4) {

+               atomic_set(&card0_in_baco, 1);

+               printk("lema1: card0 in baco from card0 view\n");

+

+               while (true)

+                       if (!!atomic_read(&card1_in_baco))

+                               break;

+               printk("lema1: card1 in baco from card0 view\n");

+       }

+

+       if (adev->pdev->bus->number == 0x7) {

+               while (true)

+                       if (!!atomic_read(&card0_in_baco))

+                               break;

+               printk("lema1: card0 in baco from card1 view\n");

+       }



> +                                          if (!queue_work_on(cpu, system_highpri_wq,

> +                                                                     &tmp_adev->xgmi_reset_work))

>                                                          r = -EALREADY;

> +                                          cpu = cpumask_next(cpu, cpu_online_mask);

>                                  } else

>                                              r = amdgpu_asic_reset(tmp_adev);

> -

> -                               if (r) {

> -                                           DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

> -                                                       r, tmp_adev->ddev->unique);

> +                              if (r)

>                                              break;

> -                               }

>                      }

>

> -                   /* For XGMI wait for all PSP resets to complete before proceed */

> +                  /* For XGMI wait for all work to complete before proceed */

>                      if (!r) {

>                                  list_for_each_entry(tmp_adev, device_list_handle,

>                                                              gmc.xgmi.head) {

> @@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>                                                          r = tmp_adev->asic_reset_res;

>                                                          if (r)

>                                                                      break;

> +                                                      if(AMD_RESET_METHOD_BACO ==

> +                                                         amdgpu_asic_reset_method(tmp_adev))

> +                                                                  tmp_adev->in_baco = true;

>                                              }

>                                  }

>                      }

> -       }

>

> +                  /*

> +                  * For XGMI with baco reset, need exit baco phase by scheduling

> +                  * xgmi_reset_work one more time. PSP reset skips this phase.

> +                  * Not assume the situation that PSP reset and baco reset

> +                  * coexist within an XGMI hive.

> +                  */

> +

> +                  if (!r) {

> +                              cpu = smp_processor_id();

> +                              list_for_each_entry(tmp_adev, device_list_handle,

> +                                                          gmc.xgmi.head) {

> +                                          if (tmp_adev->gmc.xgmi.num_physical_nodes > 1

> +                                              && AMD_RESET_METHOD_BACO ==

> +                                              amdgpu_asic_reset_method(tmp_adev)) {

> +                                                      if (!queue_work_on(cpu,

> +                                                                  system_highpri_wq,

> +                                                                  &tmp_adev->xgmi_reset_work))

> +                                                                  r = -EALREADY;

> +                                                      if (r)

> +                                                                  break;

> +                                                      cpu = cpumask_next(cpu, cpu_online_mask);

> +                                          }

> +                              }

> +                  }

> +

> +                  if (!r) {

> +                              list_for_each_entry(tmp_adev, device_list_handle,

> +                                                          gmc.xgmi.head) {

> +                                          if (tmp_adev->gmc.xgmi.num_physical_nodes > 1

> +                                              && AMD_RESET_METHOD_BACO ==

> +                                              amdgpu_asic_reset_method(tmp_adev)) {

> +                                                      flush_work(&tmp_adev->xgmi_reset_work);

> +                                                      r = tmp_adev->asic_reset_res;

> +                                                      if (r)

> +                                                                  break;

> +                                                      tmp_adev->in_baco = false;

> +                                          }

> +                              }

> +                  }

> +

> +                  if (r) {

> +                              DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

> +                                          r, tmp_adev->ddev->unique);

> +                              goto end;

> +                  }

> +      }

>

>          list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

>                      if (need_full_reset) {

[-- Attachment #1.2: Type: text/html, Size: 48838 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-12-02 11:42               ` Ma, Le
@ 2019-12-02 22:05                 ` Andrey Grodzovsky
       [not found]                   ` <MN2PR12MB42855B198BB4064A0D311845F6420@MN2PR12MB4285.namprd12.prod.outlook.com>
  0 siblings, 1 reply; 57+ messages in thread
From: Andrey Grodzovsky @ 2019-12-02 22:05 UTC (permalink / raw)
  To: Ma, Le, amd-gfx
  Cc: Deucher, Alexander, Zhou1, Tao, Li, Dennis, Chen, Guchun, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 16162 bytes --]


On 12/2/19 6:42 AM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> *From:*Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> *Sent:* Saturday, November 30, 2019 12:22 AM
> *To:* Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org
> *Cc:* Chen, Guchun <Guchun.Chen@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; 
> Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
> *Subject:* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset 
> support for XGMI
>
> On 11/28/19 4:00 AM, Ma, Le wrote:
>
>     -----Original Message-----
>     From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>     <mailto:Andrey.Grodzovsky@amd.com>
>     Sent: Wednesday, November 27, 2019 11:46 PM
>     To: Ma, Le <Le.Ma@amd.com> <mailto:Le.Ma@amd.com>;
>     amd-gfx@lists.freedesktop.org <mailto:amd-gfx@lists.freedesktop.org>
>     Cc: Chen, Guchun <Guchun.Chen@amd.com>
>     <mailto:Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>
>     <mailto:Tao.Zhou1@amd.com>; Deucher, Alexander
>     <Alexander.Deucher@amd.com> <mailto:Alexander.Deucher@amd.com>;
>     Li, Dennis <Dennis.Li@amd.com> <mailto:Dennis.Li@amd.com>; Zhang,
>     Hawking <Hawking.Zhang@amd.com> <mailto:Hawking.Zhang@amd.com>
>     Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset
>     support for XGMI
>
>     On 11/27/19 4:15 AM, Le Ma wrote:
>
>     > Currently each XGMI node reset wq does not run in parallel because

>     > same work item bound to same cpu runs in sequence. So change to
>     bind

>     > the xgmi_reset_work item to different cpus.
>
>     It's not the same work item, see more bellow
>
>     >
>
>     > XGMI requires all nodes enter into baco within very close proximity
>
>     > before any node exit baco. So schedule the xgmi_reset_work wq twice
>
>     > for enter/exit baco respectively.
>
>     >
>
>     > The default reset code path and methods do not change for vega20
>     production:
>
>     >    - baco reset without xgmi/ras
>
>     >    - psp reset with xgmi/ras
>
>     >
>
>     > To enable baco for XGMI/RAS case, both 2 conditions below are
>     needed:
>
>     >    - amdgpu_ras_enable=2
>
>     >    - baco-supported smu firmware
>
>     >
>
>     > The case that PSP reset and baco reset coexist within an XGMI
>     hive is
>
>     > not in the consideration.
>
>     >
>
>     > Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532
>
>     > Signed-off-by: Le Ma <le.ma@amd.com <mailto:le.ma@amd.com>>
>
>     > ---
>
>     > drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +
>
>     > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78
>     ++++++++++++++++++++++++++----
>
>     >   2 files changed, 70 insertions(+), 10 deletions(-)
>
>     >
>
>     > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
>     > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
>     > index d120fe5..08929e6 100644
>
>     > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
>     > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
>     > @@ -998,6 +998,8 @@ struct amdgpu_device {
>
>     > int                                           pstate;
>
>     >          /* enable runtime pm on the device */
>
>     > bool                            runpm;
>
>     > +
>
>     > + bool                                        in_baco;
>
>     >   };
>
>     >
>
>     >   static inline struct amdgpu_device *amdgpu_ttm_adev(struct
>
>     > ttm_bo_device *bdev) diff --git
>
>     > a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
>     > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
>     > index bd387bb..71abfe9 100644
>
>     > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
>     > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
>     > @@ -2654,7 +2654,13 @@ static void
>     amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>
>     >          struct amdgpu_device *adev =
>
>     > container_of(__work, struct amdgpu_device, xgmi_reset_work);
>
>     >
>
>     > -       adev->asic_reset_res =  amdgpu_asic_reset(adev);
>
>     > +      if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>
>     > + adev->asic_reset_res = (adev->in_baco == false) ?
>
>     > +             amdgpu_device_baco_enter(adev->ddev) :
>
>     > +             amdgpu_device_baco_exit(adev->ddev);
>
>     > +      else
>
>     > + adev->asic_reset_res = amdgpu_asic_reset(adev);
>
>     > +
>
>     >          if (adev->asic_reset_res)
>
>     > DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>
>     >  adev->asic_reset_res, adev->ddev->unique); @@ -3796,6 +3802,7 @@
>
>     > static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>
>     >          struct amdgpu_device *tmp_adev = NULL;
>
>     >          bool need_full_reset = *need_full_reset_arg, vram_lost
>     = false;
>
>     >          int r = 0;
>
>     > +      int cpu = smp_processor_id();
>
>     >
>
>     >          /*
>
>     >           * ASIC reset has to be done on all HGMI hive nodes
>     ASAP @@
>
>     > -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct
>     amdgpu_hive_info *hive,
>
>     >           */
>
>     >          if (need_full_reset) {
>
>     > list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>
>     > - /* For XGMI run all resets in parallel to speed up the process */
>
>     > +                              /*
>
>     > +                              * For XGMI run all resets in
>     parallel to speed up the
>
>     > +                              * process by scheduling the
>     highpri wq on different
>
>     > +                              * cpus. For XGMI with baco reset,
>     all nodes must enter
>
>     > +                              * baco within close proximity
>     before anyone exit.
>
>     > +                              */
>
>     > if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
>
>     > -                                           if
>     (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
>
>     Note that tmp_adev->xgmi_reset_work (the work item) is per device
>     in XGMI hive and not the same work item. So I don't see why you
>     need to explicitly queue them on different CPUs, they should run
>     in parallel already.
>
>     Andrey
>
>     [Le]: It’s also beyond my understanding that the 2 node reset work
>     items scheduled to same cpu does not run in parallel. But from the
>     experiment result in my side, the 2nd work item always run after
>     1st work item finished. Based on this result, I changed to queue
>     them on different CPUs to make sure more XGMI nodes case to run in
>     parallel, because baco requires all nodes enter baco within very
>     close proximity.
>
>     The experiment code is as following for your reference. When card0
>     worker running, card1 worker is not observed to run.
>
> The code below will only test that they don't run concurrently - but 
> this doesn't mean they don't run on different CPUs and threads, I don't 
> have an XGMI setup at hand to test this theory but what if there is 
> some locking dependency between them that serializes their execution ? 
> Can you just add a one line print inside amdgpu_device_xgmi_reset_func 
> that prints CPU id, thread name/id and card number ?
>
> Andrey
>
> [Le]: I checked if directly use queue_work() several times, the same 
> CPU thread will be used. And the worker per CPU will execute the item 
> one by one. Our goal here is to make the xgmi_reset_func run 
> concurrently for XGMI BACO case. That’s why I schedule them on 
> different CPUs to run parallelly. And I can share the XGMI system with 
> you if you’d like to verify more.
>

I tried today to setup XGMI 2P setup to test this but weren't able to 
load with the XGMI bridge in place (maybe faulty bridge) - so yea - 
maybe leave me your setup before your changes (the original code) so i 
can try to open some kernel traces that show CPU id and thread id to 
check this. It's just so weird that system_highpri_wq which is 
documented to be multi-cpu and multi-threaded wouldn't queue those work 
items to different cpus and worker threads.

Andrey


>     +atomic_t card0_in_baco = ATOMIC_INIT(0);
>
>     +atomic_t card1_in_baco = ATOMIC_INIT(0);
>
>     +
>
>     static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>
>     {
>
>     struct amdgpu_device *adev =
>
>     container_of(__work, struct amdgpu_device, xgmi_reset_work);
>
>     + printk("lema1: card 0x%x goes into reset wq\n",
>     adev->pdev->bus->number);
>
>     +       if (adev->pdev->bus->number == 0x7) {
>
>     + atomic_set(&card1_in_baco, 1);
>
>     + printk("lema1: card1 in baco from card1 view\n");
>
>     +       }
>
>     +
>
>             if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>
>                    adev->asic_reset_res = (adev->in_baco == false) ?
>
>     amdgpu_device_baco_enter(adev->ddev) :
>
>     @@ -2664,6 +2673,23 @@ static void
>     amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>
>             if (adev->asic_reset_res)
>
>     DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>
>     adev->asic_reset_res, adev->ddev->unique);
>
>     +
>
>     +       if (adev->pdev->bus->number == 0x4) {
>
>     + atomic_set(&card0_in_baco, 1);
>
>     +        printk("lema1: card0 in baco from card0 view\n");
>
>     +
>
>     + while (true)
>
>     + if (!!atomic_read(&card1_in_baco))
>
>     + break;
>
>     + printk("lema1: card1 in baco from card0 view\n");
>
>     +       }
>
>     +
>
>     +       if (adev->pdev->bus->number == 0x7) {
>
>     + while (true)
>
>     + if (!!atomic_read(&card0_in_baco))
>
>     + break;
>
>     + printk("lema1: card0 in baco from card1 view\n");
>
>     +       }
>
>     > +                                          if
>     (!queue_work_on(cpu, system_highpri_wq,
>
>     > +    &tmp_adev->xgmi_reset_work))
>
>     >                                                        r =
>     -EALREADY;
>
>     > +                                          cpu =
>     cpumask_next(cpu, cpu_online_mask);
>
>     > } else
>
>     >                                            r =
>     amdgpu_asic_reset(tmp_adev);
>
>     > -
>
>     > - if (r) {
>
>     > -                                           DRM_ERROR("ASIC
>     reset failed with error, %d for drm dev, %s",
>
>     > -                                                       r,
>     tmp_adev->ddev->unique);
>
>     > +                              if (r)
>
>     >                                            break;
>
>     > -                               }
>
>     >                      }
>
>     >
>
>     > -                   /* For XGMI wait for all PSP resets to
>     complete before proceed */
>
>     > +                  /* For XGMI wait for all work to complete
>     before proceed */
>
>     >                      if (!r) {
>
>     > list_for_each_entry(tmp_adev, device_list_handle,
>
>     >     gmc.xgmi.head) {
>
>     > @@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct
>     amdgpu_hive_info *hive,
>
>     >                                                        r =
>     tmp_adev->asic_reset_res;
>
>     >                                                        if (r)
>
>     > break;
>
>     > + if(AMD_RESET_METHOD_BACO ==
>
>     > + amdgpu_asic_reset_method(tmp_adev))
>
>     > + tmp_adev->in_baco = true;
>
>     >                                            }
>
>     > }
>
>     >                      }
>
>     > -       }
>
>     >
>
>     > +                  /*
>
>     > +                  * For XGMI with baco reset, need exit baco
>     phase by scheduling
>
>     > +                  * xgmi_reset_work one more time. PSP reset
>     skips this phase.
>
>     > +                  * Not assume the situation that PSP reset and
>     baco reset
>
>     > +                  * coexist within an XGMI hive.
>
>     > +                  */
>
>     > +
>
>     > +                  if (!r) {
>
>     > + cpu = smp_processor_id();
>
>     > + list_for_each_entry(tmp_adev, device_list_handle,
>
>     > + gmc.xgmi.head) {
>
>     > +                                          if
>     (tmp_adev->gmc.xgmi.num_physical_nodes > 1
>
>     > +                                              &&
>     AMD_RESET_METHOD_BACO ==
>
>     > + amdgpu_asic_reset_method(tmp_adev)) {
>
>     > +                                                      if
>     (!queue_work_on(cpu,
>
>     > + system_highpri_wq,
>
>     > +             &tmp_adev->xgmi_reset_work))
>
>     > + r = -EALREADY;
>
>     > +                                                      if (r)
>
>     > + break;
>
>     > +                                                      cpu =
>     cpumask_next(cpu, cpu_online_mask);
>
>     > +                                          }
>
>     > +                              }
>
>     > +                  }
>
>     > +
>
>     > +                  if (!r) {
>
>     > + list_for_each_entry(tmp_adev, device_list_handle,
>
>     > + gmc.xgmi.head) {
>
>     > +                                          if
>     (tmp_adev->gmc.xgmi.num_physical_nodes > 1
>
>     > +                                              &&
>     AMD_RESET_METHOD_BACO ==
>
>     > + amdgpu_asic_reset_method(tmp_adev)) {
>
>     > + flush_work(&tmp_adev->xgmi_reset_work);
>
>     > +                                                      r =
>     tmp_adev->asic_reset_res;
>
>     > +                                                      if (r)
>
>     > + break;
>
>     > + tmp_adev->in_baco = false;
>
>     > +                                          }
>
>     > +                              }
>
>     > +                  }
>
>     > +
>
>     > +                  if (r) {
>
>     > + DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
>
>     > +                                          r,
>     tmp_adev->ddev->unique);
>
>     > + goto end;
>
>     > +                  }
>
>     > +      }
>
>     >
>
>     > list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>
>     >                      if (need_full_reset) {
>

[-- Attachment #1.2: Type: text/html, Size: 55616 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
       [not found]                     ` <2c4dd3f3-e2ce-9843-312b-1e5c05a51521@amd.com>
@ 2019-12-04  7:09                       ` Ma, Le
  2019-12-04 16:05                         ` Andrey Grodzovsky
  0 siblings, 1 reply; 57+ messages in thread
From: Ma, Le @ 2019-12-04  7:09 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx, Zhou1, Tao, Deucher, Alexander, Li,
	Dennis, Zhang, Hawking
  Cc: Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 20417 bytes --]

[AMD Official Use Only - Internal Distribution Only]


From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Sent: Wednesday, December 4, 2019 2:44 AM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: Chen, Guchun <Guchun.Chen@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI


Thanks Ma, this was very helpful as I am still not able to set up an XGMI hive with latest FW and VBIOS.

I traced the workqueue subsystem (full log attached). Specifically here is the life cycle of our 2 work items executing amdgpu_device_xgmi_reset_func bellow

[Le]: Thanks Andrey for the deep debug. Your feedback gave me a more profound understanding on this case. My comments split as below.

You were right to note they both run on the same CPU (32) but they are executed by different threads. Also as you see by workqueue_execute_start/end timestamps they actually ran in parallel and not one after another even while being assigned to the same CPU and that because of thread preemption (there is at least psp_v11_0_mode1_reset->msleep(500)) which yields the CPU and hence allows the second work to run + I am sure that on a preemptive kernel one reset work would be preempted at some point anyway and let the other run.

[Le]: Yes, from the trace log, the xgmi_reset_func items are assigned to different work threads bound to one same CPU. And you are right that cpu preemption will happen when msleep is called, which yields the CPU to allow the second work to run. That’s a great finding😊. But it’s not a real parallel run to me because the second work can only preempt to run when the first work goes to sleep. I made an experiment here to change this unique msleep to udelay, then the second work item will run after the first item finished, in serial execution.

Now you had issues with BACO reset while the test I ran on your system is mode1 reset and so I assumed that maybe BACO has some non preempt-able busy wait which doesn't give a chance to second work item's thread to run on that CPU before the first finished - but from looking in the code I see smu_v11_0_baco_enter->msleep(10) so even in that case the first reset work item was supposed to yield CPU after BACO ENTER sent to SMU and let the other reset work do the same to the second card and so i don't see how even in this case there is a serial execution ?

[Le]: VG20 uses old powerplay framework (smu_v11_0_baco_enter->msleep(10) in swSMU framework), so no msleep and no CPU preemption. BACO reset has Enter/Exit 2 phases. We expect all the XGMI nodes enter BACO simultaneously instead of one after one as a serial execution, then exit BACO simultaneously.

P.S How you solution solves the case where the XGMI hive is bigger then number of CPUs on the system ? Assuming that what you say is correct and there is a serial execution when on the same CPU, if they hive is bigger then number of CPUs you will eventually get back to sending reset work to a CPU already executing BACO ENTER (or EXIT) for another device and will get the serialization problem anyway.

[Le]: Yeah, I also considered the situation that XGMI hive bigger than CPU NR. I think it’s an extreme situation and should not exist. However, assuming it exists, many work items scatter in several CPUs will be executed faster than bound to one same CPU, isn’t it ?

             cat-3002  [032] d... 33153.791829: workqueue_queue_work: work struct=00000000e43c1ebb function=amdgpu_device_xgmi_reset_func [amdgpu] workqueue=0000000080331d91 req_cpu=8192 cpu=32
             cat-3002  [032] d... 33153.791829: workqueue_activate_work: work struct 00000000e43c1ebb
             cat-3002  [032] dN.. 33153.791831: workqueue_queue_work: work struct=00000000e67113aa function=amdgpu_device_xgmi_reset_func [amdgpu] workqueue=0000000080331d91 req_cpu=8192 cpu=32
             cat-3002  [032] dN.. 33153.791832: workqueue_activate_work: work struct 00000000e67113aa
   kworker/32:1H-551   [032] .... 33153.791834: workqueue_execute_start: work struct 00000000e43c1ebb: function amdgpu_device_xgmi_reset_func [amdgpu]
   kworker/32:0H-175   [032] .... 33153.792087: workqueue_execute_start: work struct 00000000e67113aa: function amdgpu_device_xgmi_reset_func [amdgpu]
   kworker/32:1H-551   [032] .... 33154.310948: workqueue_execute_end: work struct 00000000e43c1ebb
   kworker/32:0H-175   [032] .... 33154.311043: workqueue_execute_end: work struct 00000000e67113aa

Andrey




On 12/3/19 5:06 AM, Ma, Le wrote:

[AMD Official Use Only - Internal Distribution Only]

Hi Andrey,

You can try the XGMI system below:
              IP: 10.67.69.53
              U/P: jenkins/0

The original drm-next kernel is installed.

Regards,
Ma Le

From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com><mailto:Andrey.Grodzovsky@amd.com>
Sent: Tuesday, December 3, 2019 6:05 AM
To: Ma, Le <Le.Ma@amd.com><mailto:Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Chen, Guchun <Guchun.Chen@amd.com><mailto:Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com><mailto:Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com><mailto:Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI



On 12/2/19 6:42 AM, Ma, Le wrote:

[AMD Official Use Only - Internal Distribution Only]



From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com><mailto:Andrey.Grodzovsky@amd.com>
Sent: Saturday, November 30, 2019 12:22 AM
To: Ma, Le <Le.Ma@amd.com><mailto:Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Chen, Guchun <Guchun.Chen@amd.com><mailto:Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com><mailto:Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com><mailto:Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI



On 11/28/19 4:00 AM, Ma, Le wrote:





-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com><mailto:Andrey.Grodzovsky@amd.com>
Sent: Wednesday, November 27, 2019 11:46 PM
To: Ma, Le <Le.Ma@amd.com><mailto:Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Chen, Guchun <Guchun.Chen@amd.com><mailto:Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com><mailto:Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com><mailto:Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI





On 11/27/19 4:15 AM, Le Ma wrote:

> Currently each XGMI node reset wq does not run in parallel because

> same work item bound to same cpu runs in sequence. So change to bind

> the xgmi_reset_work item to different cpus.



It's not the same work item, see more bellow





>

> XGMI requires all nodes enter into baco within very close proximity

> before any node exit baco. So schedule the xgmi_reset_work wq twice

> for enter/exit baco respectively.

>

> The default reset code path and methods do not change for vega20 production:

>    - baco reset without xgmi/ras

>    - psp reset with xgmi/ras

>

> To enable baco for XGMI/RAS case, both 2 conditions below are needed:

>    - amdgpu_ras_enable=2

>    - baco-supported smu firmware

>

> The case that PSP reset and baco reset coexist within an XGMI hive is

> not in the consideration.

>

> Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532

> Signed-off-by: Le Ma <le.ma@amd.com<mailto:le.ma@amd.com>>

> ---

>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +

>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ++++++++++++++++++++++++++----

>   2 files changed, 70 insertions(+), 10 deletions(-)

>

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> index d120fe5..08929e6 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> @@ -998,6 +998,8 @@ struct amdgpu_device {

>          int                                           pstate;

>          /* enable runtime pm on the device */

>          bool                            runpm;

> +

> +      bool                                        in_baco;

>   };

>

>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct

> ttm_bo_device *bdev) diff --git

> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> index bd387bb..71abfe9 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> @@ -2654,7 +2654,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

>          struct amdgpu_device *adev =

>                      container_of(__work, struct amdgpu_device, xgmi_reset_work);

>

> -       adev->asic_reset_res =  amdgpu_asic_reset(adev);

> +      if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

> +                  adev->asic_reset_res = (adev->in_baco == false) ?

> +                                          amdgpu_device_baco_enter(adev->ddev) :

> +                                          amdgpu_device_baco_exit(adev->ddev);

> +      else

> +                  adev->asic_reset_res = amdgpu_asic_reset(adev);

> +

>          if (adev->asic_reset_res)

>                      DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

>                                   adev->asic_reset_res, adev->ddev->unique); @@ -3796,6 +3802,7 @@

> static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>          struct amdgpu_device *tmp_adev = NULL;

>          bool need_full_reset = *need_full_reset_arg, vram_lost = false;

>          int r = 0;

> +      int cpu = smp_processor_id();

>

>          /*

>           * ASIC reset has to be done on all HGMI hive nodes ASAP @@

> -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>           */

>          if (need_full_reset) {

>                      list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

> -                               /* For XGMI run all resets in parallel to speed up the process */

> +                              /*

> +                              * For XGMI run all resets in parallel to speed up the

> +                              * process by scheduling the highpri wq on different

> +                              * cpus. For XGMI with baco reset, all nodes must enter

> +                              * baco within close proximity before anyone exit.

> +                              */

>                                  if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {

> -                                           if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))





Note that tmp_adev->xgmi_reset_work (the work item) is per device in XGMI hive and not the same work item. So I don't see why you need to explicitly queue them on different CPUs, they should run in parallel already.



Andrey



[Le]: It’s also beyond my understanding that the 2 node reset work items scheduled to same cpu does not run in parallel. But from the experiment result in my side, the 2nd work item always run after 1st work item finished. Based on this result, I changed to queue them on different CPUs to make sure more XGMI nodes case to run in parallel, because baco requires all nodes enter baco within very close proximity.



The experiment code is as following for your reference. When card0 worker running, card1 worker is not observed to run.



The code below will only test that they don't run concurrently - but this doesn't mean they don't run on different CPUs and threads, I don't have an XGMI setup at hand to test this theory but what if there is some locking dependency between them that serializes their execution ? Can you just add a one line print inside amdgpu_device_xgmi_reset_func that prints CPU id, thread name/id and card number ?

Andrey

[Le]: I checked if directly use queue_work() several times, the same CPU thread will be used. And the worker per CPU will execute the item one by one. Our goal here is to make the xgmi_reset_func run concurrently for XGMI BACO case. That’s why I schedule them on different CPUs to run parallelly. And I can share the XGMI system with you if you’d like to verify more.



I tried today to setup XGMI 2P setup to test this but weren't able to load with the XGMI bridge in place (maybe faulty bridge) - so yea - maybe leave me your setup before your changes (the original code) so i can try to open some kernel traces that show CPU id and thread id to check this. It's just so weird that system_highpri_wq which is documented to be multi-cpu and multi-threaded wouldn't queue those work items to different cpus and worker threads.

Andrey





+atomic_t card0_in_baco = ATOMIC_INIT(0);

+atomic_t card1_in_baco = ATOMIC_INIT(0);

+

static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

{

        struct amdgpu_device *adev =

                container_of(__work, struct amdgpu_device, xgmi_reset_work);



+       printk("lema1: card 0x%x goes into reset wq\n", adev->pdev->bus->number);

+       if (adev->pdev->bus->number == 0x7) {

+               atomic_set(&card1_in_baco, 1);

+               printk("lema1: card1 in baco from card1 view\n");

+       }

+

        if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

               adev->asic_reset_res = (adev->in_baco == false) ?

                                amdgpu_device_baco_enter(adev->ddev) :

@@ -2664,6 +2673,23 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

        if (adev->asic_reset_res)

                DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

                         adev->asic_reset_res, adev->ddev->unique);

+

+       if (adev->pdev->bus->number == 0x4) {

+               atomic_set(&card0_in_baco, 1);

+               printk("lema1: card0 in baco from card0 view\n");

+

+               while (true)

+                       if (!!atomic_read(&card1_in_baco))

+                               break;

+               printk("lema1: card1 in baco from card0 view\n");

+       }

+

+       if (adev->pdev->bus->number == 0x7) {

+               while (true)

+                       if (!!atomic_read(&card0_in_baco))

+                               break;

+               printk("lema1: card0 in baco from card1 view\n");

+       }



> +                                          if (!queue_work_on(cpu, system_highpri_wq,

> +                                                                     &tmp_adev->xgmi_reset_work))

>                                                          r = -EALREADY;

> +                                          cpu = cpumask_next(cpu, cpu_online_mask);

>                                  } else

>                                              r = amdgpu_asic_reset(tmp_adev);

> -

> -                               if (r) {

> -                                           DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

> -                                                       r, tmp_adev->ddev->unique);

> +                              if (r)

>                                              break;

> -                               }

>                      }

>

> -                   /* For XGMI wait for all PSP resets to complete before proceed */

> +                  /* For XGMI wait for all work to complete before proceed */

>                      if (!r) {

>                                  list_for_each_entry(tmp_adev, device_list_handle,

>                                                              gmc.xgmi.head) {

> @@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>                                                          r = tmp_adev->asic_reset_res;

>                                                          if (r)

>                                                                      break;

> +                                                      if(AMD_RESET_METHOD_BACO ==

> +                                                         amdgpu_asic_reset_method(tmp_adev))

> +                                                                  tmp_adev->in_baco = true;

>                                              }

>                                  }

>                      }

> -       }

>

> +                  /*

> +                  * For XGMI with baco reset, need exit baco phase by scheduling

> +                  * xgmi_reset_work one more time. PSP reset skips this phase.

> +                  * Not assume the situation that PSP reset and baco reset

> +                  * coexist within an XGMI hive.

> +                  */

> +

> +                  if (!r) {

> +                              cpu = smp_processor_id();

> +                              list_for_each_entry(tmp_adev, device_list_handle,

> +                                                          gmc.xgmi.head) {

> +                                          if (tmp_adev->gmc.xgmi.num_physical_nodes > 1

> +                                              && AMD_RESET_METHOD_BACO ==

> +                                              amdgpu_asic_reset_method(tmp_adev)) {

> +                                                      if (!queue_work_on(cpu,

> +                                                                  system_highpri_wq,

> +                                                                  &tmp_adev->xgmi_reset_work))

> +                                                                  r = -EALREADY;

> +                                                      if (r)

> +                                                                  break;

> +                                                      cpu = cpumask_next(cpu, cpu_online_mask);

> +                                          }

> +                              }

> +                  }

> +

> +                  if (!r) {

> +                              list_for_each_entry(tmp_adev, device_list_handle,

> +                                                          gmc.xgmi.head) {

> +                                          if (tmp_adev->gmc.xgmi.num_physical_nodes > 1

> +                                              && AMD_RESET_METHOD_BACO ==

> +                                              amdgpu_asic_reset_method(tmp_adev)) {

> +                                                      flush_work(&tmp_adev->xgmi_reset_work);

> +                                                      r = tmp_adev->asic_reset_res;

> +                                                      if (r)

> +                                                                  break;

> +                                                      tmp_adev->in_baco = false;

> +                                          }

> +                              }

> +                  }

> +

> +                  if (r) {

> +                              DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

> +                                          r, tmp_adev->ddev->unique);

> +                              goto end;

> +                  }

> +      }

>

>          list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

>                      if (need_full_reset) {

[-- Attachment #1.2: Type: text/html, Size: 59880 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-12-04  7:09                       ` Ma, Le
@ 2019-12-04 16:05                         ` Andrey Grodzovsky
  2019-12-05  3:14                           ` Ma, Le
  0 siblings, 1 reply; 57+ messages in thread
From: Andrey Grodzovsky @ 2019-12-04 16:05 UTC (permalink / raw)
  To: Ma, Le, amd-gfx, Zhou1, Tao, Deucher, Alexander, Li, Dennis,
	Zhang, Hawking
  Cc: Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 26221 bytes --]


On 12/4/19 2:09 AM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> *From:*Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> *Sent:* Wednesday, December 4, 2019 2:44 AM
> *To:* Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org; Zhou1, 
> Tao <Tao.Zhou1@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, 
> Hawking <Hawking.Zhang@amd.com>
> *Cc:* Chen, Guchun <Guchun.Chen@amd.com>
> *Subject:* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset 
> support for XGMI
>
> Thanks Ma, this was very helpful as I am sill not able to setup XGMI 
> hive with latest FW and VBIOS.
>
> I traced the workqueue subsystem (full log attached). Specifically 
> here is the life cycle of our 2 work items executing 
> amdgpu_device_xgmi_reset_func bellow
>
> [Le]: Thanks Andrey for the deep debug. Your feedback gave me a more 
> profound understanding on this case. My comments split as below.
>
> You were right to note they both run on came CPU (32) but they are 
> executed by different threads. Also as you see by 
> workqueue_execute_start/end timestamps they actually ran in parallel 
> and not one after another even while being assigned to the same CPU 
> and that because of thread preemption (there is at least 
> psp_v11_0_mode1_reset->msleep(500)) which yields the CPU and hence 
> allows the second work to run + I am sure that on preemptive kernel 
> one reset work would be preempted at some point anyway  and let the 
> other run.
>
> [Le]: Yes, from the trace log, the xgmi_reset_func items are assigned 
> to different work threads bound to the same CPU. And you are right 
> that CPU preemption will happen when msleep is called, which yields the CPU 
> to allow the second work to run. That’s a great finding 😊. But it’s not a 
> *real* parallel run to me because the second work can only preempt to run 
> when the first work goes to sleep. I made an experiment here to change this 
> unique msleep to udelay, then the second work item will run after the first 
> item finished, in a serial execution.
>

I would expect that in a kernel compiled with preemption support a running 
thread would be interrupted to let others run even when it does not 
voluntarily yield the CPU, so this is strange.


> Now you had issues with BACO reset while the test I ran on your system 
> is mode1 reset and so I assumed that maybe BACO has some non 
> preempt-able busy wait which doesn't give a chance to second work 
> item's thread to run on that CPU before the first finished - but from 
> looking in the code I see smu_v11_0_baco_enter->msleep(10) so even in 
> that case the first reset work item was supposed to yield CPU after 
> BACO ENTER sent to SMU and let the other reset work do the same to the 
> second card and so i don't see how even in this case there is a serial 
> execution ?
>
> [Le]: VG20 uses old powerplay framework 
> (smu_v11_0_baco_enter->msleep(10) in swSMU framework), so no msleep 
> and no CPU preemption. BACO reset has Enter/Exit 2 phases. We expect 
> all the XGMI nodes enter BACO simultaneously instead of one after another 
> as a serial execution, then exit BACO simultaneously.
>

Well, we always can add something like bellow to force each XGMI reset 
work to let others run before going into BACO exit. We can also 
guarantee that all of the reset works will execute BACO ENTER before 
proceeding to BACO EXIT by using some kind of semaphore barrier along 
the line of this - 
https://stackoverflow.com/questions/47522174/reusable-barrier-implementation-using-posix-semaphores. 
This will also solve the #XGMI_NODES > #CPUs use case.


diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 48649f5..3e91e54 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -531,6 +531,8 @@ static int soc15_asic_baco_reset(struct 
amdgpu_device *adev)
                 if (pp_funcs->set_asic_baco_state(pp_handle, 1))
                         return -EIO;

+               yield();
+
                 /* exit BACO state */
                 if (pp_funcs->set_asic_baco_state(pp_handle, 0))
                         return -EIO;


> P.S How you solution solves the case where the XGMI hive is bigger 
> then number of CPUs on the system ? Assuming that what you say is 
> correct and there is a serial execution when on the same CPU, if they 
> hive is bigger then number of CPUs you will eventually get back to 
> sending reset work to a CPU already executing BACO ENTER (or EXIT) for 
> another device and will get the serialization problem anyway.
>
> [Le]: Yeah, I also considered the situation that XGMI hive bigger than 
> CPU NR. I think it’s an extreme situation and should not exist. 
> However, assuming it exists, many work items scatter in several CPUs 
> will be executed faster than bound to one same CPU, isn’t it ?
>

AFAIK it's enough for even a single node in the hive to fail to enter the 
BACO state on time to fail the entire hive reset procedure, no?

Anyway - I see our discussion blocks your entire patch set - I think 
you can go ahead and commit your way (I think you got an RB from 
Hawking), and I will then look and see if I can implement my method; if 
it works I will just revert your patch.

Andrey


>              cat-3002  [032] d... 33153.791829: workqueue_queue_work: 
> work struct=00000000e43c1ebb function=amdgpu_device_xgmi_reset_func 
> [amdgpu] workqueue=0000000080331d91 req_cpu=8192 cpu=32
>              cat-3002  [032] d... 33153.791829: 
> workqueue_activate_work: work struct 00000000e43c1ebb
>              cat-3002  [032] dN.. 33153.791831: workqueue_queue_work: 
> work struct=00000000e67113aa function=amdgpu_device_xgmi_reset_func 
> [amdgpu] workqueue=0000000080331d91 req_cpu=8192 cpu=32
>              cat-3002  [032] dN.. 33153.791832: 
> workqueue_activate_work: work struct 00000000e67113aa
>    kworker/32:1H-551   [032] .... 33153.791834: 
> workqueue_execute_start: work struct 00000000e43c1ebb: function 
> amdgpu_device_xgmi_reset_func [amdgpu]
>    kworker/32:0H-175   [032] .... 33153.792087: 
> workqueue_execute_start: work struct 00000000e67113aa: function 
> amdgpu_device_xgmi_reset_func [amdgpu]
>    kworker/32:1H-551   [032] .... 33154.310948: workqueue_execute_end: 
> work struct 00000000e43c1ebb
>    kworker/32:0H-175   [032] .... 33154.311043: workqueue_execute_end: 
> work struct 00000000e67113aa
>
> Andrey
>
> On 12/3/19 5:06 AM, Ma, Le wrote:
>
>     [AMD Official Use Only - Internal Distribution Only]
>
>     Hi Andrey,
>
>     You can try the XGMI system below:
>
>     IP: 10.67.69.53
>
>     U/P: jenkins/0
>
>     The original drm-next kernel is installed.
>
>     Regards,
>
>     Ma Le
>
>     *From:*Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>     <mailto:Andrey.Grodzovsky@amd.com>
>     *Sent:* Tuesday, December 3, 2019 6:05 AM
>     *To:* Ma, Le <Le.Ma@amd.com> <mailto:Le.Ma@amd.com>;
>     amd-gfx@lists.freedesktop.org <mailto:amd-gfx@lists.freedesktop.org>
>     *Cc:* Chen, Guchun <Guchun.Chen@amd.com>
>     <mailto:Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>
>     <mailto:Tao.Zhou1@amd.com>; Deucher, Alexander
>     <Alexander.Deucher@amd.com> <mailto:Alexander.Deucher@amd.com>;
>     Li, Dennis <Dennis.Li@amd.com> <mailto:Dennis.Li@amd.com>; Zhang,
>     Hawking <Hawking.Zhang@amd.com> <mailto:Hawking.Zhang@amd.com>
>     *Subject:* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset
>     support for XGMI
>
>     On 12/2/19 6:42 AM, Ma, Le wrote:
>
>         [AMD Official Use Only - Internal Distribution Only]
>
>         *From:*Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>         <mailto:Andrey.Grodzovsky@amd.com>
>         *Sent:* Saturday, November 30, 2019 12:22 AM
>         *To:* Ma, Le <Le.Ma@amd.com> <mailto:Le.Ma@amd.com>;
>         amd-gfx@lists.freedesktop.org
>         <mailto:amd-gfx@lists.freedesktop.org>
>         *Cc:* Chen, Guchun <Guchun.Chen@amd.com>
>         <mailto:Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>
>         <mailto:Tao.Zhou1@amd.com>; Deucher, Alexander
>         <Alexander.Deucher@amd.com>
>         <mailto:Alexander.Deucher@amd.com>; Li, Dennis
>         <Dennis.Li@amd.com> <mailto:Dennis.Li@amd.com>; Zhang, Hawking
>         <Hawking.Zhang@amd.com> <mailto:Hawking.Zhang@amd.com>
>         *Subject:* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco
>         reset support for XGMI
>
>         On 11/28/19 4:00 AM, Ma, Le wrote:
>
>             -----Original Message-----
>             From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>             <mailto:Andrey.Grodzovsky@amd.com>
>             Sent: Wednesday, November 27, 2019 11:46 PM
>             To: Ma, Le <Le.Ma@amd.com> <mailto:Le.Ma@amd.com>;
>             amd-gfx@lists.freedesktop.org
>             <mailto:amd-gfx@lists.freedesktop.org>
>             Cc: Chen, Guchun <Guchun.Chen@amd.com>
>             <mailto:Guchun.Chen@amd.com>; Zhou1, Tao
>             <Tao.Zhou1@amd.com> <mailto:Tao.Zhou1@amd.com>; Deucher,
>             Alexander <Alexander.Deucher@amd.com>
>             <mailto:Alexander.Deucher@amd.com>; Li, Dennis
>             <Dennis.Li@amd.com> <mailto:Dennis.Li@amd.com>; Zhang,
>             Hawking <Hawking.Zhang@amd.com> <mailto:Hawking.Zhang@amd.com>
>             Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco
>             reset support for XGMI
>
>             On 11/27/19 4:15 AM, Le Ma wrote:
>
>             > Currently each XGMI node reset wq does not run in
>             parrallel because
>
>             > same work item bound to same cpu runs in sequence. So
>             change to bound
>
>             > the xgmi_reset_work item to different cpus.
>
>             It's not the same work item, see more bellow
>
>             >
>
>             > XGMI requires all nodes enter into baco within very
>             close proximity
>
>             > before any node exit baco. So schedule the
>             xgmi_reset_work wq twice
>
>             > for enter/exit baco respectively.
>
>             >
>
>             > The default reset code path and methods do not change
>             for vega20 production:
>
>             >    - baco reset without xgmi/ras
>
>             >    - psp reset with xgmi/ras
>
>             >
>
>             > To enable baco for XGMI/RAS case, both 2 conditions
>             below are needed:
>
>             >    - amdgpu_ras_enable=2
>
>             >    - baco-supported smu firmware
>
>             >
>
>             > The case that PSP reset and baco reset coexist within an
>             XGMI hive is
>
>             > not in the consideration.
>
>             >
>
>             > Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532
>
>             > Signed-off-by: Le Ma <le.ma@amd.com <mailto:le.ma@amd.com>>
>
>             > ---
>
>             > drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +
>
>             > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78
>             ++++++++++++++++++++++++++----
>
>             >   2 files changed, 70 insertions(+), 10 deletions(-)
>
>             >
>
>             > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
>             > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
>             > index d120fe5..08929e6 100644
>
>             > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
>             > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>
>             > @@ -998,6 +998,8 @@ struct amdgpu_device {
>
>             > int                                           pstate;
>
>             >          /* enable runtime pm on the device */
>
>             > bool                            runpm;
>
>             > +
>
>             > + bool                                        in_baco;
>
>             >   };
>
>             >
>
>             >   static inline struct amdgpu_device
>             *amdgpu_ttm_adev(struct
>
>             > ttm_bo_device *bdev) diff --git
>
>             > a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
>             > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
>             > index bd387bb..71abfe9 100644
>
>             > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
>             > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
>             > @@ -2654,7 +2654,13 @@ static void
>             amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>
>             >          struct amdgpu_device *adev =
>
>             > container_of(__work, struct amdgpu_device, xgmi_reset_work);
>
>             >
>
>             > - adev->asic_reset_res =  amdgpu_asic_reset(adev);
>
>             > +      if (amdgpu_asic_reset_method(adev) ==
>             AMD_RESET_METHOD_BACO)
>
>             > + adev->asic_reset_res = (adev->in_baco == false) ?
>
>             > + amdgpu_device_baco_enter(adev->ddev) :
>
>             > + amdgpu_device_baco_exit(adev->ddev);
>
>             > +      else
>
>             > + adev->asic_reset_res = amdgpu_asic_reset(adev);
>
>             > +
>
>             >          if (adev->asic_reset_res)
>
>             > DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>
>             >                                 adev->asic_reset_res,
>             adev->ddev->unique); @@ -3796,6 +3802,7 @@
>
>             > static int amdgpu_do_asic_reset(struct amdgpu_hive_info
>             *hive,
>
>             >          struct amdgpu_device *tmp_adev = NULL;
>
>             >          bool need_full_reset = *need_full_reset_arg,
>             vram_lost = false;
>
>             >          int r = 0;
>
>             > +      int cpu = smp_processor_id();
>
>             >
>
>             >          /*
>
>             >           * ASIC reset has to be done on all HGMI hive
>             nodes ASAP @@
>
>             > -3803,21 +3810,24 @@ static int
>             amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>
>             >           */
>
>             >          if (need_full_reset) {
>
>             > list_for_each_entry(tmp_adev, device_list_handle,
>             gmc.xgmi.head) {
>
>             > -                               /* For XGMI run all
>             resets in parallel to speed up the process */
>
>             > +                              /*
>
>             > +                              * For XGMI run all resets
>             in parallel to speed up the
>
>             > +                              * process by scheduling
>             the highpri wq on different
>
>             > +                              * cpus. For XGMI with
>             baco reset, all nodes must enter
>
>             > +                              * baco within close
>             proximity before anyone exit.
>
>             > +                              */
>
>             >                                if
>             (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
>
>             > -                                           if
>             (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
>
>             Note that tmp_adev->xgmi_reset_work (the work item) is per
>             device in XGMI hive and not the same work item. So I don't
>             see why you need to explicitly queue them on different
>             CPUs, they should run in parallel already.
>
>             Andrey
>
>             [Le]: It’s also beyond my understanding that the 2 node
>             reset work items scheduled to same cpu does not run in
>             parallel. But from the experiment result in my side, the
>             2nd work item always run after 1st work item finished.
>             Based on this result, I changed to queue them on different
>             CPUs to make sure more XGMI nodes case to run in parallel,
>             because baco requires all nodes enter baco within very
>             close proximity.
>
>             The experiment code is as following for your reference.
>             When card0 worker running, card1 worker is not observed to
>             run.
>
>         The code below will only test that they don't run
>         concurrently - but this doesn't mean they don't run on
>         different CPUs and threads,I don't have an XGMI setup at hand
>         to test this theory but what if there is some locking
>         dependency between them that serializes their execution ? Can
>         you just add a one line print inside
>         amdgpu_device_xgmi_reset_func that prints CPU id, thread
>         name/id and card number ?
>
>         Andrey
>
>         [Le]: I checked if directly use queue_work() several times,
>         the same CPU thread will be used. And the worker per CPU will
>         execute the item one by one. Our goal here is to make the
>         xgmi_reset_func run concurrently for XGMI BACO case. That’s
>         why I schedule them on different CPUs to run parallelly. And I
>         can share the XGMI system with you if you’d like to verify more.
>
>     I tried today to setup XGMI 2P setup to test this but weren't able
>     to load with the XGMI bridge in place (maybe faulty bridge) - so
>     yea - maybe leave me your setup before your changes (the original
>     code) so i can try to open some kernel traces that show CPU id and
>     thread id to check this. It's just so weird that system_highpri_wq
>     which is documented to be multi-cpu and multi-threaded wouldn't
>     queue those work items to different cpus and worker threads.
>
>     Andrey
>
>             +atomic_t card0_in_baco = ATOMIC_INIT(0);
>
>             +atomic_t card1_in_baco = ATOMIC_INIT(0);
>
>             +
>
>             static void amdgpu_device_xgmi_reset_func(struct
>             work_struct *__work)
>
>             {
>
>             struct amdgpu_device *adev =
>
>             container_of(__work, struct amdgpu_device, xgmi_reset_work);
>
>             + printk("lema1: card 0x%x goes into reset wq\n",
>             adev->pdev->bus->number);
>
>             + if (adev->pdev->bus->number == 0x7) {
>
>             + atomic_set(&card1_in_baco, 1);
>
>             + printk("lema1: card1 in baco from card1 view\n");
>
>             + }
>
>             +
>
>             if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>
>                            adev->asic_reset_res = (adev->in_baco ==
>             false) ?
>
>             amdgpu_device_baco_enter(adev->ddev) :
>
>             @@ -2664,6 +2673,23 @@ static void
>             amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>
>             if (adev->asic_reset_res)
>
>             DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>
>             adev->asic_reset_res, adev->ddev->unique);
>
>             +
>
>             + if (adev->pdev->bus->number == 0x4) {
>
>             + atomic_set(&card0_in_baco, 1);
>
>             +        printk("lema1: card0 in baco from card0 view\n");
>
>             +
>
>             + while (true)
>
>             + if (!!atomic_read(&card1_in_baco))
>
>             + break;
>
>             + printk("lema1: card1 in baco from card0 view\n");
>
>             +     }
>
>             +
>
>             + if (adev->pdev->bus->number == 0x7) {
>
>             + while (true)
>
>             + if (!!atomic_read(&card0_in_baco))
>
>             + break;
>
>             + printk("lema1: card0 in baco from card1 view\n");
>
>             + }
>
>             > +                                          if
>             (!queue_work_on(cpu, system_highpri_wq,
>
>             > +    &tmp_adev->xgmi_reset_work))
>
>             >                                                        r
>             = -EALREADY;
>
>             > +                                          cpu =
>             cpumask_next(cpu, cpu_online_mask);
>
>             >                                } else
>
>             >                                            r =
>             amdgpu_asic_reset(tmp_adev);
>
>             > -
>
>             > -                               if (r) {
>
>             > - DRM_ERROR("ASIC reset failed with error, %d for drm
>             dev, %s",
>
>             > - r, tmp_adev->ddev->unique);
>
>             > +                              if (r)
>
>             >                                            break;
>
>             > -                               }
>
>             >                      }
>
>             >
>
>             > -                   /* For XGMI wait for all PSP resets
>             to complete before proceed */
>
>             > +                  /* For XGMI wait for all work to
>             complete before proceed */
>
>             >                      if (!r) {
>
>             > list_for_each_entry(tmp_adev, device_list_handle,
>
>             >     gmc.xgmi.head) {
>
>             > @@ -3826,11 +3836,59 @@ static int
>             amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>
>             >                                                        r
>             = tmp_adev->asic_reset_res;
>
>             > if (r)
>
>             > break;
>
>             > + if(AMD_RESET_METHOD_BACO ==
>
>             > +    amdgpu_asic_reset_method(tmp_adev))
>
>             > + tmp_adev->in_baco = true;
>
>             >                                            }
>
>             >                                }
>
>             >                      }
>
>             > -       }
>
>             >
>
>             > +                  /*
>
>             > +                  * For XGMI with baco reset, need exit
>             baco phase by scheduling
>
>             > +                  * xgmi_reset_work one more time. PSP
>             reset skips this phase.
>
>             > +                  * Not assume the situation that PSP
>             reset and baco reset
>
>             > +                  * coexist within an XGMI hive.
>
>             > +                  */
>
>             > +
>
>             > +                  if (!r) {
>
>             > +                              cpu = smp_processor_id();
>
>             > + list_for_each_entry(tmp_adev, device_list_handle,
>
>             > +     gmc.xgmi.head) {
>
>             > +                                          if
>             (tmp_adev->gmc.xgmi.num_physical_nodes > 1
>
>             > + && AMD_RESET_METHOD_BACO ==
>
>             > + amdgpu_asic_reset_method(tmp_adev)) {
>
>             > + if (!queue_work_on(cpu,
>
>             > + system_highpri_wq,
>
>             > +             &tmp_adev->xgmi_reset_work))
>
>             > + r = -EALREADY;
>
>             > + if (r)
>
>             > + break;
>
>             > + cpu = cpumask_next(cpu, cpu_online_mask);
>
>             > +                                          }
>
>             > +                              }
>
>             > +                  }
>
>             > +
>
>             > +                  if (!r) {
>
>             > + list_for_each_entry(tmp_adev, device_list_handle,
>
>             > +     gmc.xgmi.head) {
>
>             > +                                          if
>             (tmp_adev->gmc.xgmi.num_physical_nodes > 1
>
>             > + && AMD_RESET_METHOD_BACO ==
>
>             > + amdgpu_asic_reset_method(tmp_adev)) {
>
>             > + flush_work(&tmp_adev->xgmi_reset_work);
>
>             > +                                                      r
>             = tmp_adev->asic_reset_res;
>
>             > + if (r)
>
>             > + break;
>
>             > + tmp_adev->in_baco = false;
>
>             > +                                          }
>
>             > +                              }
>
>             > +                  }
>
>             > +
>
>             > +                  if (r) {
>
>             > +                              DRM_ERROR("ASIC reset
>             failed with error, %d for drm dev, %s",
>
>             > +                                          r,
>             tmp_adev->ddev->unique);
>
>             > +                              goto end;
>
>             > +                  }
>
>             > +      }
>
>             >
>
>             > list_for_each_entry(tmp_adev, device_list_handle,
>             gmc.xgmi.head) {
>
>             >                      if (need_full_reset) {
>

[-- Attachment #1.2: Type: text/html, Size: 74285 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* RE: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-12-04 16:05                         ` Andrey Grodzovsky
@ 2019-12-05  3:14                           ` Ma, Le
  2019-12-06 21:50                             ` Andrey Grodzovsky
  0 siblings, 1 reply; 57+ messages in thread
From: Ma, Le @ 2019-12-05  3:14 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx, Zhou1, Tao, Deucher, Alexander, Li,
	Dennis, Zhang, Hawking
  Cc: Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 23365 bytes --]

[AMD Official Use Only - Internal Distribution Only]



From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Sent: Thursday, December 5, 2019 12:06 AM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: Chen, Guchun <Guchun.Chen@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI



On 12/4/19 2:09 AM, Ma, Le wrote:

[AMD Official Use Only - Internal Distribution Only]


From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com><mailto:Andrey.Grodzovsky@amd.com>
Sent: Wednesday, December 4, 2019 2:44 AM
To: Ma, Le <Le.Ma@amd.com><mailto:Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; Zhou1, Tao <Tao.Zhou1@amd.com><mailto:Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com><mailto:Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Cc: Chen, Guchun <Guchun.Chen@amd.com><mailto:Guchun.Chen@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI


Thanks Ma, this was very helpful as I am still not able to set up an XGMI hive with the latest FW and VBIOS.

I traced the workqueue subsystem (full log attached). Specifically here is the life cycle of our 2 work items executing amdgpu_device_xgmi_reset_func bellow

[Le]: Thanks Andrey for the deep debug. Your feedback gave me a more profound understanding on this case. My comments split as below.

You were right to note they both run on came CPU (32) but they are executed by different threads. Also as you see by workqueue_execute_start/end timestamps they actually ran in parallel and not one after another even while being assigned to the same CPU and that because of thread preemption (there is at least psp_v11_0_mode1_reset->msleep(500)) which yields the CPU and hence allows the second work to run + I am sure that on preemptive kernel one reset work would be preempted at some point anyway  and let the other run.

[Le]: Yes, from the trace log, the xgmi_reset_func items are assigned to different work threads bound to one same CPU. And you are right that cpu preemption will happen when msleep is called, which yields the CPU to allow the second work to run. That’s a great finding😊. But it’s not a real parallel run to me because the second work can only preempt to run when the first work goes to sleep. I made an experiment here to change this unique msleep to udelay, then the second work item will run after the first item finished in a serial execution.



I would expect in kernel compiled with preemption support that a running thread would be interrupted to let others run even when he is not voluntarily yields the CPU so this is strange.



Now you had issues with BACO reset while the test I ran on your system is mode1 reset and so I assumed that maybe BACO has some non preempt-able busy wait which doesn't give a chance to second work item's thread to run on that CPU before the first finished - but from looking in the code I see smu_v11_0_baco_enter->msleep(10) so even in that case the first reset work item was supposed to yield CPU after BACO ENTER sent to SMU and let the other reset work do the same to the second card and so i don't see how even in this case there is a serial execution ?

[Le]: VG20 uses old powerplay framework (smu_v11_0_baco_enter->msleep(10) in swSMU framework), so no msleep and no CPU preemption. BACO reset has Enter/Exit 2 phases. We expect all the XGMI nodes enter BACO simultaneously instead of one after one as a serial execution, then exit BACO simultaneously.



Well, we always can add something like bellow to force each XGMI reset work to let others run before going into BACO exit. We can also guarantee that all of the reset works will execute BACO ENTER before proceeding to BACO EXIT by using some kind of semaphore barrier along the line of this - https://stackoverflow.com/questions/47522174/reusable-barrier-implementation-using-posix-semaphores. This will also solve the #XGMI_NODES > #CPUs use case.

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 48649f5..3e91e54 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -531,6 +531,8 @@ static int soc15_asic_baco_reset(struct amdgpu_device *adev)
                if (pp_funcs->set_asic_baco_state(pp_handle, 1))
                        return -EIO;

+               yield();
+
                /* exit BACO state */
                if (pp_funcs->set_asic_baco_state(pp_handle, 0))
                        return -EIO;



P.S How you solution solves the case where the XGMI hive is bigger then number of CPUs on the system ? Assuming that what you say is correct and there is a serial execution when on the same CPU, if they hive is bigger then number of CPUs you will eventually get back to sending reset work to a CPU already executing BACO ENTER (or EXIT) for another device and will get the serialization problem anyway.

[Le]: Yeah, I also considered the situation that XGMI hive bigger than CPU NR. I think it’s an extreme situation and should not exist. However, assuming it exists, many work items scatter in several CPUs will be executed faster than bound to one same CPU, isn’t it ?



AFAIK it's enough for even a single node in the hive to fail to enter the BACO state on time to fail the entire hive reset procedure, no ?
[Le]: Yeah, agree that. I’ve been thinking that make all nodes entering baco simultaneously can reduce the possibility of node failure to enter/exit BACO risk. For example, in an XGMI hive with 8 nodes, the total time interval of 8 nodes enter/exit BACO on 8 CPUs is less than the interval that 8 nodes enter BACO serially and exit BACO serially depending on one CPU with yield capability. This interval is usually strict for BACO feature itself. Anyway, we need more looping test later on any method we will choose.

Any way - I see our discussion blocks your entire patch set - I think you can go ahead and commit yours way (I think you got an RB from Hawking) and I will look then and see if I can implement my method and if it works will just revert your patch.

[Le]: OK, fine.

Andrey



             cat-3002  [032] d... 33153.791829: workqueue_queue_work: work struct=00000000e43c1ebb function=amdgpu_device_xgmi_reset_func [amdgpu] workqueue=0000000080331d91 req_cpu=8192 cpu=32
             cat-3002  [032] d... 33153.791829: workqueue_activate_work: work struct 00000000e43c1ebb
             cat-3002  [032] dN.. 33153.791831: workqueue_queue_work: work struct=00000000e67113aa function=amdgpu_device_xgmi_reset_func [amdgpu] workqueue=0000000080331d91 req_cpu=8192 cpu=32
             cat-3002  [032] dN.. 33153.791832: workqueue_activate_work: work struct 00000000e67113aa
   kworker/32:1H-551   [032] .... 33153.791834: workqueue_execute_start: work struct 00000000e43c1ebb: function amdgpu_device_xgmi_reset_func [amdgpu]
   kworker/32:0H-175   [032] .... 33153.792087: workqueue_execute_start: work struct 00000000e67113aa: function amdgpu_device_xgmi_reset_func [amdgpu]
   kworker/32:1H-551   [032] .... 33154.310948: workqueue_execute_end: work struct 00000000e43c1ebb
   kworker/32:0H-175   [032] .... 33154.311043: workqueue_execute_end: work struct 00000000e67113aa

Andrey




On 12/3/19 5:06 AM, Ma, Le wrote:

[AMD Official Use Only - Internal Distribution Only]

Hi Andrey,

You can try the XGMI system below:
              IP: 10.67.69.53
              U/P: jenkins/0

The original drm-next kernel is installed.

Regards,
Ma Le

From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com><mailto:Andrey.Grodzovsky@amd.com>
Sent: Tuesday, December 3, 2019 6:05 AM
To: Ma, Le <Le.Ma@amd.com><mailto:Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Chen, Guchun <Guchun.Chen@amd.com><mailto:Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com><mailto:Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com><mailto:Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI



On 12/2/19 6:42 AM, Ma, Le wrote:

[AMD Official Use Only - Internal Distribution Only]



From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com><mailto:Andrey.Grodzovsky@amd.com>
Sent: Saturday, November 30, 2019 12:22 AM
To: Ma, Le <Le.Ma@amd.com><mailto:Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Chen, Guchun <Guchun.Chen@amd.com><mailto:Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com><mailto:Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com><mailto:Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI



On 11/28/19 4:00 AM, Ma, Le wrote:





-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com><mailto:Andrey.Grodzovsky@amd.com>
Sent: Wednesday, November 27, 2019 11:46 PM
To: Ma, Le <Le.Ma@amd.com><mailto:Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Chen, Guchun <Guchun.Chen@amd.com><mailto:Guchun.Chen@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com><mailto:Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com><mailto:Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI





On 11/27/19 4:15 AM, Le Ma wrote:

> Currently each XGMI node reset wq does not run in parallel because

> same work item bound to same cpu runs in sequence. So change to bound

> the xgmi_reset_work item to different cpus.



It's not the same work item, see more bellow





>

> XGMI requires all nodes enter into baco within very close proximity

> before any node exit baco. So schedule the xgmi_reset_work wq twice

> for enter/exit baco respectively.

>

> The default reset code path and methods do not change for vega20 production:

>    - baco reset without xgmi/ras

>    - psp reset with xgmi/ras

>

> To enable baco for XGMI/RAS case, both 2 conditions below are needed:

>    - amdgpu_ras_enable=2

>    - baco-supported smu firmware

>

> The case that PSP reset and baco reset coexist within an XGMI hive is

> not in the consideration.

>

> Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532

> Signed-off-by: Le Ma <le.ma@amd.com<mailto:le.ma@amd.com>>

> ---

>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +

>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ++++++++++++++++++++++++++----

>   2 files changed, 70 insertions(+), 10 deletions(-)

>

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> index d120fe5..08929e6 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> @@ -998,6 +998,8 @@ struct amdgpu_device {

>          int                                           pstate;

>          /* enable runtime pm on the device */

>          bool                            runpm;

> +

> +      bool                                        in_baco;

>   };

>

>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct

> ttm_bo_device *bdev) diff --git

> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> index bd387bb..71abfe9 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> @@ -2654,7 +2654,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

>          struct amdgpu_device *adev =

>                      container_of(__work, struct amdgpu_device, xgmi_reset_work);

>

> -       adev->asic_reset_res =  amdgpu_asic_reset(adev);

> +      if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

> +                  adev->asic_reset_res = (adev->in_baco == false) ?

> +                                          amdgpu_device_baco_enter(adev->ddev) :

> +                                          amdgpu_device_baco_exit(adev->ddev);

> +      else

> +                  adev->asic_reset_res = amdgpu_asic_reset(adev);

> +

>          if (adev->asic_reset_res)

>                      DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

>                                   adev->asic_reset_res, adev->ddev->unique); @@ -3796,6 +3802,7 @@

> static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>          struct amdgpu_device *tmp_adev = NULL;

>          bool need_full_reset = *need_full_reset_arg, vram_lost = false;

>          int r = 0;

> +      int cpu = smp_processor_id();

>

>          /*

>           * ASIC reset has to be done on all HGMI hive nodes ASAP @@

> -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>           */

>          if (need_full_reset) {

>                      list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

> -                               /* For XGMI run all resets in parallel to speed up the process */

> +                              /*

> +                              * For XGMI run all resets in parallel to speed up the

> +                              * process by scheduling the highpri wq on different

> +                              * cpus. For XGMI with baco reset, all nodes must enter

> +                              * baco within close proximity before anyone exit.

> +                              */

>                                  if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {

> -                                           if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))





Note that tmp_adev->xgmi_reset_work (the work item) is per device in XGMI hive and not the same work item. So I don't see why you need to explicitly queue them on different CPUs, they should run in parallel already.



Andrey



[Le]: It’s also beyond my understanding that the 2 node reset work items scheduled to same cpu does not run in parallel. But from the experiment result in my side, the 2nd work item always run after 1st work item finished. Based on this result, I changed to queue them on different CPUs to make sure more XGMI nodes case to run in parallel, because baco requires all nodes enter baco within very close proximity.



The experiment code is as following for your reference. When card0 worker running, card1 worker is not observed to run.



The code bellow will only test that they don't run concurrently - but this doesn't mean they don't run on different CPUs and threads,I don't have an XGMI setup at hand to test this theory but what if there is some locking dependency between them that serializes their execution ? Can you just add a one line print inside amdgpu_device_xgmi_reset_func that prints CPU id, thread name/id and card number ?

Andrey

[Le]: I checked if directly use queue_work() several times, the same CPU thread will be used. And the worker per CPU will execute the item one by one. Our goal here is to make the xgmi_reset_func run concurrently for XGMI BACO case. That’s why I schedule them on different CPUs to run parallelly. And I can share the XGMI system with you if you’d like to verify more.



I tried today to set up an XGMI 2P setup to test this but wasn't able to load with the XGMI bridge in place (maybe faulty bridge) - so yea - maybe leave me your setup before your changes (the original code) so i can try to open some kernel traces that show CPU id and thread id to check this. It's just so weird that system_highpri_wq which is documented to be multi-cpu and multi-threaded wouldn't queue those work items to different cpus and worker threads.

Andrey





+atomic_t card0_in_baco = ATOMIC_INIT(0);

+atomic_t card1_in_baco = ATOMIC_INIT(0);

+

static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

{

        struct amdgpu_device *adev =

                container_of(__work, struct amdgpu_device, xgmi_reset_work);



+       printk("lema1: card 0x%x goes into reset wq\n", adev->pdev->bus->number);

+       if (adev->pdev->bus->number == 0x7) {

+               atomic_set(&card1_in_baco, 1);

+               printk("lema1: card1 in baco from card1 view\n");

+       }

+

        if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

               adev->asic_reset_res = (adev->in_baco == false) ?

                                amdgpu_device_baco_enter(adev->ddev) :

@@ -2664,6 +2673,23 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

        if (adev->asic_reset_res)

                DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

                         adev->asic_reset_res, adev->ddev->unique);

+

+       if (adev->pdev->bus->number == 0x4) {

+               atomic_set(&card0_in_baco, 1);

+               printk("lema1: card0 in baco from card0 view\n");

+

+               while (true)

+                       if (!!atomic_read(&card1_in_baco))

+                               break;

+               printk("lema1: card1 in baco from card0 view\n");

+       }

+

+       if (adev->pdev->bus->number == 0x7) {

+               while (true)

+                       if (!!atomic_read(&card0_in_baco))

+                               break;

+               printk("lema1: card0 in baco from card1 view\n");

+       }



> +                                          if (!queue_work_on(cpu, system_highpri_wq,

> +                                                                     &tmp_adev->xgmi_reset_work))

>                                                          r = -EALREADY;

> +                                          cpu = cpumask_next(cpu, cpu_online_mask);

>                                  } else

>                                              r = amdgpu_asic_reset(tmp_adev);

> -

> -                               if (r) {

> -                                           DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

> -                                                       r, tmp_adev->ddev->unique);

> +                              if (r)

>                                              break;

> -                               }

>                      }

>

> -                   /* For XGMI wait for all PSP resets to complete before proceed */

> +                  /* For XGMI wait for all work to complete before proceed */

>                      if (!r) {

>                                  list_for_each_entry(tmp_adev, device_list_handle,

>                                                              gmc.xgmi.head) {

> @@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>                                                          r = tmp_adev->asic_reset_res;

>                                                          if (r)

>                                                                      break;

> +                                                      if(AMD_RESET_METHOD_BACO ==

> +                                                         amdgpu_asic_reset_method(tmp_adev))

> +                                                                  tmp_adev->in_baco = true;

>                                              }

>                                  }

>                      }

> -       }

>

> +                  /*

> +                  * For XGMI with baco reset, need exit baco phase by scheduling

> +                  * xgmi_reset_work one more time. PSP reset skips this phase.

> +                  * Not assume the situation that PSP reset and baco reset

> +                  * coexist within an XGMI hive.

> +                  */

> +

> +                  if (!r) {

> +                              cpu = smp_processor_id();

> +                              list_for_each_entry(tmp_adev, device_list_handle,

> +                                                          gmc.xgmi.head) {

> +                                          if (tmp_adev->gmc.xgmi.num_physical_nodes > 1

> +                                              && AMD_RESET_METHOD_BACO ==

> +                                              amdgpu_asic_reset_method(tmp_adev)) {

> +                                                      if (!queue_work_on(cpu,

> +                                                                  system_highpri_wq,

> +                                                                  &tmp_adev->xgmi_reset_work))

> +                                                                  r = -EALREADY;

> +                                                      if (r)

> +                                                                  break;

> +                                                      cpu = cpumask_next(cpu, cpu_online_mask);

> +                                          }

> +                              }

> +                  }

> +

> +                  if (!r) {

> +                              list_for_each_entry(tmp_adev, device_list_handle,

> +                                                          gmc.xgmi.head) {

> +                                          if (tmp_adev->gmc.xgmi.num_physical_nodes > 1

> +                                              && AMD_RESET_METHOD_BACO ==

> +                                              amdgpu_asic_reset_method(tmp_adev)) {

> +                                                      flush_work(&tmp_adev->xgmi_reset_work);

> +                                                      r = tmp_adev->asic_reset_res;

> +                                                      if (r)

> +                                                                  break;

> +                                                      tmp_adev->in_baco = false;

> +                                          }

> +                              }

> +                  }

> +

> +                  if (r) {

> +                              DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

> +                                          r, tmp_adev->ddev->unique);

> +                              goto end;

> +                  }

> +      }

>

>          list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

>                      if (need_full_reset) {

[-- Attachment #1.2: Type: text/html, Size: 65429 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-12-05  3:14                           ` Ma, Le
@ 2019-12-06 21:50                             ` Andrey Grodzovsky
  2019-12-09 11:34                               ` Ma, Le
  0 siblings, 1 reply; 57+ messages in thread
From: Andrey Grodzovsky @ 2019-12-06 21:50 UTC (permalink / raw)
  To: Ma, Le, amd-gfx, Zhou1, Tao, Deucher, Alexander, Li, Dennis,
	Zhang, Hawking
  Cc: Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 1267 bytes --]

Hey Ma, attached a solution - it's just compiled as I still can't make 
my XGMI setup work (with bridge connected only one device is visible to 
the system while the other is not). Please try it on your system if you 
have a chance.

Andrey

On 12/4/19 10:14 PM, Ma, Le wrote:
>
> AFAIK it's enough for even a single node in the hive to fail to 
> enter the BACO state on time to fail the entire hive reset procedure, no ?
>
> [Le]: Yeah, agree that. I’ve been thinking that make all nodes 
> entering baco simultaneously can reduce the possibility of node 
> failure to enter/exit BACO risk. For example, in an XGMI hive with 8 
> nodes, the total time interval of 8 nodes enter/exit BACO on 8 CPUs is 
> less than the interval that 8 nodes enter BACO serially and exit BACO 
> serially depending on one CPU with yield capability. This interval is 
> usually strict for BACO feature itself. Anyway, we need more looping 
> test later on any method we will choose.
>
> Any way - I see our discussion blocks your entire patch set - I think 
> you can go ahead and commit yours way (I think you got an RB from 
> Hawking) and I will look then and see if I can implement my method and 
> if it works will just revert your patch.
>
> [Le]: OK, fine.
>
> Andrey
>

[-- Attachment #1.2: Type: text/html, Size: 1963 bytes --]

[-- Attachment #2: 0001-drm-Add-Reusable-task-barrier.patch --]
[-- Type: text/x-patch, Size: 3756 bytes --]

From 1c89d4d835b9dbb6e02bdbdce903adbd12b1c115 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Fri, 6 Dec 2019 12:26:33 -0500
Subject: drm: Add Reusable task barrier.

It is used to synchronize N threads at a rendezvous point before execution
of critical code that has to be started by all the threads at approximately
the same time.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 include/drm/task_barrier.h | 97 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 include/drm/task_barrier.h

diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h
new file mode 100644
index 0000000..858cd7f
--- /dev/null
+++ b/include/drm/task_barrier.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include <linux/semaphore.h>
+#include <linux/atomic.h>
+
+/*
+ * Reusable 2 PHASE task barrier (rendezvous point) implementation for N tasks.
+ * Based on the Little Book of Semaphores - https://greenteapress.com/wp/semaphores/
+ */
+
+
+
+#ifndef DRM_TASK_BARRIER_H_
+#define DRM_TASK_BARRIER_H_
+
+/*
+ * Represents an instance of a task barrier.
+ */
+struct task_barrier {
+	unsigned int n;
+	atomic_t count;
+	struct semaphore enter_turnstile;
+	struct semaphore exit_turnstile;
+};
+
+static inline void task_barrier_signal_turnstile(struct semaphore *turnstile,
+					  unsigned int n) {
+	int i;
+	for (i = 0 ; i < n; i++)
+		up(turnstile);
+}
+
+static inline void task_barrier_init(struct task_barrier *tb) {
+
+	tb->n = 0;
+	atomic_set(&tb->count, 0);
+	sema_init(&tb->enter_turnstile, 0);
+	sema_init(&tb->exit_turnstile, 0);
+}
+
+static inline void task_barrier_add_task(struct task_barrier *tb) {
+	tb->n++;
+}
+
+static inline void task_barrier_rem_task(struct task_barrier *tb) {
+	tb->n--;
+}
+
+/*
+ * Lines up all the threads BEFORE the critical point.
+ *
+ * When all thread passed this code the entry barrier is back to locked state.
+ */
+static inline void task_barrier_enter(struct task_barrier *tb) {
+
+	if (atomic_inc_return(&tb->count) == tb->n)
+			task_barrier_signal_turnstile(&tb->enter_turnstile,
+						      tb->n);
+
+	down(&tb->enter_turnstile);
+}
+
+/*
+ * Lines up all the threads AFTER the critical point.
+ *
+ * This function is used to avoid any one thread running ahead of the reset if
+ * the barrier is used in a loop (repeatedly) .
+ */
+static inline void task_barrier_exit(struct task_barrier *tb) {
+	if (atomic_dec_return(&tb->count) == 0)
+			task_barrier_signal_turnstile(&tb->exit_turnstile,
+						      tb->n);
+
+	down(&tb->exit_turnstile);
+}
+
+#endif
-- 
2.7.4


[-- Attachment #3: 0002-drm-amdgpu-Add-task-barrier-to-XGMI-hive.patch --]
[-- Type: text/x-patch, Size: 2241 bytes --]

From 77d54ef864cb0c667b9396f0cdf5c66cb672f6c2 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Fri, 6 Dec 2019 12:43:30 -0500
Subject: drm/amdgpu: Add task barrier to XGMI hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 61d13d8..5cf920d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -261,6 +261,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 	INIT_LIST_HEAD(&tmp->device_list);
 	mutex_init(&tmp->hive_lock);
 	mutex_init(&tmp->reset_lock);
+	task_barrier_init(&tmp->tb);
 
 	if (lock)
 		mutex_lock(&tmp->hive_lock);
@@ -408,6 +409,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 	top_info->num_nodes = count;
 	hive->number_devices = count;
 
+	task_barrier_add_task(&hive->tb);
+
 	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			/* update node list for other device in the hive */
@@ -470,6 +473,7 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 		mutex_destroy(&hive->hive_lock);
 		mutex_destroy(&hive->reset_lock);
 	} else {
+		task_barrier_rem_task(&hive->tb);
 		amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
 		mutex_unlock(&hive->hive_lock);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index bbf504f..74011fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -22,6 +22,7 @@
 #ifndef __AMDGPU_XGMI_H__
 #define __AMDGPU_XGMI_H__
 
+#include <drm/task_barrier.h>
 #include "amdgpu_psp.h"
 
 struct amdgpu_hive_info {
@@ -33,6 +34,7 @@ struct amdgpu_hive_info {
 	struct device_attribute dev_attr;
 	struct amdgpu_device *adev;
 	int pstate; /*0 -- low , 1 -- high , -1 unknown*/
+	struct task_barrier tb;
 };
 
 struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
-- 
2.7.4


[-- Attachment #4: 0003-drm-amdgpu-Redo-concurrent-support-of-BACO-reset-for.patch --]
[-- Type: text/x-patch, Size: 6996 bytes --]

From 016717e5caeaf1355935eb4302e43396c8a494ce Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Fri, 6 Dec 2019 13:19:15 -0500
Subject: drm/amdgpu: Redo concurrent support of BACO reset for XGMI.

Use task barrier in XGMI hive to synchronize BACO enter/exit
across devices in XGMI hive.
This also reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 113 +++++++++++------------------
 2 files changed, 44 insertions(+), 71 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a78a363..50bab33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1001,8 +1001,6 @@ struct amdgpu_device {
 
 	bool                            pm_sysfs_en;
 	bool                            ucode_sysfs_en;
-
-	bool				in_baco;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7324a5f..bf04e81 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -67,6 +67,7 @@
 #include "amdgpu_tmz.h"
 
 #include <linux/suspend.h>
+#include <drm/task_barrier.h>
 
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -2664,13 +2665,39 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 	struct amdgpu_device *adev =
 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
 
-	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
-		adev->asic_reset_res = (adev->in_baco == false) ?
-				amdgpu_device_baco_enter(adev->ddev) :
-				amdgpu_device_baco_exit(adev->ddev);
-	else
-		adev->asic_reset_res = amdgpu_asic_reset(adev);
+	/*
+	 * Use task barrier to synchronize all xgmi reset works across the
+	 * hive.
+	 * task_barrier_enter and task_barrier_exit will block until all the
+	 * threads running the xgmi reset works reach those points. I assume
+	 * guarantee of progress here for all the threads as the workqueue code
+	 * creates new worker threads as needed by amount of work items in queue
+	 * (see worker_thread) and also each thread sleeps in the barrier and by
+	 * this yielding the CPU for other work threads to make progress.
+	 */
+	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
+		struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+
+		if (hive)
+			task_barrier_enter(&hive->tb);
+
+		adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
+
+		if (adev->asic_reset_res)
+			goto fail;
 
+		if (hive)
+			task_barrier_exit(&hive->tb);
+
+		adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
+
+		if (adev->asic_reset_res)
+			goto fail;
+	} else {
+		adev->asic_reset_res =  amdgpu_asic_reset(adev);
+	}
+
+fail:
 	if (adev->asic_reset_res)
 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 			 adev->asic_reset_res, adev->ddev->unique);
@@ -3796,18 +3823,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	return r;
 }
 
-static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
-			       struct amdgpu_hive_info *hive,
+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 			       struct list_head *device_list_handle,
 			       bool *need_full_reset_arg)
 {
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
 	int r = 0;
-	int cpu = smp_processor_id();
-	bool use_baco =
-		(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
-		true : false;
 
 	/*
 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
@@ -3815,62 +3837,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
 	 */
 	if (need_full_reset) {
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-			/*
-			 * For XGMI run all resets in parallel to speed up the
-			 * process by scheduling the highpri wq on different
-			 * cpus. For XGMI with baco reset, all nodes must enter
-			 * baco within close proximity before anyone exit.
-			 */
+			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-				if (!queue_work_on(cpu, system_highpri_wq,
-						   &tmp_adev->xgmi_reset_work))
+				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
-				cpu = cpumask_next(cpu, cpu_online_mask);
 			} else
 				r = amdgpu_asic_reset(tmp_adev);
-			if (r)
-				break;
-		}
 
-		/* For XGMI wait for all work to complete before proceed */
-		if (!r) {
-			list_for_each_entry(tmp_adev, device_list_handle,
-					    gmc.xgmi.head) {
-				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-					flush_work(&tmp_adev->xgmi_reset_work);
-					r = tmp_adev->asic_reset_res;
-					if (r)
-						break;
-					if (use_baco)
-						tmp_adev->in_baco = true;
-				}
-			}
-		}
-
-		/*
-		 * For XGMI with baco reset, need exit baco phase by scheduling
-		 * xgmi_reset_work one more time. PSP reset and sGPU skips this
-		 * phase. Not assume the situation that PSP reset and baco reset
-		 * coexist within an XGMI hive.
-		 */
-
-		if (!r && use_baco) {
-			cpu = smp_processor_id();
-			list_for_each_entry(tmp_adev, device_list_handle,
-					    gmc.xgmi.head) {
-				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-					if (!queue_work_on(cpu,
-						system_highpri_wq,
-						&tmp_adev->xgmi_reset_work))
-						r = -EALREADY;
-					if (r)
-						break;
-					cpu = cpumask_next(cpu, cpu_online_mask);
-				}
+			if (r) {
+				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+					 r, tmp_adev->ddev->unique);
+				break;
 			}
 		}
 
-		if (!r && use_baco) {
+		/* For XGMI wait for all PSP resets to complete before proceed */
+		if (!r) {
 			list_for_each_entry(tmp_adev, device_list_handle,
 					    gmc.xgmi.head) {
 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
@@ -3878,21 +3860,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
 					r = tmp_adev->asic_reset_res;
 					if (r)
 						break;
-					tmp_adev->in_baco = false;
 				}
 			}
 		}
-
-		if (r) {
-			DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
-				 r, tmp_adev->ddev->unique);
-			goto end;
-		}
 	}
 
 	if (!r && amdgpu_ras_intr_triggered())
 		amdgpu_ras_intr_cleared();
 
+
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (need_full_reset) {
 			/* post card */
@@ -4181,8 +4157,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		if (r)
 			adev->asic_reset_res = r;
 	} else {
-		r  = amdgpu_do_asic_reset(adev, hive, device_list_handle,
-					  &need_full_reset);
+		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
 		if (r && r == -EAGAIN)
 			goto retry;
 	}
-- 
2.7.4


[-- Attachment #5: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* RE: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-12-06 21:50                             ` Andrey Grodzovsky
@ 2019-12-09 11:34                               ` Ma, Le
  2019-12-09 15:52                                 ` Andrey Grodzovsky
  2019-12-09 22:00                                 ` Andrey Grodzovsky
  0 siblings, 2 replies; 57+ messages in thread
From: Ma, Le @ 2019-12-09 11:34 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx, Zhou1, Tao, Deucher, Alexander, Li,
	Dennis, Zhang, Hawking
  Cc: Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 2061 bytes --]

[AMD Official Use Only - Internal Distribution Only]

Hi Andrey,

I tried your patches on my 2P XGMI platform. The baco can work at most time, and randomly got following error:
[ 1701.542298] amdgpu: [powerplay] Failed to send message 0x25, response 0x0

This error usually means some sync issue exist for xgmi baco case. Feel free to debug your patches on my XGMI platform.

Regards,
Ma Le

From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Sent: Saturday, December 7, 2019 5:51 AM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: Chen, Guchun <Guchun.Chen@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI


Hey Ma, attached a solution - it's just compiled as I still can't make my XGMI setup work (with bridge connected only one device is visible to the system while the other is not). Please try it on your system if you have a chance.

Andrey
On 12/4/19 10:14 PM, Ma, Le wrote:

AFAIK it's enough for even a single node in the hive to fail to enter the BACO state on time to fail the entire hive reset procedure, no ?
[Le]: Yeah, agree that. I've been thinking that make all nodes entering baco simultaneously can reduce the possibility of node failure to enter/exit BACO risk. For example, in an XGMI hive with 8 nodes, the total time interval of 8 nodes enter/exit BACO on 8 CPUs is less than the interval that 8 nodes enter BACO serially and exit BACO serially depending on one CPU with yield capability. This interval is usually strict for BACO feature itself. Anyway, we need more looping test later on any method we will choose.

Any way - I see our discussion blocks your entire patch set - I think you can go ahead and commit yours way (I think you got an RB from Hawking) and I will look then and see if I can implement my method and if it works will just revert your patch.

[Le]: OK, fine.

Andrey

[-- Attachment #1.2: Type: text/html, Size: 6099 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-12-09 11:34                               ` Ma, Le
@ 2019-12-09 15:52                                 ` Andrey Grodzovsky
  2019-12-10  2:45                                   ` Ma, Le
  2019-12-09 22:00                                 ` Andrey Grodzovsky
  1 sibling, 1 reply; 57+ messages in thread
From: Andrey Grodzovsky @ 2019-12-09 15:52 UTC (permalink / raw)
  To: Ma, Le, amd-gfx, Zhou1, Tao, Deucher, Alexander, Li, Dennis,
	Zhang, Hawking
  Cc: Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 2577 bytes --]

Thanks a lot Ma for trying - I think I have to have my own system to 
debug this so I will keep trying enabling XGMI - I still think this is 
the right and the generic solution for multiple nodes reset 
synchronization and in fact the barrier should also be used for 
synchronizing PSP mode 1 XGMI reset too.

Andrey

On 12/9/19 6:34 AM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
>
> Hi Andrey,
>
> I tried your patches on my 2P XGMI platform. The baco can work at most 
> time, and randomly got following error:
>
> [ 1701.542298] amdgpu: [powerplay] Failed to send message 0x25, 
> response 0x0
>
> This error usually means some sync issue exist for xgmi baco case. 
> Feel free to debug your patches on my XGMI platform.
>
> Regards,
>
> Ma Le
>
> *From:*Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> *Sent:* Saturday, December 7, 2019 5:51 AM
> *To:* Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org; Zhou1, 
> Tao <Tao.Zhou1@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, 
> Hawking <Hawking.Zhang@amd.com>
> *Cc:* Chen, Guchun <Guchun.Chen@amd.com>
> *Subject:* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset 
> support for XGMI
>
> Hey Ma, attached a solution - it's just compiled as I still can't make 
> my XGMI setup work (with bridge connected only one device is visible 
> to the system while the other is not). Please try it on your system if 
> you have a chance.
>
> Andrey
>
> On 12/4/19 10:14 PM, Ma, Le wrote:
>
>     AFAIK it's enough for even single one node in the hive to to fail
>     the enter the BACO state on time to fail the entire hive reset
>     procedure, no ?
>
>     [Le]: Yeah, agree that. I’ve been thinking that make all nodes
>     entering baco simultaneously can reduce the possibility of node
>     failure to enter/exit BACO risk. For example, in an XGMI hive with
>     8 nodes, the total time interval of 8 nodes enter/exit BACO on 8
>     CPUs is less than the interval that 8 nodes enter BACO serially
>     and exit BACO serially depending on one CPU with yield capability.
>     This interval is usually strict for BACO feature itself. Anyway,
>     we need more looping test later on any method we will choose.
>
>     Any way - I see our discussion blocks your entire patch set - I
>     think you can go ahead and commit yours way (I think you got an RB
>     from Hawking) and I will look then and see if I can implement my
>     method and if it works will just revert your patch.
>
>     [Le]: OK, fine.
>
>     Andrey
>

[-- Attachment #1.2: Type: text/html, Size: 7724 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-12-09 11:34                               ` Ma, Le
  2019-12-09 15:52                                 ` Andrey Grodzovsky
@ 2019-12-09 22:00                                 ` Andrey Grodzovsky
  2019-12-10  3:27                                   ` Ma, Le
  1 sibling, 1 reply; 57+ messages in thread
From: Andrey Grodzovsky @ 2019-12-09 22:00 UTC (permalink / raw)
  To: Ma, Le, amd-gfx, Zhou1, Tao, Deucher, Alexander, Li, Dennis,
	Zhang, Hawking
  Cc: Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 2556 bytes --]

I reproduced the issue on my side - i consistently  observe amdgpu: 
[powerplay] Failed to send message 0x58, response 0x0 - Baco exit 
failure - do you know what is the strict time interval within which all 
the Baco enter/Exit messages needs to be sent to all the nodes in the hive ?

Andrey

On 12/9/19 6:34 AM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
>
> Hi Andrey,
>
> I tried your patches on my 2P XGMI platform. The baco can work at most 
> time, and randomly got following error:
>
> [ 1701.542298] amdgpu: [powerplay] Failed to send message 0x25, 
> response 0x0
>
> This error usually means some sync issue exist for xgmi baco case. 
> Feel free to debug your patches on my XGMI platform.
>
> Regards,
>
> Ma Le
>
> *From:*Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> *Sent:* Saturday, December 7, 2019 5:51 AM
> *To:* Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org; Zhou1, 
> Tao <Tao.Zhou1@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, 
> Hawking <Hawking.Zhang@amd.com>
> *Cc:* Chen, Guchun <Guchun.Chen@amd.com>
> *Subject:* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset 
> support for XGMI
>
> Hey Ma, attached a solution - it's just compiled as I still can't make 
> my XGMI setup work (with bridge connected only one device is visible 
> to the system while the other is not). Please try it on your system if 
> you have a chance.
>
> Andrey
>
> On 12/4/19 10:14 PM, Ma, Le wrote:
>
>     AFAIK it's enough for even single one node in the hive to to fail
>     the enter the BACO state on time to fail the entire hive reset
>     procedure, no ?
>
>     [Le]: Yeah, agree that. I’ve been thinking that make all nodes
>     entering baco simultaneously can reduce the possibility of node
>     failure to enter/exit BACO risk. For example, in an XGMI hive with
>     8 nodes, the total time interval of 8 nodes enter/exit BACO on 8
>     CPUs is less than the interval that 8 nodes enter BACO serially
>     and exit BACO serially depending on one CPU with yield capability.
>     This interval is usually strict for BACO feature itself. Anyway,
>     we need more looping test later on any method we will choose.
>
>     Any way - I see our discussion blocks your entire patch set - I
>     think you can go ahead and commit yours way (I think you got an RB
>     from Hawking) and I will look then and see if I can implement my
>     method and if it works will just revert your patch.
>
>     [Le]: OK, fine.
>
>     Andrey
>

[-- Attachment #1.2: Type: text/html, Size: 7708 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-12-09 15:52                                 ` Andrey Grodzovsky
@ 2019-12-10  2:45                                   ` Ma, Le
  2019-12-10 19:55                                     ` Andrey Grodzovsky
  0 siblings, 1 reply; 57+ messages in thread
From: Ma, Le @ 2019-12-10  2:45 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx, Zhou1, Tao, Deucher, Alexander, Li,
	Dennis, Zhang, Hawking
  Cc: Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 3290 bytes --]

[AMD Official Use Only - Internal Distribution Only]

I'm fine with your solution if synchronization time interval satisfies BACO requirements and loop test can pass on XGMI system.

Regards,
Ma Le

From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Sent: Monday, December 9, 2019 11:52 PM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: Chen, Guchun <Guchun.Chen@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI


Thanks a lot Ma for trying - I think I have to have my own system to debug this so I will keep trying enabling XGMI - I still think this is the right and the generic solution for multiple nodes reset synchronization and in fact the barrier should also be used for synchronizing PSP mode 1 XGMI reset too.

Andrey
On 12/9/19 6:34 AM, Ma, Le wrote:

[AMD Official Use Only - Internal Distribution Only]

Hi Andrey,

I tried your patches on my 2P XGMI platform. The baco can work at most time, and randomly got following error:
[ 1701.542298] amdgpu: [powerplay] Failed to send message 0x25, response 0x0

This error usually means some sync issue exist for xgmi baco case. Feel free to debug your patches on my XGMI platform.

Regards,
Ma Le

From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com><mailto:Andrey.Grodzovsky@amd.com>
Sent: Saturday, December 7, 2019 5:51 AM
To: Ma, Le <Le.Ma@amd.com><mailto:Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; Zhou1, Tao <Tao.Zhou1@amd.com><mailto:Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com><mailto:Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Cc: Chen, Guchun <Guchun.Chen@amd.com><mailto:Guchun.Chen@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI


Hey Ma, attached a solution - it's just compiled as I still can't make my XGMI setup work (with bridge connected only one device is visible to the system while the other is not). Please try it on your system if you have a chance.

Andrey
On 12/4/19 10:14 PM, Ma, Le wrote:

AFAIK it's enough for even a single node in the hive to fail to enter the BACO state on time to fail the entire hive reset procedure, no ?
[Le]: Yeah, agree that. I've been thinking that make all nodes entering baco simultaneously can reduce the possibility of node failure to enter/exit BACO risk. For example, in an XGMI hive with 8 nodes, the total time interval of 8 nodes enter/exit BACO on 8 CPUs is less than the interval that 8 nodes enter BACO serially and exit BACO serially depending on one CPU with yield capability. This interval is usually strict for BACO feature itself. Anyway, we need more looping test later on any method we will choose.

Any way - I see our discussion blocks your entire patch set - I think you can go ahead and commit yours way (I think you got an RB from Hawking) and I will look then and see if I can implement my method and if it works will just revert your patch.

[Le]: OK, fine.

Andrey

[-- Attachment #1.2: Type: text/html, Size: 9188 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-12-09 22:00                                 ` Andrey Grodzovsky
@ 2019-12-10  3:27                                   ` Ma, Le
  0 siblings, 0 replies; 57+ messages in thread
From: Ma, Le @ 2019-12-10  3:27 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx, Zhou1, Tao, Deucher, Alexander, Li,
	Dennis, Zhang, Hawking
  Cc: Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 3480 bytes --]

[AMD Official Use Only - Internal Distribution Only]

Not sure it's same issue as I observed.

If you have an XGMI setup, use the latest drm-next and the PMFW I used on my XGMI system(I just sent you the vega20_smc.bin through mail). And then give another attempt.

About the strict time interval, I remember the XGMI node EnterBaco message will fail when interval is around millisecond.

Regards,
Ma Le

From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Sent: Tuesday, December 10, 2019 6:01 AM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: Chen, Guchun <Guchun.Chen@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI


I reproduced the issue on my side - i consistently  observe amdgpu: [powerplay] Failed to send message 0x58, response 0x0 - Baco exit failure - do you know what is the strict time interval within which all the Baco enter/Exit messages needs to be sent to all the nodes in the hive ?

Andrey
On 12/9/19 6:34 AM, Ma, Le wrote:

[AMD Official Use Only - Internal Distribution Only]

Hi Andrey,

I tried your patches on my 2P XGMI platform. The baco can work at most time, and randomly got following error:
[ 1701.542298] amdgpu: [powerplay] Failed to send message 0x25, response 0x0

This error usually means some sync issue exist for xgmi baco case. Feel free to debug your patches on my XGMI platform.

Regards,
Ma Le

From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com><mailto:Andrey.Grodzovsky@amd.com>
Sent: Saturday, December 7, 2019 5:51 AM
To: Ma, Le <Le.Ma@amd.com><mailto:Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; Zhou1, Tao <Tao.Zhou1@amd.com><mailto:Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com><mailto:Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Cc: Chen, Guchun <Guchun.Chen@amd.com><mailto:Guchun.Chen@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI


Hey Ma, attached a solution - it's just compiled as I still can't make my XGMI setup work (with bridge connected only one device is visible to the system while the other is not). Please try it on your system if you have a chance.

Andrey
On 12/4/19 10:14 PM, Ma, Le wrote:

AFAIK it's enough for even a single node in the hive to fail to enter the BACO state on time to fail the entire hive reset procedure, no ?
[Le]: Yeah, agree that. I've been thinking that make all nodes entering baco simultaneously can reduce the possibility of node failure to enter/exit BACO risk. For example, in an XGMI hive with 8 nodes, the total time interval of 8 nodes enter/exit BACO on 8 CPUs is less than the interval that 8 nodes enter BACO serially and exit BACO serially depending on one CPU with yield capability. This interval is usually strict for BACO feature itself. Anyway, we need more looping test later on any method we will choose.

Any way - I see our discussion blocks your entire patch set - I think you can go ahead and commit yours way (I think you got an RB from Hawking) and I will look then and see if I can implement my method and if it works will just revert your patch.

[Le]: OK, fine.

Andrey

[-- Attachment #1.2: Type: text/html, Size: 10005 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-12-10  2:45                                   ` Ma, Le
@ 2019-12-10 19:55                                     ` Andrey Grodzovsky
  2019-12-11 12:18                                       ` Ma, Le
  0 siblings, 1 reply; 57+ messages in thread
From: Andrey Grodzovsky @ 2019-12-10 19:55 UTC (permalink / raw)
  To: Ma, Le, amd-gfx, Zhou1, Tao, Deucher, Alexander, Li, Dennis,
	Zhang, Hawking
  Cc: Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 5404 bytes --]

I switched the workqueue we were using for xgmi_reset_work from 
system_highpri_wq to system_unbound_wq - the difference is that workers 
servicing the queue in system_unbound_wq are not bounded to specific CPU 
and so the reset jobs for each XGMI node are getting scheduled to 
different CPU while system_highpri_wq is a bounded work queue. I traced 
it as bellow for 10 consecutive times and didn't see errors any more. 
Also the time diff between BACO entries or exits was never more then 
around 2 uS.

Please give this updated patchset a try

    kworker/u16:2-57    [004] ...1   243.276312: trace_code: func: 
vega20_baco_set_state, line 91 <----- - Before BACO enter
            <...>-60    [007] ...1   243.276312: trace_code: func: 
vega20_baco_set_state, line 91 <----- - Before BACO enter
    kworker/u16:2-57    [004] ...1   243.276384: trace_code: func: 
vega20_baco_set_state, line 105 <----- - After BACO enter done
            <...>-60    [007] ...1   243.276392: trace_code: func: 
vega20_baco_set_state, line 105 <----- - After BACO enter done
    kworker/u16:3-60    [007] ...1   243.276397: trace_code: func: 
vega20_baco_set_state, line 108 <----- - Before BACO exit
    kworker/u16:2-57    [004] ...1   243.276399: trace_code: func: 
vega20_baco_set_state, line 108 <----- - Before BACO exit
    kworker/u16:3-60    [007] ...1   243.288067: trace_code: func: 
vega20_baco_set_state, line 114 <----- - After BACO exit done
    kworker/u16:2-57    [004] ...1   243.295624: trace_code: func: 
vega20_baco_set_state, line 114 <----- - After BACO exit done

Andrey

On 12/9/19 9:45 PM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
>
> I’m fine with your solution if synchronization time interval satisfies 
> BACO requirements and loop test can pass on XGMI system.
>
> Regards,
>
> Ma Le
>
> *From:*Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> *Sent:* Monday, December 9, 2019 11:52 PM
> *To:* Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org; Zhou1, 
> Tao <Tao.Zhou1@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, 
> Hawking <Hawking.Zhang@amd.com>
> *Cc:* Chen, Guchun <Guchun.Chen@amd.com>
> *Subject:* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset 
> support for XGMI
>
> Thanks a lot Ma for trying - I think I have to have my own system to 
> debug this so I will keep trying enabling XGMI - i still think the is 
> the right and the generic solution for multiple nodes reset 
> synchronization and in fact the barrier should also be used for 
> synchronizing PSP mode 1 XGMI reset too.
>
> Andrey
>
> On 12/9/19 6:34 AM, Ma, Le wrote:
>
>     [AMD Official Use Only - Internal Distribution Only]
>
>     Hi Andrey,
>
>     I tried your patches on my 2P XGMI platform. The baco can work at
>     most time, and randomly got following error:
>
>     [ 1701.542298] amdgpu: [powerplay] Failed to send message 0x25,
>     response 0x0
>
>     This error usually means some sync issue exist for xgmi baco case.
>     Feel free to debug your patches on my XGMI platform.
>
>     Regards,
>
>     Ma Le
>
>     *From:*Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>     <mailto:Andrey.Grodzovsky@amd.com>
>     *Sent:* Saturday, December 7, 2019 5:51 AM
>     *To:* Ma, Le <Le.Ma@amd.com> <mailto:Le.Ma@amd.com>;
>     amd-gfx@lists.freedesktop.org
>     <mailto:amd-gfx@lists.freedesktop.org>; Zhou1, Tao
>     <Tao.Zhou1@amd.com> <mailto:Tao.Zhou1@amd.com>; Deucher, Alexander
>     <Alexander.Deucher@amd.com> <mailto:Alexander.Deucher@amd.com>;
>     Li, Dennis <Dennis.Li@amd.com> <mailto:Dennis.Li@amd.com>; Zhang,
>     Hawking <Hawking.Zhang@amd.com> <mailto:Hawking.Zhang@amd.com>
>     *Cc:* Chen, Guchun <Guchun.Chen@amd.com> <mailto:Guchun.Chen@amd.com>
>     *Subject:* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset
>     support for XGMI
>
>     Hey Ma, attached a solution - it's just compiled as I still can't
>     make my XGMI setup work (with bridge connected only one device is
>     visible to the system while the other is not). Please try it on
>     your system if you have a chance.
>
>     Andrey
>
>     On 12/4/19 10:14 PM, Ma, Le wrote:
>
>         AFAIK it's enough for even single one node in the hive to to
>         fail the enter the BACO state on time to fail the entire hive
>         reset procedure, no ?
>
>         [Le]: Yeah, agree that. I’ve been thinking that make all nodes
>         entering baco simultaneously can reduce the possibility of
>         node failure to enter/exit BACO risk. For example, in an XGMI
>         hive with 8 nodes, the total time interval of 8 nodes
>         enter/exit BACO on 8 CPUs is less than the interval that 8
>         nodes enter BACO serially and exit BACO serially depending on
>         one CPU with yield capability. This interval is usually strict
>         for BACO feature itself. Anyway, we need more looping test
>         later on any method we will choose.
>
>         Any way - I see our discussion blocks your entire patch set -
>         I think you can go ahead and commit yours way (I think you got
>         an RB from Hawking) and I will look then and see if I can
>         implement my method and if it works will just revert your patch.
>
>         [Le]: OK, fine.
>
>         Andrey
>

[-- Attachment #1.2: Type: text/html, Size: 13386 bytes --]

[-- Attachment #2: 0003-drm-amdgpu-Redo-concurrent-support-of-BACO-reset-for.patch --]
[-- Type: text/x-patch, Size: 7090 bytes --]

From 8fcefad4194358ad55aba815cab437459f4bb0e4 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Fri, 6 Dec 2019 13:19:15 -0500
Subject: drm/amdgpu: Redo concurrent support of BACO reset for XGMI V2

Use task barrier in XGMI hive to synchronize BACO enter/exit
across devices in XGMI hive.
This also reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.

v2: Switch from system_highpri_wq to system_unbound_wq to avoid
queueing jobs to same CPU.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 113 +++++++++++------------------
 2 files changed, 44 insertions(+), 71 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a78a363..50bab33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1001,8 +1001,6 @@ struct amdgpu_device {
 
 	bool                            pm_sysfs_en;
 	bool                            ucode_sysfs_en;
-
-	bool				in_baco;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7324a5f..e2b4882 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -67,6 +67,7 @@
 #include "amdgpu_tmz.h"
 
 #include <linux/suspend.h>
+#include <drm/task_barrier.h>
 
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -2664,13 +2665,39 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 	struct amdgpu_device *adev =
 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
 
-	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
-		adev->asic_reset_res = (adev->in_baco == false) ?
-				amdgpu_device_baco_enter(adev->ddev) :
-				amdgpu_device_baco_exit(adev->ddev);
-	else
-		adev->asic_reset_res = amdgpu_asic_reset(adev);
+	/*
+	 * Use task barrier to synchronize all xgmi reset works across the
+	 * hive.
+	 * task_barrier_enter and task_barrier_exit will block until all the
+	 * threads running the xgmi reset works reach those points. I assume
+	 * guarantee of progress here for all the threads as the workqueue code
+	 * creates new worker threads as needed by amount of work items in queue
+	 * (see worker_thread) and also each thread sleeps in the barrier and by
+	 * this yielding the CPU for other work threads to make progress.
+	 */
+	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
+		struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+
+		if (hive)
+			task_barrier_enter(&hive->tb);
+
+		adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
+
+		if (adev->asic_reset_res)
+			goto fail;
 
+		if (hive)
+			task_barrier_exit(&hive->tb);
+
+		adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
+
+		if (adev->asic_reset_res)
+			goto fail;
+	} else {
+		adev->asic_reset_res =  amdgpu_asic_reset(adev);
+	}
+
+fail:
 	if (adev->asic_reset_res)
 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 			 adev->asic_reset_res, adev->ddev->unique);
@@ -3796,18 +3823,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	return r;
 }
 
-static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
-			       struct amdgpu_hive_info *hive,
+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 			       struct list_head *device_list_handle,
 			       bool *need_full_reset_arg)
 {
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
 	int r = 0;
-	int cpu = smp_processor_id();
-	bool use_baco =
-		(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
-		true : false;
 
 	/*
 	 * ASIC reset has to be done on all HGMI hive nodes ASAP
@@ -3815,62 +3837,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
 	 */
 	if (need_full_reset) {
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-			/*
-			 * For XGMI run all resets in parallel to speed up the
-			 * process by scheduling the highpri wq on different
-			 * cpus. For XGMI with baco reset, all nodes must enter
-			 * baco within close proximity before anyone exit.
-			 */
+			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-				if (!queue_work_on(cpu, system_highpri_wq,
-						   &tmp_adev->xgmi_reset_work))
+				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
-				cpu = cpumask_next(cpu, cpu_online_mask);
 			} else
 				r = amdgpu_asic_reset(tmp_adev);
-			if (r)
-				break;
-		}
 
-		/* For XGMI wait for all work to complete before proceed */
-		if (!r) {
-			list_for_each_entry(tmp_adev, device_list_handle,
-					    gmc.xgmi.head) {
-				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-					flush_work(&tmp_adev->xgmi_reset_work);
-					r = tmp_adev->asic_reset_res;
-					if (r)
-						break;
-					if (use_baco)
-						tmp_adev->in_baco = true;
-				}
-			}
-		}
-
-		/*
-		 * For XGMI with baco reset, need exit baco phase by scheduling
-		 * xgmi_reset_work one more time. PSP reset and sGPU skips this
-		 * phase. Not assume the situation that PSP reset and baco reset
-		 * coexist within an XGMI hive.
-		 */
-
-		if (!r && use_baco) {
-			cpu = smp_processor_id();
-			list_for_each_entry(tmp_adev, device_list_handle,
-					    gmc.xgmi.head) {
-				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-					if (!queue_work_on(cpu,
-						system_highpri_wq,
-						&tmp_adev->xgmi_reset_work))
-						r = -EALREADY;
-					if (r)
-						break;
-					cpu = cpumask_next(cpu, cpu_online_mask);
-				}
+			if (r) {
+				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+					 r, tmp_adev->ddev->unique);
+				break;
 			}
 		}
 
-		if (!r && use_baco) {
+		/* For XGMI wait for all PSP resets to complete before proceed */
+		if (!r) {
 			list_for_each_entry(tmp_adev, device_list_handle,
 					    gmc.xgmi.head) {
 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
@@ -3878,21 +3860,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
 					r = tmp_adev->asic_reset_res;
 					if (r)
 						break;
-					tmp_adev->in_baco = false;
 				}
 			}
 		}
-
-		if (r) {
-			DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
-				 r, tmp_adev->ddev->unique);
-			goto end;
-		}
 	}
 
 	if (!r && amdgpu_ras_intr_triggered())
 		amdgpu_ras_intr_cleared();
 
+
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (need_full_reset) {
 			/* post card */
@@ -4181,8 +4157,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		if (r)
 			adev->asic_reset_res = r;
 	} else {
-		r  = amdgpu_do_asic_reset(adev, hive, device_list_handle,
-					  &need_full_reset);
+		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
 		if (r && r == -EAGAIN)
 			goto retry;
 	}
-- 
2.7.4


[-- Attachment #3: 0002-drm-amdgpu-Add-task-barrier-to-XGMI-hive.patch --]
[-- Type: text/x-patch, Size: 2241 bytes --]

From da9d5b4ceb7b0f985574617acae71261f9006238 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Fri, 6 Dec 2019 12:43:30 -0500
Subject: drm/amdgpu: Add task barrier to XGMI hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 61d13d8..5cf920d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -261,6 +261,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 	INIT_LIST_HEAD(&tmp->device_list);
 	mutex_init(&tmp->hive_lock);
 	mutex_init(&tmp->reset_lock);
+	task_barrier_init(&tmp->tb);
 
 	if (lock)
 		mutex_lock(&tmp->hive_lock);
@@ -408,6 +409,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 	top_info->num_nodes = count;
 	hive->number_devices = count;
 
+	task_barrier_add_task(&hive->tb);
+
 	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			/* update node list for other device in the hive */
@@ -470,6 +473,7 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 		mutex_destroy(&hive->hive_lock);
 		mutex_destroy(&hive->reset_lock);
 	} else {
+		task_barrier_rem_task(&hive->tb);
 		amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
 		mutex_unlock(&hive->hive_lock);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index bbf504f..74011fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -22,6 +22,7 @@
 #ifndef __AMDGPU_XGMI_H__
 #define __AMDGPU_XGMI_H__
 
+#include <drm/task_barrier.h>
 #include "amdgpu_psp.h"
 
 struct amdgpu_hive_info {
@@ -33,6 +34,7 @@ struct amdgpu_hive_info {
 	struct device_attribute dev_attr;
 	struct amdgpu_device *adev;
 	int pstate; /*0 -- low , 1 -- high , -1 unknown*/
+	struct task_barrier tb;
 };
 
 struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
-- 
2.7.4


[-- Attachment #4: 0001-drm-Add-Reusable-task-barrier.patch --]
[-- Type: text/x-patch, Size: 3756 bytes --]

From 34438a766a83002057ac051e3efdcc63eda36f52 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Fri, 6 Dec 2019 12:26:33 -0500
Subject: drm: Add Reusable task barrier.

It is used to synchronize N threads at a rendezvous point before execution
of critical code that has to be started by all the threads at approximately
the same time.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 include/drm/task_barrier.h | 97 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 include/drm/task_barrier.h

diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h
new file mode 100644
index 0000000..858cd7f
--- /dev/null
+++ b/include/drm/task_barrier.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include <linux/semaphore.h>
+#include <linux/atomic.h>
+
+/*
+ * Reusable 2 PHASE task barrier (rendezvous point) implementation for N tasks.
+ * Based on the Little Book of Semaphores - https://greenteapress.com/wp/semaphores/
+ */
+
+
+
+#ifndef DRM_TASK_BARRIER_H_
+#define DRM_TASK_BARRIER_H_
+
+/*
+ * Represents an instance of a task barrier.
+ */
+struct task_barrier {
+	unsigned int n;
+	atomic_t count;
+	struct semaphore enter_turnstile;
+	struct semaphore exit_turnstile;
+};
+
+static inline void task_barrier_signal_turnstile(struct semaphore *turnstile,
+					  unsigned int n) {
+	int i;
+	for (i = 0 ; i < n; i++)
+		up(turnstile);
+}
+
+static inline void task_barrier_init(struct task_barrier *tb) {
+
+	tb->n = 0;
+	atomic_set(&tb->count, 0);
+	sema_init(&tb->enter_turnstile, 0);
+	sema_init(&tb->exit_turnstile, 0);
+}
+
+static inline void task_barrier_add_task(struct task_barrier *tb) {
+	tb->n++;
+}
+
+static inline void task_barrier_rem_task(struct task_barrier *tb) {
+	tb->n--;
+}
+
+/*
+ * Lines up all the threads BEFORE the critical point.
+ *
+ * When all threads have passed this code the entry barrier is back to its locked state.
+ */
+static inline void task_barrier_enter(struct task_barrier *tb) {
+
+	if (atomic_inc_return(&tb->count) == tb->n)
+			task_barrier_signal_turnstile(&tb->enter_turnstile,
+						      tb->n);
+
+	down(&tb->enter_turnstile);
+}
+
+/*
+ * Lines up all the threads AFTER the critical point.
+ *
+ * This function is used to avoid any one thread running ahead of the rest if
+ * the barrier is used in a loop (repeatedly).
+ */
+static inline void task_barrier_exit(struct task_barrier *tb) {
+	if (atomic_dec_return(&tb->count) == 0)
+			task_barrier_signal_turnstile(&tb->exit_turnstile,
+						      tb->n);
+
+	down(&tb->exit_turnstile);
+}
+
+#endif
-- 
2.7.4


[-- Attachment #5: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* RE: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-12-10 19:55                                     ` Andrey Grodzovsky
@ 2019-12-11 12:18                                       ` Ma, Le
  2019-12-11 14:04                                         ` Andrey Grodzovsky
  0 siblings, 1 reply; 57+ messages in thread
From: Ma, Le @ 2019-12-11 12:18 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx, Zhou1, Tao, Deucher, Alexander, Li,
	Dennis, Zhang, Hawking
  Cc: Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 6324 bytes --]

[AMD Official Use Only - Internal Distribution Only]

I tried your new patches to run BACO for about 10 loops and the result looks positive, without observing enter/exit baco message failure again.

The time interval between BACO entries or exits in my environment was almost less than 10 us: max 36us, min 2us. I think it's safe enough according to the sample data we collected in both sides.

And it looks not necessary to continue using system_highpri_wq any more because we require all the nodes enter or exit at the same time, while do not mind how long the time interval is b/t enter and exit. The system_unbound_wq can satisfy our requirement here since it wakes different CPUs up to work at the same time.

Regards,
Ma Le

From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Sent: Wednesday, December 11, 2019 3:56 AM
To: Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: Chen, Guchun <Guchun.Chen@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI


I switched the workqueue we were using for xgmi_reset_work from system_highpri_wq to system_unbound_wq - the difference is that workers servicing the queue in system_unbound_wq are not bound to a specific CPU and so the reset jobs for each XGMI node are getting scheduled to different CPU while system_highpri_wq is a bounded work queue. I traced it as below for 10 consecutive times and didn't see errors any more. Also the time diff between BACO entries or exits was never more than around 2 uS.

Please give this updated patchset a try

   kworker/u16:2-57    [004] ...1   243.276312: trace_code: func: vega20_baco_set_state, line 91 <----- - Before BEACO enter
           <...>-60    [007] ...1   243.276312: trace_code: func: vega20_baco_set_state, line 91 <----- - Before BEACO enter
   kworker/u16:2-57    [004] ...1   243.276384: trace_code: func: vega20_baco_set_state, line 105 <----- - After BEACO enter done
           <...>-60    [007] ...1   243.276392: trace_code: func: vega20_baco_set_state, line 105 <----- - After BEACO enter done
   kworker/u16:3-60    [007] ...1   243.276397: trace_code: func: vega20_baco_set_state, line 108 <----- - Before BEACO exit
   kworker/u16:2-57    [004] ...1   243.276399: trace_code: func: vega20_baco_set_state, line 108 <----- - Before BEACO exit
   kworker/u16:3-60    [007] ...1   243.288067: trace_code: func: vega20_baco_set_state, line 114 <----- - After BEACO exit done
   kworker/u16:2-57    [004] ...1   243.295624: trace_code: func: vega20_baco_set_state, line 114 <----- - After BEACO exit done

Andrey
On 12/9/19 9:45 PM, Ma, Le wrote:

[AMD Official Use Only - Internal Distribution Only]

I'm fine with your solution if synchronization time interval satisfies BACO requirements and loop test can pass on XGMI system.

Regards,
Ma Le

From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com><mailto:Andrey.Grodzovsky@amd.com>
Sent: Monday, December 9, 2019 11:52 PM
To: Ma, Le <Le.Ma@amd.com><mailto:Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; Zhou1, Tao <Tao.Zhou1@amd.com><mailto:Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com><mailto:Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Cc: Chen, Guchun <Guchun.Chen@amd.com><mailto:Guchun.Chen@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI


Thanks a lot Ma for trying - I think I have to have my own system to debug this so I will keep trying enabling XGMI - i still think the is the right and the generic solution for multiple nodes reset synchronization and in fact the barrier should also be used for synchronizing PSP mode 1 XGMI reset too.

Andrey
On 12/9/19 6:34 AM, Ma, Le wrote:

[AMD Official Use Only - Internal Distribution Only]

Hi Andrey,

I tried your patches on my 2P XGMI platform. The baco can work at most time, and randomly got following error:
[ 1701.542298] amdgpu: [powerplay] Failed to send message 0x25, response 0x0

This error usually means some sync issue exist for xgmi baco case. Feel free to debug your patches on my XGMI platform.

Regards,
Ma Le

From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com><mailto:Andrey.Grodzovsky@amd.com>
Sent: Saturday, December 7, 2019 5:51 AM
To: Ma, Le <Le.Ma@amd.com><mailto:Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; Zhou1, Tao <Tao.Zhou1@amd.com><mailto:Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com><mailto:Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Cc: Chen, Guchun <Guchun.Chen@amd.com><mailto:Guchun.Chen@amd.com>
Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI


Hey Ma, attached a solution - it's just compiled as I still can't make my XGMI setup work (with bridge connected only one device is visible to the system while the other is not). Please try it on your system if you have a chance.

Andrey
On 12/4/19 10:14 PM, Ma, Le wrote:

AFAIK it's enough for even a single node in the hive to fail to enter the BACO state on time to fail the entire hive reset procedure, no?
[Le]: Yeah, agree that. I've been thinking that make all nodes entering baco simultaneously can reduce the possibility of node failure to enter/exit BACO risk. For example, in an XGMI hive with 8 nodes, the total time interval of 8 nodes enter/exit BACO on 8 CPUs is less than the interval that 8 nodes enter BACO serially and exit BACO serially depending on one CPU with yield capability. This interval is usually strict for BACO feature itself. Anyway, we need more looping test later on any method we will choose.

Any way - I see our discussion blocks your entire patch set - I think you can go ahead and commit yours way (I think you got an RB from Hawking) and I will look then and see if I can implement my method and if it works will just revert your patch.

[Le]: OK, fine.

Andrey

[-- Attachment #1.2: Type: text/html, Size: 14658 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
  2019-12-11 12:18                                       ` Ma, Le
@ 2019-12-11 14:04                                         ` Andrey Grodzovsky
  0 siblings, 0 replies; 57+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 14:04 UTC (permalink / raw)
  To: Ma, Le, amd-gfx, Zhou1, Tao, Deucher, Alexander, Li, Dennis,
	Zhang, Hawking
  Cc: Chen, Guchun


[-- Attachment #1.1: Type: text/plain, Size: 7437 bytes --]

Great! I will update the patches to also use the barrier in PSP MODE 1 
reset case and resend the patches for formal review.

Andrey

On 12/11/19 7:18 AM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> I tried your new patches to run BACO for about 10 loops and the result 
> looks positive, without observing enter/exit baco message failure again.
>
> The time interval between BACO entries or exits in my environment was 
> almost less than 10 us: max 36us, min 2us. I think it’s safe enough 
> according to the sample data we collected in both sides.
>
> And it looks not necessary to continue using system_highpri_wq any 
> more because we require all the nodes enter or exit at the same time, 
> while do not mind how long the time interval is b/t enter and exit. 
> The system_unbound_wq can satisfy our requirement here since it wakes 
> different CPUs up to work at the same time.
>
> Regards,
>
> Ma Le
>
> *From:*Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> *Sent:* Wednesday, December 11, 2019 3:56 AM
> *To:* Ma, Le <Le.Ma@amd.com>; amd-gfx@lists.freedesktop.org; Zhou1, 
> Tao <Tao.Zhou1@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, 
> Hawking <Hawking.Zhang@amd.com>
> *Cc:* Chen, Guchun <Guchun.Chen@amd.com>
> *Subject:* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset 
> support for XGMI
>
> I switched the workqueue we were using for xgmi_reset_work from 
> system_highpri_wq to system_unbound_wq - the difference is that 
> workers servicing the queue in system_unbound_wq are not bounded to 
> specific CPU and so the reset jobs for each XGMI node are getting 
> scheduled to different CPU while system_highpri_wq is a bounded work 
> queue. I traced it as below for 10 consecutive times and didn't see 
> errors any more. Also the time diff between BACO entries or exits was 
> never more than around 2 uS.
>
> Please give this updated patchset a try
>
>    kworker/u16:2-57    [004] ...1   243.276312: trace_code: func: 
> vega20_baco_set_state, line 91 <----- - Before BEACO enter
>            <...>-60    [007] ...1   243.276312: trace_code: func: 
> vega20_baco_set_state, line 91 <----- - Before BEACO enter
>    kworker/u16:2-57    [004] ...1   243.276384: trace_code: func: 
> vega20_baco_set_state, line 105 <----- - After BEACO enter done
>            <...>-60    [007] ...1   243.276392: trace_code: func: 
> vega20_baco_set_state, line 105 <----- - After BEACO enter done
>    kworker/u16:3-60    [007] ...1   243.276397: trace_code: func: 
> vega20_baco_set_state, line 108 <----- - Before BEACO exit
>    kworker/u16:2-57    [004] ...1   243.276399: trace_code: func: 
> vega20_baco_set_state, line 108 <----- - Before BEACO exit
>    kworker/u16:3-60    [007] ...1   243.288067: trace_code: func: 
> vega20_baco_set_state, line 114 <----- - After BEACO exit done
>    kworker/u16:2-57    [004] ...1   243.295624: trace_code: func: 
> vega20_baco_set_state, line 114 <----- - After BEACO exit done
>
> Andrey
>
> On 12/9/19 9:45 PM, Ma, Le wrote:
>
>     [AMD Official Use Only - Internal Distribution Only]
>
>     I’m fine with your solution if synchronization time interval
>     satisfies BACO requirements and loop test can pass on XGMI system.
>
>     Regards,
>
>     Ma Le
>
>     *From:*Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>     <mailto:Andrey.Grodzovsky@amd.com>
>     *Sent:* Monday, December 9, 2019 11:52 PM
>     *To:* Ma, Le <Le.Ma@amd.com> <mailto:Le.Ma@amd.com>;
>     amd-gfx@lists.freedesktop.org
>     <mailto:amd-gfx@lists.freedesktop.org>; Zhou1, Tao
>     <Tao.Zhou1@amd.com> <mailto:Tao.Zhou1@amd.com>; Deucher, Alexander
>     <Alexander.Deucher@amd.com> <mailto:Alexander.Deucher@amd.com>;
>     Li, Dennis <Dennis.Li@amd.com> <mailto:Dennis.Li@amd.com>; Zhang,
>     Hawking <Hawking.Zhang@amd.com> <mailto:Hawking.Zhang@amd.com>
>     *Cc:* Chen, Guchun <Guchun.Chen@amd.com> <mailto:Guchun.Chen@amd.com>
>     *Subject:* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset
>     support for XGMI
>
>     Thanks a lot Ma for trying - I think I have to have my own system
>     to debug this so I will keep trying enabling XGMI - i still think
>     the is the right and the generic solution for multiple nodes reset
>     synchronization and in fact the barrier should also be used for
>     synchronizing PSP mode 1 XGMI reset too.
>
>     Andrey
>
>     On 12/9/19 6:34 AM, Ma, Le wrote:
>
>         [AMD Official Use Only - Internal Distribution Only]
>
>         Hi Andrey,
>
>         I tried your patches on my 2P XGMI platform. The baco can work
>         at most time, and randomly got following error:
>
>         [ 1701.542298] amdgpu: [powerplay] Failed to send message
>         0x25, response 0x0
>
>         This error usually means some sync issue exist for xgmi baco
>         case. Feel free to debug your patches on my XGMI platform.
>
>         Regards,
>
>         Ma Le
>
>         *From:*Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>         <mailto:Andrey.Grodzovsky@amd.com>
>         *Sent:* Saturday, December 7, 2019 5:51 AM
>         *To:* Ma, Le <Le.Ma@amd.com> <mailto:Le.Ma@amd.com>;
>         amd-gfx@lists.freedesktop.org
>         <mailto:amd-gfx@lists.freedesktop.org>; Zhou1, Tao
>         <Tao.Zhou1@amd.com> <mailto:Tao.Zhou1@amd.com>; Deucher,
>         Alexander <Alexander.Deucher@amd.com>
>         <mailto:Alexander.Deucher@amd.com>; Li, Dennis
>         <Dennis.Li@amd.com> <mailto:Dennis.Li@amd.com>; Zhang, Hawking
>         <Hawking.Zhang@amd.com> <mailto:Hawking.Zhang@amd.com>
>         *Cc:* Chen, Guchun <Guchun.Chen@amd.com>
>         <mailto:Guchun.Chen@amd.com>
>         *Subject:* Re: [PATCH 07/10] drm/amdgpu: add concurrent baco
>         reset support for XGMI
>
>         Hey Ma, attached a solution - it's just compiled as I still
>         can't make my XGMI setup work (with bridge connected only one
>         device is visible to the system while the other is not).
>         Please try it on your system if you have a chance.
>
>         Andrey
>
>         On 12/4/19 10:14 PM, Ma, Le wrote:
>
>             AFAIK it's enough for even single one node in the hive to
>             to fail the enter the BACO state on time to fail the
>             entire hive reset procedure, no ?
>
>             [Le]: Yeah, agree that. I’ve been thinking that make all
>             nodes entering baco simultaneously can reduce the
>             possibility of node failure to enter/exit BACO risk. For
>             example, in an XGMI hive with 8 nodes, the total time
>             interval of 8 nodes enter/exit BACO on 8 CPUs is less than
>             the interval that 8 nodes enter BACO serially and exit
>             BACO serially depending on one CPU with yield capability.
>             This interval is usually strict for BACO feature itself.
>             Anyway, we need more looping test later on any method we
>             will choose.
>
>             Any way - I see our discussion blocks your entire patch
>             set - I think you can go ahead and commit yours way (I
>             think you got an RB from Hawking) and I will look then and
>             see if I can implement my method and if it works will just
>             revert your patch.
>
>             [Le]: OK, fine.
>
>             Andrey
>

[-- Attachment #1.2: Type: text/html, Size: 18101 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2019-12-11 14:04 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-27  9:15 [PATCH 01/10] drm/amdgpu: remove ras global recovery handling from ras_controller_int handler Le Ma
2019-11-27  9:15 ` Le Ma
     [not found] ` <1574846129-4826-1-git-send-email-le.ma-5C7GfCeVMHo@public.gmane.org>
2019-11-27  9:15   ` [PATCH 02/10] drm/amdgpu: export amdgpu_ras_find_obj to use externally Le Ma
2019-11-27  9:15     ` Le Ma
2019-11-27  9:15   ` [PATCH 03/10] drm/amdgpu: clear ras controller status registers when interrupt occurs Le Ma
2019-11-27  9:15     ` Le Ma
2019-11-27  9:15   ` [PATCH 05/10] drm/amdgpu: enable/disable doorbell interrupt in baco entry/exit helper Le Ma
2019-11-27  9:15     ` Le Ma
     [not found]     ` <1574846129-4826-4-git-send-email-le.ma-5C7GfCeVMHo@public.gmane.org>
2019-11-27 12:04       ` Zhang, Hawking
2019-11-27 12:04         ` Zhang, Hawking
     [not found]         ` <DM5PR12MB14184CF08E965BAF369F4249FC440-2J9CzHegvk81aAVlcVN8UQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-27 12:14           ` Ma, Le
2019-11-27 12:14             ` Ma, Le
2019-11-28  6:50       ` Zhou1, Tao
2019-11-28  6:50         ` Zhou1, Tao
2019-11-27  9:15   ` [PATCH 06/10] drm/amdgpu: add condition to enable baco for xgmi/ras case Le Ma
2019-11-27  9:15     ` Le Ma
     [not found]     ` <1574846129-4826-5-git-send-email-le.ma-5C7GfCeVMHo@public.gmane.org>
2019-11-27 11:28       ` Zhang, Hawking
2019-11-27 11:28         ` Zhang, Hawking
     [not found]         ` <DM5PR12MB141825CB772FEEF1FD013EDBFC440-2J9CzHegvk81aAVlcVN8UQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-27 12:35           ` Ma, Le
2019-11-27 12:35             ` Ma, Le
2019-11-27 11:38       ` Zhang, Hawking
2019-11-27 11:38         ` Zhang, Hawking
     [not found]         ` <DM5PR12MB1418D76FD9E6E7748C2F9997FC440-2J9CzHegvk81aAVlcVN8UQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-27 14:00           ` Ma, Le
2019-11-27 14:00             ` Ma, Le
2019-11-27  9:15   ` [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI Le Ma
2019-11-27  9:15     ` Le Ma
     [not found]     ` <1574846129-4826-6-git-send-email-le.ma-5C7GfCeVMHo@public.gmane.org>
2019-11-27 15:46       ` Andrey Grodzovsky
2019-11-27 15:46         ` Andrey Grodzovsky
     [not found]         ` <c09d7928-f864-3a80-40e2-b6116abe044c-5C7GfCeVMHo@public.gmane.org>
2019-11-28  9:00           ` Ma, Le
2019-11-28  9:00             ` Ma, Le
2019-11-29 16:21             ` Andrey Grodzovsky
2019-12-02 11:42               ` Ma, Le
2019-12-02 22:05                 ` Andrey Grodzovsky
     [not found]                   ` <MN2PR12MB42855B198BB4064A0D311845F6420@MN2PR12MB4285.namprd12.prod.outlook.com>
     [not found]                     ` <2c4dd3f3-e2ce-9843-312b-1e5c05a51521@amd.com>
2019-12-04  7:09                       ` Ma, Le
2019-12-04 16:05                         ` Andrey Grodzovsky
2019-12-05  3:14                           ` Ma, Le
2019-12-06 21:50                             ` Andrey Grodzovsky
2019-12-09 11:34                               ` Ma, Le
2019-12-09 15:52                                 ` Andrey Grodzovsky
2019-12-10  2:45                                   ` Ma, Le
2019-12-10 19:55                                     ` Andrey Grodzovsky
2019-12-11 12:18                                       ` Ma, Le
2019-12-11 14:04                                         ` Andrey Grodzovsky
2019-12-09 22:00                                 ` Andrey Grodzovsky
2019-12-10  3:27                                   ` Ma, Le
2019-11-27  9:15   ` [PATCH 08/10] drm/amdgpu: support full gpu reset workflow when ras err_event_athub occurs Le Ma
2019-11-27  9:15     ` Le Ma
2019-11-27  9:15   ` [PATCH 09/10] drm/amdgpu: clear err_event_athub flag after reset exit Le Ma
2019-11-27  9:15     ` Le Ma
2019-11-27  9:15   ` [PATCH 10/10] drm/amdgpu: reduce redundant uvd context lost warning message Le Ma
2019-11-27  9:15     ` Le Ma
     [not found]     ` <1574846129-4826-9-git-send-email-le.ma-5C7GfCeVMHo@public.gmane.org>
2019-11-27  9:49       ` Chen, Guchun
2019-11-27  9:49         ` Chen, Guchun
     [not found]         ` <BYAPR12MB280648A1C59519AA77B3FCA9F1440-ZGDeBxoHBPk0CuAkIMgl3QdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-27  9:54           ` Ma, Le
2019-11-27  9:54             ` Ma, Le
2019-11-28  5:27   ` [PATCH 01/10] drm/amdgpu: remove ras global recovery handling from ras_controller_int handler Zhang, Hawking
2019-11-28  5:27     ` Zhang, Hawking

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.