All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
@ 2021-02-19  1:18 shaoyunl
  2021-02-19 23:11 ` Liu, Shaoyun
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: shaoyunl @ 2021-02-19  1:18 UTC (permalink / raw)
  To: amd-gfx; +Cc: shaoyunl

In passthrough configuration, hypervisior will trigger the SBR(Secondary bus reset) to the devices
without sync to each other. This could cause device hang since for XGMI configuration, all the devices
within the hive need to be reset at a limit time slot. This serial of patches try to solve this issue
by co-operate with new SMU which will only do minimum house keeping to response the SBR request but don't
do the real reset job and leave it to driver. Driver need to do the whole sw init and minimum HW init
to bring up the SMU and trigger the reset(possibly BACO) on all the ASICs at the same time with existing
gpu_recovery routine.

Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 96 +++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  6 +-
 4 files changed, 87 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2f9ad7ed82be..9f574fd151bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
 		}
 	}
 
+	/* Don't post if we need to reset whole hive on init */
+	if (adev->gmc.xgmi.pending_reset)
+		return false;
+
 	if (adev->has_hw_reset) {
 		adev->has_hw_reset = false;
 		return true;
@@ -2147,6 +2151,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
 				continue;
 
+			if (!adev->ip_blocks[i].status.sw)
+				continue;
+
 			/* no need to do the fw loading again if already done*/
 			if (adev->ip_blocks[i].status.hw == true)
 				break;
@@ -2287,7 +2294,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 
 	if (adev->gmc.xgmi.num_physical_nodes > 1)
 		amdgpu_xgmi_add_device(adev);
-	amdgpu_amdkfd_device_init(adev);
+
+	/* Don't init kfd if whole hive need to be reset during init */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_amdkfd_device_init(adev);
 
 	amdgpu_fru_get_product_info(adev);
 
@@ -2731,6 +2741,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
 			adev->ip_blocks[i].status.hw = false;
 			continue;
 		}
+
+		/* skip unnecessary suspend if we do not initialize them yet */
+		if (adev->gmc.xgmi.pending_reset &&
+		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+			adev->ip_blocks[i].status.hw = false;
+			continue;
+		}
 		/* XXX handle errors */
 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
 		/* XXX handle errors */
@@ -3402,10 +3422,29 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	 *  E.g., driver was not cleanly unloaded previously, etc.
 	 */
 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
-		r = amdgpu_asic_reset(adev);
-		if (r) {
-			dev_err(adev->dev, "asic reset on init failed\n");
-			goto failed;
+		if (adev->gmc.xgmi.num_physical_nodes) {
+			dev_info(adev->dev, "Pending hive reset.\n");
+			adev->gmc.xgmi.pending_reset = true;
+			/* Only need to init necessary block for SMU to handle the reset */
+			for (i = 0; i < adev->num_ip_blocks; i++) {
+				if (!adev->ip_blocks[i].status.valid)
+					continue;
+				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
+					DRM_DEBUG("IP %s disabed for hw_init.\n",
+						adev->ip_blocks[i].version->funcs->name);
+					adev->ip_blocks[i].status.hw = true;
+				}
+			}
+		} else {
+			r = amdgpu_asic_reset(adev);
+			if (r) {
+				dev_err(adev->dev, "asic reset on init failed\n");
+				goto failed;
+			}
 		}
 	}
 
@@ -3536,19 +3575,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
 	 * explicit gating rather than handling it automatically.
 	 */
-	r = amdgpu_device_ip_late_init(adev);
-	if (r) {
-		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
-		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
-		goto failed;
+	if (!adev->gmc.xgmi.pending_reset) {
+		r = amdgpu_device_ip_late_init(adev);
+		if (r) {
+			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
+			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
+			goto failed;
+		}
+		/* must succeed. */
+		amdgpu_ras_resume(adev);
+		queue_delayed_work(system_wq, &adev->delayed_init_work,
+				   msecs_to_jiffies(AMDGPU_RESUME_MS));
 	}
 
-	/* must succeed. */
-	amdgpu_ras_resume(adev);
-
-	queue_delayed_work(system_wq, &adev->delayed_init_work,
-			   msecs_to_jiffies(AMDGPU_RESUME_MS));
-
 	if (amdgpu_sriov_vf(adev))
 		flush_delayed_work(&adev->delayed_init_work);
 
@@ -3565,6 +3604,18 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	if (amdgpu_device_cache_pci_state(adev->pdev))
 		pci_restore_state(pdev);
 
+	if (adev->gmc.xgmi.pending_reset) {
+		struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+
+		if (atomic_read(&hive->number_devices) ==
+		    adev->gmc.xgmi.num_physical_nodes) {
+			/* Trigger hive reset when all gpus within the hive is ready */
+			dev_info(adev->dev, "Trigger XGMI reset during init.\n");
+			amdgpu_device_gpu_recover(adev, NULL);
+		}
+		amdgpu_put_xgmi_hive(hive);
+	}
+
 	return 0;
 
 failed:
@@ -4241,7 +4292,9 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_full_reset  = *need_full_reset_arg;
 
-	amdgpu_debugfs_wait_dump(adev);
+	/* no need to dump if device is not in good state during probe period */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_debugfs_wait_dump(adev);
 
 	if (amdgpu_sriov_vf(adev)) {
 		/* stop the data exchange thread */
@@ -4304,6 +4357,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+				tmp_adev->gmc.xgmi.pending_reset = false;
 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
 			} else
@@ -4343,10 +4397,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (need_full_reset) {
 			/* post card */
-			if (amdgpu_device_asic_init(tmp_adev))
+			r = amdgpu_device_asic_init(tmp_adev);
+			if (r) {
 				dev_warn(tmp_adev->dev, "asic atom init failed!");
-
-			if (!r) {
+			} else {
 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
 				if (r)
@@ -4790,6 +4844,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		if (audio_suspended)
 			amdgpu_device_resume_display_audio(tmp_adev);
 		amdgpu_device_unlock_adev(tmp_adev);
+		/*enable buffer function after reset */
+		amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
 	}
 
 skip_recovery:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index aa0c83776ce0..8c71d84a2fbe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -149,6 +149,7 @@ struct amdgpu_xgmi {
 	struct list_head head;
 	bool supported;
 	struct ras_common_if *ras_if;
+	bool pending_reset;
 };
 
 struct amdgpu_gmc {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 839917eb7bc3..5e127a59b121 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2261,6 +2261,12 @@ static int psp_load_fw(struct amdgpu_device *adev)
 		goto failed;
 	}
 
+	/*Don't do the real PSP HW init if we are pending on XGMI reset.
+	 *The above init probably should move to psp_sw_init
+	 */
+	if (adev->gmc.xgmi.pending_reset)
+		return 0;
+
 skip_memalloc:
 	ret = psp_hw_start(psp);
 	if (ret)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 659b385b27b5..b1c8fd90c1b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 	if (!adev->gmc.xgmi.supported)
 		return 0;
 
-	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (!adev->gmc.xgmi.pending_reset &&
+	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		ret = psp_xgmi_initialize(&adev->psp);
 		if (ret) {
 			dev_err(adev->dev,
@@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 
 	task_barrier_add_task(&hive->tb);
 
-	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (!adev->gmc.xgmi.pending_reset &&
+	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			/* update node list for other device in the hive */
 			if (tmp_adev != adev) {
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* RE: [PATCH 1/4] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
  2021-02-19  1:18 [PATCH 1/4] drm/amdgpu: Reset the devices in the XGMI hive duirng probe shaoyunl
@ 2021-02-19 23:11 ` Liu, Shaoyun
  2021-02-22 16:14 ` Liu, Shaoyun
  2021-02-23 16:47 ` Alex Deucher
  2 siblings, 0 replies; 5+ messages in thread
From: Liu, Shaoyun @ 2021-02-19 23:11 UTC (permalink / raw)
  To: amd-gfx

[AMD Official Use Only - Internal Distribution Only]

Ping .

-----Original Message-----
From: Liu, Shaoyun <Shaoyun.Liu@amd.com> 
Sent: Thursday, February 18, 2021 8:19 PM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Shaoyun <Shaoyun.Liu@amd.com>
Subject: [PATCH 1/4] drm/amdgpu: Reset the devices in the XGMI hive duirng probe

In passthrough configuration, hypervisior will trigger the SBR(Secondary bus reset) to the devices without sync to each other. This could cause device hang since for XGMI configuration, all the devices within the hive need to be reset at a limit time slot. This serial of patches try to solve this issue by co-operate with new SMU which will only do minimum house keeping to response the SBR request but don't do the real reset job and leave it to driver. Driver need to do the whole sw init and minimum HW init to bring up the SMU and trigger the reset(possibly BACO) on all the ASICs at the same time with existing gpu_recovery routine.

Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 96 +++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  6 +-
 4 files changed, 87 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2f9ad7ed82be..9f574fd151bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
 		}
 	}
 
+	/* Don't post if we need to reset whole hive on init */
+	if (adev->gmc.xgmi.pending_reset)
+		return false;
+
 	if (adev->has_hw_reset) {
 		adev->has_hw_reset = false;
 		return true;
@@ -2147,6 +2151,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
 				continue;
 
+			if (!adev->ip_blocks[i].status.sw)
+				continue;
+
 			/* no need to do the fw loading again if already done*/
 			if (adev->ip_blocks[i].status.hw == true)
 				break;
@@ -2287,7 +2294,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 
 	if (adev->gmc.xgmi.num_physical_nodes > 1)
 		amdgpu_xgmi_add_device(adev);
-	amdgpu_amdkfd_device_init(adev);
+
+	/* Don't init kfd if whole hive need to be reset during init */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_amdkfd_device_init(adev);
 
 	amdgpu_fru_get_product_info(adev);
 
@@ -2731,6 +2741,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
 			adev->ip_blocks[i].status.hw = false;
 			continue;
 		}
+
+		/* skip unnecessary suspend if we do not initialize them yet */
+		if (adev->gmc.xgmi.pending_reset &&
+		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+			adev->ip_blocks[i].status.hw = false;
+			continue;
+		}
 		/* XXX handle errors */
 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
 		/* XXX handle errors */
@@ -3402,10 +3422,29 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	 *  E.g., driver was not cleanly unloaded previously, etc.
 	 */
 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
-		r = amdgpu_asic_reset(adev);
-		if (r) {
-			dev_err(adev->dev, "asic reset on init failed\n");
-			goto failed;
+		if (adev->gmc.xgmi.num_physical_nodes) {
+			dev_info(adev->dev, "Pending hive reset.\n");
+			adev->gmc.xgmi.pending_reset = true;
+			/* Only need to init necessary block for SMU to handle the reset */
+			for (i = 0; i < adev->num_ip_blocks; i++) {
+				if (!adev->ip_blocks[i].status.valid)
+					continue;
+				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
+					DRM_DEBUG("IP %s disabed for hw_init.\n",
+						adev->ip_blocks[i].version->funcs->name);
+					adev->ip_blocks[i].status.hw = true;
+				}
+			}
+		} else {
+			r = amdgpu_asic_reset(adev);
+			if (r) {
+				dev_err(adev->dev, "asic reset on init failed\n");
+				goto failed;
+			}
 		}
 	}
 
@@ -3536,19 +3575,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
 	 * explicit gating rather than handling it automatically.
 	 */
-	r = amdgpu_device_ip_late_init(adev);
-	if (r) {
-		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
-		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
-		goto failed;
+	if (!adev->gmc.xgmi.pending_reset) {
+		r = amdgpu_device_ip_late_init(adev);
+		if (r) {
+			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
+			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
+			goto failed;
+		}
+		/* must succeed. */
+		amdgpu_ras_resume(adev);
+		queue_delayed_work(system_wq, &adev->delayed_init_work,
+				   msecs_to_jiffies(AMDGPU_RESUME_MS));
 	}
 
-	/* must succeed. */
-	amdgpu_ras_resume(adev);
-
-	queue_delayed_work(system_wq, &adev->delayed_init_work,
-			   msecs_to_jiffies(AMDGPU_RESUME_MS));
-
 	if (amdgpu_sriov_vf(adev))
 		flush_delayed_work(&adev->delayed_init_work);
 
@@ -3565,6 +3604,18 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	if (amdgpu_device_cache_pci_state(adev->pdev))
 		pci_restore_state(pdev);
 
+	if (adev->gmc.xgmi.pending_reset) {
+		struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+
+		if (atomic_read(&hive->number_devices) ==
+		    adev->gmc.xgmi.num_physical_nodes) {
+			/* Trigger hive reset when all gpus within the hive is ready */
+			dev_info(adev->dev, "Trigger XGMI reset during init.\n");
+			amdgpu_device_gpu_recover(adev, NULL);
+		}
+		amdgpu_put_xgmi_hive(hive);
+	}
+
 	return 0;
 
 failed:
@@ -4241,7 +4292,9 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_full_reset  = *need_full_reset_arg;
 
-	amdgpu_debugfs_wait_dump(adev);
+	/* no need to dump if device is not in good state during probe period */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_debugfs_wait_dump(adev);
 
 	if (amdgpu_sriov_vf(adev)) {
 		/* stop the data exchange thread */
@@ -4304,6 +4357,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+				tmp_adev->gmc.xgmi.pending_reset = false;
 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
 			} else
@@ -4343,10 +4397,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (need_full_reset) {
 			/* post card */
-			if (amdgpu_device_asic_init(tmp_adev))
+			r = amdgpu_device_asic_init(tmp_adev);
+			if (r) {
 				dev_warn(tmp_adev->dev, "asic atom init failed!");
-
-			if (!r) {
+			} else {
 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
 				if (r)
@@ -4790,6 +4844,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		if (audio_suspended)
 			amdgpu_device_resume_display_audio(tmp_adev);
 		amdgpu_device_unlock_adev(tmp_adev);
+		/*enable buffer function after reset */
+		amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
 	}
 
 skip_recovery:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index aa0c83776ce0..8c71d84a2fbe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -149,6 +149,7 @@ struct amdgpu_xgmi {
 	struct list_head head;
 	bool supported;
 	struct ras_common_if *ras_if;
+	bool pending_reset;
 };
 
 struct amdgpu_gmc {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 839917eb7bc3..5e127a59b121 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2261,6 +2261,12 @@ static int psp_load_fw(struct amdgpu_device *adev)
 		goto failed;
 	}
 
+	/*Don't do the real PSP HW init if we are pending on XGMI reset.
+	 *The above init probably should move to psp_sw_init
+	 */
+	if (adev->gmc.xgmi.pending_reset)
+		return 0;
+
 skip_memalloc:
 	ret = psp_hw_start(psp);
 	if (ret)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 659b385b27b5..b1c8fd90c1b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 	if (!adev->gmc.xgmi.supported)
 		return 0;
 
-	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (!adev->gmc.xgmi.pending_reset &&
+	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		ret = psp_xgmi_initialize(&adev->psp);
 		if (ret) {
 			dev_err(adev->dev,
@@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 
 	task_barrier_add_task(&hive->tb);
 
-	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (!adev->gmc.xgmi.pending_reset &&
+	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			/* update node list for other device in the hive */
 			if (tmp_adev != adev) {
--
2.17.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* RE: [PATCH 1/4] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
  2021-02-19  1:18 [PATCH 1/4] drm/amdgpu: Reset the devices in the XGMI hive duirng probe shaoyunl
  2021-02-19 23:11 ` Liu, Shaoyun
@ 2021-02-22 16:14 ` Liu, Shaoyun
  2021-02-23 16:47 ` Alex Deucher
  2 siblings, 0 replies; 5+ messages in thread
From: Liu, Shaoyun @ 2021-02-22 16:14 UTC (permalink / raw)
  To: amd-gfx, Deucher,  Alexander, Koenig, Christian, Grodzovsky, Andrey

[AMD Official Use Only - Internal Distribution Only]

Ping. 

-----Original Message-----
From: Liu, Shaoyun <Shaoyun.Liu@amd.com> 
Sent: Thursday, February 18, 2021 8:19 PM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Shaoyun <Shaoyun.Liu@amd.com>
Subject: [PATCH 1/4] drm/amdgpu: Reset the devices in the XGMI hive duirng probe

In passthrough configuration, hypervisior will trigger the SBR(Secondary bus reset) to the devices without sync to each other. This could cause device hang since for XGMI configuration, all the devices within the hive need to be reset at a limit time slot. This serial of patches try to solve this issue by co-operate with new SMU which will only do minimum house keeping to response the SBR request but don't do the real reset job and leave it to driver. Driver need to do the whole sw init and minimum HW init to bring up the SMU and trigger the reset(possibly BACO) on all the ASICs at the same time with existing gpu_recovery routine.

Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 96 +++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  6 +-
 4 files changed, 87 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2f9ad7ed82be..9f574fd151bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
 		}
 	}
 
+	/* Don't post if we need to reset whole hive on init */
+	if (adev->gmc.xgmi.pending_reset)
+		return false;
+
 	if (adev->has_hw_reset) {
 		adev->has_hw_reset = false;
 		return true;
@@ -2147,6 +2151,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
 				continue;
 
+			if (!adev->ip_blocks[i].status.sw)
+				continue;
+
 			/* no need to do the fw loading again if already done*/
 			if (adev->ip_blocks[i].status.hw == true)
 				break;
@@ -2287,7 +2294,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 
 	if (adev->gmc.xgmi.num_physical_nodes > 1)
 		amdgpu_xgmi_add_device(adev);
-	amdgpu_amdkfd_device_init(adev);
+
+	/* Don't init kfd if whole hive need to be reset during init */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_amdkfd_device_init(adev);
 
 	amdgpu_fru_get_product_info(adev);
 
@@ -2731,6 +2741,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
 			adev->ip_blocks[i].status.hw = false;
 			continue;
 		}
+
+		/* skip unnecessary suspend if we do not initialize them yet */
+		if (adev->gmc.xgmi.pending_reset &&
+		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+			adev->ip_blocks[i].status.hw = false;
+			continue;
+		}
 		/* XXX handle errors */
 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
 		/* XXX handle errors */
@@ -3402,10 +3422,29 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	 *  E.g., driver was not cleanly unloaded previously, etc.
 	 */
 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
-		r = amdgpu_asic_reset(adev);
-		if (r) {
-			dev_err(adev->dev, "asic reset on init failed\n");
-			goto failed;
+		if (adev->gmc.xgmi.num_physical_nodes) {
+			dev_info(adev->dev, "Pending hive reset.\n");
+			adev->gmc.xgmi.pending_reset = true;
+			/* Only need to init necessary block for SMU to handle the reset */
+			for (i = 0; i < adev->num_ip_blocks; i++) {
+				if (!adev->ip_blocks[i].status.valid)
+					continue;
+				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
+					DRM_DEBUG("IP %s disabed for hw_init.\n",
+						adev->ip_blocks[i].version->funcs->name);
+					adev->ip_blocks[i].status.hw = true;
+				}
+			}
+		} else {
+			r = amdgpu_asic_reset(adev);
+			if (r) {
+				dev_err(adev->dev, "asic reset on init failed\n");
+				goto failed;
+			}
 		}
 	}
 
@@ -3536,19 +3575,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
 	 * explicit gating rather than handling it automatically.
 	 */
-	r = amdgpu_device_ip_late_init(adev);
-	if (r) {
-		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
-		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
-		goto failed;
+	if (!adev->gmc.xgmi.pending_reset) {
+		r = amdgpu_device_ip_late_init(adev);
+		if (r) {
+			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
+			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
+			goto failed;
+		}
+		/* must succeed. */
+		amdgpu_ras_resume(adev);
+		queue_delayed_work(system_wq, &adev->delayed_init_work,
+				   msecs_to_jiffies(AMDGPU_RESUME_MS));
 	}
 
-	/* must succeed. */
-	amdgpu_ras_resume(adev);
-
-	queue_delayed_work(system_wq, &adev->delayed_init_work,
-			   msecs_to_jiffies(AMDGPU_RESUME_MS));
-
 	if (amdgpu_sriov_vf(adev))
 		flush_delayed_work(&adev->delayed_init_work);
 
@@ -3565,6 +3604,18 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	if (amdgpu_device_cache_pci_state(adev->pdev))
 		pci_restore_state(pdev);
 
+	if (adev->gmc.xgmi.pending_reset) {
+		struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+
+		if (atomic_read(&hive->number_devices) ==
+		    adev->gmc.xgmi.num_physical_nodes) {
+			/* Trigger hive reset when all gpus within the hive is ready */
+			dev_info(adev->dev, "Trigger XGMI reset during init.\n");
+			amdgpu_device_gpu_recover(adev, NULL);
+		}
+		amdgpu_put_xgmi_hive(hive);
+	}
+
 	return 0;
 
 failed:
@@ -4241,7 +4292,9 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_full_reset  = *need_full_reset_arg;
 
-	amdgpu_debugfs_wait_dump(adev);
+	/* no need to dump if device is not in good state during probe period */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_debugfs_wait_dump(adev);
 
 	if (amdgpu_sriov_vf(adev)) {
 		/* stop the data exchange thread */
@@ -4304,6 +4357,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+				tmp_adev->gmc.xgmi.pending_reset = false;
 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
 			} else
@@ -4343,10 +4397,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (need_full_reset) {
 			/* post card */
-			if (amdgpu_device_asic_init(tmp_adev))
+			r = amdgpu_device_asic_init(tmp_adev);
+			if (r) {
 				dev_warn(tmp_adev->dev, "asic atom init failed!");
-
-			if (!r) {
+			} else {
 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
 				if (r)
@@ -4790,6 +4844,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		if (audio_suspended)
 			amdgpu_device_resume_display_audio(tmp_adev);
 		amdgpu_device_unlock_adev(tmp_adev);
+		/*enable buffer function after reset */
+		amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
 	}
 
 skip_recovery:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index aa0c83776ce0..8c71d84a2fbe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -149,6 +149,7 @@ struct amdgpu_xgmi {
 	struct list_head head;
 	bool supported;
 	struct ras_common_if *ras_if;
+	bool pending_reset;
 };
 
 struct amdgpu_gmc {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 839917eb7bc3..5e127a59b121 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2261,6 +2261,12 @@ static int psp_load_fw(struct amdgpu_device *adev)
 		goto failed;
 	}
 
+	/*Don't do the real PSP HW init if we are pending on XGMI reset.
+	 *The above init probably should move to psp_sw_init
+	 */
+	if (adev->gmc.xgmi.pending_reset)
+		return 0;
+
 skip_memalloc:
 	ret = psp_hw_start(psp);
 	if (ret)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 659b385b27b5..b1c8fd90c1b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 	if (!adev->gmc.xgmi.supported)
 		return 0;
 
-	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (!adev->gmc.xgmi.pending_reset &&
+	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		ret = psp_xgmi_initialize(&adev->psp);
 		if (ret) {
 			dev_err(adev->dev,
@@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 
 	task_barrier_add_task(&hive->tb);
 
-	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (!adev->gmc.xgmi.pending_reset &&
+	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			/* update node list for other device in the hive */
 			if (tmp_adev != adev) {
--
2.17.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/4] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
  2021-02-19  1:18 [PATCH 1/4] drm/amdgpu: Reset the devices in the XGMI hive duirng probe shaoyunl
  2021-02-19 23:11 ` Liu, Shaoyun
  2021-02-22 16:14 ` Liu, Shaoyun
@ 2021-02-23 16:47 ` Alex Deucher
  2021-02-23 17:54   ` Liu, Shaoyun
  2 siblings, 1 reply; 5+ messages in thread
From: Alex Deucher @ 2021-02-23 16:47 UTC (permalink / raw)
  To: shaoyunl; +Cc: amd-gfx list

On Thu, Feb 18, 2021 at 8:19 PM shaoyunl <shaoyun.liu@amd.com> wrote:
>
> In passthrough configuration, hypervisior will trigger the SBR(Secondary bus reset) to the devices
> without sync to each other. This could cause device hang since for XGMI configuration, all the devices
> within the hive need to be reset at a limit time slot. This serial of patches try to solve this issue
> by co-operate with new SMU which will only do minimum house keeping to response the SBR request but don't
> do the real reset job and leave it to driver. Driver need to do the whole sw init and minimum HW init
> to bring up the SMU and trigger the reset(possibly BACO) on all the ASICs at the same time with existing
> gpu_recovery routine.
>
> Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 96 +++++++++++++++++-----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    |  6 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  6 +-
>  4 files changed, 87 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2f9ad7ed82be..9f574fd151bc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
>                 }
>         }
>
> +       /* Don't post if we need to reset whole hive on init */
> +       if (adev->gmc.xgmi.pending_reset)
> +               return false;
> +
>         if (adev->has_hw_reset) {
>                 adev->has_hw_reset = false;
>                 return true;
> @@ -2147,6 +2151,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
>                                 continue;
>
> +                       if (!adev->ip_blocks[i].status.sw)
> +                               continue;
> +
>                         /* no need to do the fw loading again if already done*/
>                         if (adev->ip_blocks[i].status.hw == true)
>                                 break;
> @@ -2287,7 +2294,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
>
>         if (adev->gmc.xgmi.num_physical_nodes > 1)
>                 amdgpu_xgmi_add_device(adev);
> -       amdgpu_amdkfd_device_init(adev);
> +
> +       /* Don't init kfd if whole hive need to be reset during init */
> +       if (!adev->gmc.xgmi.pending_reset)
> +               amdgpu_amdkfd_device_init(adev);
>
>         amdgpu_fru_get_product_info(adev);
>
> @@ -2731,6 +2741,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
>                         adev->ip_blocks[i].status.hw = false;
>                         continue;
>                 }
> +
> +               /* skip unnecessary suspend if we do not initialize them yet */
> +               if (adev->gmc.xgmi.pending_reset &&
> +                   !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> +                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
> +                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> +                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
> +                       adev->ip_blocks[i].status.hw = false;
> +                       continue;
> +               }
>                 /* XXX handle errors */
>                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
>                 /* XXX handle errors */
> @@ -3402,10 +3422,29 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>          *  E.g., driver was not cleanly unloaded previously, etc.
>          */
>         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
> -               r = amdgpu_asic_reset(adev);
> -               if (r) {
> -                       dev_err(adev->dev, "asic reset on init failed\n");
> -                       goto failed;
> +               if (adev->gmc.xgmi.num_physical_nodes) {
> +                       dev_info(adev->dev, "Pending hive reset.\n");
> +                       adev->gmc.xgmi.pending_reset = true;
> +                       /* Only need to init necessary block for SMU to handle the reset */
> +                       for (i = 0; i < adev->num_ip_blocks; i++) {
> +                               if (!adev->ip_blocks[i].status.valid)
> +                                       continue;
> +                               if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> +                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
> +                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> +                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
> +                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
> +                                       DRM_DEBUG("IP %s disabed for hw_init.\n",
> +                                               adev->ip_blocks[i].version->funcs->name);
> +                                       adev->ip_blocks[i].status.hw = true;
> +                               }
> +                       }
> +               } else {
> +                       r = amdgpu_asic_reset(adev);
> +                       if (r) {
> +                               dev_err(adev->dev, "asic reset on init failed\n");
> +                               goto failed;
> +                       }
>                 }
>         }
>
> @@ -3536,19 +3575,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         /* enable clockgating, etc. after ib tests, etc. since some blocks require
>          * explicit gating rather than handling it automatically.
>          */
> -       r = amdgpu_device_ip_late_init(adev);
> -       if (r) {
> -               dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
> -               amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
> -               goto failed;
> +       if (!adev->gmc.xgmi.pending_reset) {
> +               r = amdgpu_device_ip_late_init(adev);
> +               if (r) {
> +                       dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
> +                       amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
> +                       goto failed;
> +               }
> +               /* must succeed. */
> +               amdgpu_ras_resume(adev);
> +               queue_delayed_work(system_wq, &adev->delayed_init_work,
> +                                  msecs_to_jiffies(AMDGPU_RESUME_MS));
>         }
>
> -       /* must succeed. */
> -       amdgpu_ras_resume(adev);
> -
> -       queue_delayed_work(system_wq, &adev->delayed_init_work,
> -                          msecs_to_jiffies(AMDGPU_RESUME_MS));
> -
>         if (amdgpu_sriov_vf(adev))
>                 flush_delayed_work(&adev->delayed_init_work);
>
> @@ -3565,6 +3604,18 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         if (amdgpu_device_cache_pci_state(adev->pdev))
>                 pci_restore_state(pdev);
>
> +       if (adev->gmc.xgmi.pending_reset) {
> +               struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
> +
> +               if (atomic_read(&hive->number_devices) ==
> +                   adev->gmc.xgmi.num_physical_nodes) {
> +                       /* Trigger hive reset when all gpus within the hive is ready */
> +                       dev_info(adev->dev, "Trigger XGMI reset during init.\n");
> +                       amdgpu_device_gpu_recover(adev, NULL);
> +               }
> +               amdgpu_put_xgmi_hive(hive);
> +       }
> +
>         return 0;
>
>  failed:
> @@ -4241,7 +4292,9 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>         int i, r = 0;
>         bool need_full_reset  = *need_full_reset_arg;
>
> -       amdgpu_debugfs_wait_dump(adev);
> +       /* no need to dump if device is not in good state during probe period */
> +       if (!adev->gmc.xgmi.pending_reset)
> +               amdgpu_debugfs_wait_dump(adev);
>
>         if (amdgpu_sriov_vf(adev)) {
>                 /* stop the data exchange thread */
> @@ -4304,6 +4357,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>                         /* For XGMI run all resets in parallel to speed up the process */
>                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
> +                               tmp_adev->gmc.xgmi.pending_reset = false;
>                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>                                         r = -EALREADY;
>                         } else
> @@ -4343,10 +4397,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>                 if (need_full_reset) {
>                         /* post card */
> -                       if (amdgpu_device_asic_init(tmp_adev))
> +                       r = amdgpu_device_asic_init(tmp_adev);
> +                       if (r) {
>                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
> -
> -                       if (!r) {
> +                       } else {
>                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
>                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
>                                 if (r)
> @@ -4790,6 +4844,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                 if (audio_suspended)
>                         amdgpu_device_resume_display_audio(tmp_adev);
>                 amdgpu_device_unlock_adev(tmp_adev);
> +               /*enable buffer function after reset */

Space between * and comment, e.g., /* Enable

> +               amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);

Will this cause issues with any of the other flows through this code?

>         }
>
>  skip_recovery:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index aa0c83776ce0..8c71d84a2fbe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -149,6 +149,7 @@ struct amdgpu_xgmi {
>         struct list_head head;
>         bool supported;
>         struct ras_common_if *ras_if;
> +       bool pending_reset;
>  };
>
>  struct amdgpu_gmc {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 839917eb7bc3..5e127a59b121 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -2261,6 +2261,12 @@ static int psp_load_fw(struct amdgpu_device *adev)
>                 goto failed;
>         }
>
> +       /*Don't do the real PSP HW init if we are pending on XGMI reset.
> +        *The above init probably should move to psp_sw_init
> +        */

Space between * and comment.  E.g., /* Don't

This whole thing seems kind of fragile.  Is there some way we can
restructure the init code to provide two code high level code paths?
One for regular init and one for this case?  I feel like this will be
prone to breakage if this code sees any changes.

Alex


> +       if (adev->gmc.xgmi.pending_reset)
> +               return 0;
> +
>  skip_memalloc:
>         ret = psp_hw_start(psp);
>         if (ret)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 659b385b27b5..b1c8fd90c1b0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>         if (!adev->gmc.xgmi.supported)
>                 return 0;
>
> -       if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
> +       if (!adev->gmc.xgmi.pending_reset &&
> +           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>                 ret = psp_xgmi_initialize(&adev->psp);
>                 if (ret) {
>                         dev_err(adev->dev,
> @@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>
>         task_barrier_add_task(&hive->tb);
>
> -       if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
> +       if (!adev->gmc.xgmi.pending_reset &&
> +           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>                         /* update node list for other device in the hive */
>                         if (tmp_adev != adev) {
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH 1/4] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
  2021-02-23 16:47 ` Alex Deucher
@ 2021-02-23 17:54   ` Liu, Shaoyun
  0 siblings, 0 replies; 5+ messages in thread
From: Liu, Shaoyun @ 2021-02-23 17:54 UTC (permalink / raw)
  To: Alex Deucher; +Cc: amd-gfx list

[AMD Official Use Only - Internal Distribution Only]

Comments inline , 

-----Original Message-----
From: Alex Deucher <alexdeucher@gmail.com> 
Sent: Tuesday, February 23, 2021 11:47 AM
To: Liu, Shaoyun <Shaoyun.Liu@amd.com>
Cc: amd-gfx list <amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH 1/4] drm/amdgpu: Reset the devices in the XGMI hive duirng probe

On Thu, Feb 18, 2021 at 8:19 PM shaoyunl <shaoyun.liu@amd.com> wrote:
>
> In passthrough configuration, hypervisior will trigger the 
> SBR(Secondary bus reset) to the devices without sync to each other. 
> This could cause device hang since for XGMI configuration, all the 
> devices within the hive need to be reset at a limit time slot. This 
> serial of patches try to solve this issue by co-operate with new SMU 
> which will only do minimum house keeping to response the SBR request 
> but don't do the real reset job and leave it to driver. Driver need to do the whole sw init and minimum HW init to bring up the SMU and trigger the reset(possibly BACO) on all the ASICs at the same time with existing gpu_recovery routine.
>
> Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 96 +++++++++++++++++-----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    |  6 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  6 +-
>  4 files changed, 87 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2f9ad7ed82be..9f574fd151bc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
>                 }
>         }
>
> +       /* Don't post if we need to reset whole hive on init */
> +       if (adev->gmc.xgmi.pending_reset)
> +               return false;
> +
>         if (adev->has_hw_reset) {
>                 adev->has_hw_reset = false;
>                 return true;
> @@ -2147,6 +2151,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
>                                 continue;
>
> +                       if (!adev->ip_blocks[i].status.sw)
> +                               continue;
> +
>                         /* no need to do the fw loading again if already done*/
>                         if (adev->ip_blocks[i].status.hw == true)
>                                 break; @@ -2287,7 +2294,10 @@ static 
> int amdgpu_device_ip_init(struct amdgpu_device *adev)
>
>         if (adev->gmc.xgmi.num_physical_nodes > 1)
>                 amdgpu_xgmi_add_device(adev);
> -       amdgpu_amdkfd_device_init(adev);
> +
> +       /* Don't init kfd if whole hive need to be reset during init */
> +       if (!adev->gmc.xgmi.pending_reset)
> +               amdgpu_amdkfd_device_init(adev);
>
>         amdgpu_fru_get_product_info(adev);
>
> @@ -2731,6 +2741,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
>                         adev->ip_blocks[i].status.hw = false;
>                         continue;
>                 }
> +
> +               /* skip unnecessary suspend if we do not initialize them yet */
> +               if (adev->gmc.xgmi.pending_reset &&
> +                   !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> +                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
> +                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> +                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
> +                       adev->ip_blocks[i].status.hw = false;
> +                       continue;
> +               }
>                 /* XXX handle errors */
>                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
>                 /* XXX handle errors */ @@ -3402,10 +3422,29 @@ int 
> amdgpu_device_init(struct amdgpu_device *adev,
>          *  E.g., driver was not cleanly unloaded previously, etc.
>          */
>         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
> -               r = amdgpu_asic_reset(adev);
> -               if (r) {
> -                       dev_err(adev->dev, "asic reset on init failed\n");
> -                       goto failed;
> +               if (adev->gmc.xgmi.num_physical_nodes) {
> +                       dev_info(adev->dev, "Pending hive reset.\n");
> +                       adev->gmc.xgmi.pending_reset = true;
> +                       /* Only need to init necessary block for SMU to handle the reset */
> +                       for (i = 0; i < adev->num_ip_blocks; i++) {
> +                               if (!adev->ip_blocks[i].status.valid)
> +                                       continue;
> +                               if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> +                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
> +                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> +                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
> +                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
> +                                       DRM_DEBUG("IP %s disabed for hw_init.\n",
> +                                               adev->ip_blocks[i].version->funcs->name);
> +                                       adev->ip_blocks[i].status.hw = true;
> +                               }
> +                       }
> +               } else {
> +                       r = amdgpu_asic_reset(adev);
> +                       if (r) {
> +                               dev_err(adev->dev, "asic reset on init failed\n");
> +                               goto failed;
> +                       }
>                 }
>         }
>
> @@ -3536,19 +3575,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         /* enable clockgating, etc. after ib tests, etc. since some blocks require
>          * explicit gating rather than handling it automatically.
>          */
> -       r = amdgpu_device_ip_late_init(adev);
> -       if (r) {
> -               dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
> -               amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
> -               goto failed;
> +       if (!adev->gmc.xgmi.pending_reset) {
> +               r = amdgpu_device_ip_late_init(adev);
> +               if (r) {
> +                       dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
> +                       amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
> +                       goto failed;
> +               }
> +               /* must succeed. */
> +               amdgpu_ras_resume(adev);
> +               queue_delayed_work(system_wq, &adev->delayed_init_work,
> +                                  
> + msecs_to_jiffies(AMDGPU_RESUME_MS));
>         }
>
> -       /* must succeed. */
> -       amdgpu_ras_resume(adev);
> -
> -       queue_delayed_work(system_wq, &adev->delayed_init_work,
> -                          msecs_to_jiffies(AMDGPU_RESUME_MS));
> -
>         if (amdgpu_sriov_vf(adev))
>                 flush_delayed_work(&adev->delayed_init_work);
>
> @@ -3565,6 +3604,18 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         if (amdgpu_device_cache_pci_state(adev->pdev))
>                 pci_restore_state(pdev);
>
> +       if (adev->gmc.xgmi.pending_reset) {
> +               struct amdgpu_hive_info *hive = 
> + amdgpu_get_xgmi_hive(adev);
> +
> +               if (atomic_read(&hive->number_devices) ==
> +                   adev->gmc.xgmi.num_physical_nodes) {
> +                       /* Trigger hive reset when all gpus within the hive is ready */
> +                       dev_info(adev->dev, "Trigger XGMI reset during init.\n");
> +                       amdgpu_device_gpu_recover(adev, NULL);
> +               }
> +               amdgpu_put_xgmi_hive(hive);
> +       }
> +
>         return 0;
>
>  failed:
> @@ -4241,7 +4292,9 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>         int i, r = 0;
>         bool need_full_reset  = *need_full_reset_arg;
>
> -       amdgpu_debugfs_wait_dump(adev);
> +       /* no need to dump if device is not in good state during probe period */
> +       if (!adev->gmc.xgmi.pending_reset)
> +               amdgpu_debugfs_wait_dump(adev);
>
>         if (amdgpu_sriov_vf(adev)) {
>                 /* stop the data exchange thread */ @@ -4304,6 +4357,7 
> @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>                         /* For XGMI run all resets in parallel to speed up the process */
>                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) 
> {
> +                               tmp_adev->gmc.xgmi.pending_reset = 
> + false;
>                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>                                         r = -EALREADY;
>                         } else
> @@ -4343,10 +4397,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>                 if (need_full_reset) {
>                         /* post card */
> -                       if (amdgpu_device_asic_init(tmp_adev))
> +                       r = amdgpu_device_asic_init(tmp_adev);
> +                       if (r) {
>                                 dev_warn(tmp_adev->dev, "asic atom 
> init failed!");
> -
> -                       if (!r) {
> +                       } else {
>                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
>                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
>                                 if (r) @@ -4790,6 +4844,8 @@ int 
> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                 if (audio_suspended)
>                         amdgpu_device_resume_display_audio(tmp_adev);
>                 amdgpu_device_unlock_adev(tmp_adev);
> +               /*enable buffer function after reset */

Space between * and comment, e.g., /* Enable

> +               amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);

Will this cause issues with any of the other flows through this code?

[Shaoyunl] it won't , the function internally will check whether it's in reset state . so for normal gpu recovery called on runtime , this won't have side effect. 

>         }
>
>  skip_recovery:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index aa0c83776ce0..8c71d84a2fbe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -149,6 +149,7 @@ struct amdgpu_xgmi {
>         struct list_head head;
>         bool supported;
>         struct ras_common_if *ras_if;
> +       bool pending_reset;
>  };
>
>  struct amdgpu_gmc {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 839917eb7bc3..5e127a59b121 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -2261,6 +2261,12 @@ static int psp_load_fw(struct amdgpu_device *adev)
>                 goto failed;
>         }
>
> +       /*Don't do the real PSP HW init if we are pending on XGMI reset.
> +        *The above init probably should move to psp_sw_init
> +        */

Space between * and comment.  E.g., /* Don't

This whole thing seems kind of fragile.  Is there some way we can restructure the init code to provide two code high level code paths?
One for regular init and one for this case?  I feel like this will be prone to breakage if this code sees any changes.

Alex

[shaoyunl]  Agree, current psp code mix the SW init code in the HW init function,  to fix that will  involved some  changes not directly relative to this XGMI  reset and  other more tests . I would like to separate that from this serial change . 

Thanks 
Shaoyun.liu


> +       if (adev->gmc.xgmi.pending_reset)
> +               return 0;
> +
>  skip_memalloc:
>         ret = psp_hw_start(psp);
>         if (ret)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 659b385b27b5..b1c8fd90c1b0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>         if (!adev->gmc.xgmi.supported)
>                 return 0;
>
> -       if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
> +       if (!adev->gmc.xgmi.pending_reset &&
> +           amdgpu_device_ip_get_ip_block(adev, 
> + AMD_IP_BLOCK_TYPE_PSP)) {
>                 ret = psp_xgmi_initialize(&adev->psp);
>                 if (ret) {
>                         dev_err(adev->dev, @@ -538,7 +539,8 @@ int 
> amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>
>         task_barrier_add_task(&hive->tb);
>
> -       if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
> +       if (!adev->gmc.xgmi.pending_reset &&
> +           amdgpu_device_ip_get_ip_block(adev, 
> + AMD_IP_BLOCK_TYPE_PSP)) {
>                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>                         /* update node list for other device in the hive */
>                         if (tmp_adev != adev) {
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7Csh
> aoyun.liu%40amd.com%7C0641a47202924dde84c308d8d81aaaf3%7C3dd8961fe4884
> e608e11a82d994e183d%7C0%7C0%7C637496956338383230%7CUnknown%7CTWFpbGZsb
> 3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%
> 7C1000&amp;sdata=Yvp3D7pFpvK%2BRP4StG%2BTH8O0UlWYsbPJ54MzdyKbFGQ%3D&am
> p;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-02-23 17:54 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-19  1:18 [PATCH 1/4] drm/amdgpu: Reset the devices in the XGMI hive duirng probe shaoyunl
2021-02-19 23:11 ` Liu, Shaoyun
2021-02-22 16:14 ` Liu, Shaoyun
2021-02-23 16:47 ` Alex Deucher
2021-02-23 17:54   ` Liu, Shaoyun

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.