All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe
@ 2021-03-05 17:52 shaoyunl
  2021-03-05 19:27 ` Andrey Grodzovsky
  0 siblings, 1 reply; 12+ messages in thread
From: shaoyunl @ 2021-03-05 17:52 UTC (permalink / raw)
  To: amd-gfx; +Cc: shaoyunl

In passthrough configuration, the hypervisor will trigger the SBR (Secondary Bus Reset) to the devices
without syncing to each other. This could cause a device hang since, for an XGMI configuration, all the devices
within the hive need to be reset within a limited time slot. This series of patches tries to solve this issue
by cooperating with the new SMU, which will only do minimum housekeeping to respond to the SBR request but won't
do the real reset job, leaving it to the driver. The driver needs to do the whole SW init and minimum HW init
to bring up the SMU and trigger the reset (possibly BACO) on all the ASICs at the same time.

Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  13 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  71 ++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
 5 files changed, 165 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d46d3794699e..5602c6edee97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
 	uint32_t			num_gpu;
 	uint32_t			num_dgpu;
 	uint32_t			num_apu;
+
+	/* delayed reset_func for XGMI configuration if necessary */
+	struct delayed_work		delayed_reset_work;
+	bool				pending_reset;
 };
 
 #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
@@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
 
+int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+				  struct amdgpu_job *job,
+				  bool *need_full_reset_arg);
+
+int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+			  struct list_head *device_list_handle,
+			  bool *need_full_reset_arg,
+			  bool skip_hw_reset);
+
 int emu_soc_asic_init(struct amdgpu_device *adev);
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3c35b0c1e710..5b520f70e660 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
 		}
 	}
 
+	/* Don't post if we need to reset whole hive on init */
+	if (adev->gmc.xgmi.pending_reset)
+		return false;
+
 	if (adev->has_hw_reset) {
 		adev->has_hw_reset = false;
 		return true;
@@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
 				continue;
 
+			if (!adev->ip_blocks[i].status.sw)
+				continue;
+
 			/* no need to do the fw loading again if already done*/
 			if (adev->ip_blocks[i].status.hw == true)
 				break;
@@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 
 	if (adev->gmc.xgmi.num_physical_nodes > 1)
 		amdgpu_xgmi_add_device(adev);
-	amdgpu_amdkfd_device_init(adev);
+
+	/* Don't init kfd if whole hive need to be reset during init */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_amdkfd_device_init(adev);
 
 	amdgpu_fru_get_product_info(adev);
 
@@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
 			adev->ip_blocks[i].status.hw = false;
 			continue;
 		}
+
+		/* skip unnecessary suspend if we do not initialize them yet */
+		if (adev->gmc.xgmi.pending_reset &&
+		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+			adev->ip_blocks[i].status.hw = false;
+			continue;
+		}
 		/* XXX handle errors */
 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
 		/* XXX handle errors */
@@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	 *  E.g., driver was not cleanly unloaded previously, etc.
 	 */
 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
-		r = amdgpu_asic_reset(adev);
-		if (r) {
-			dev_err(adev->dev, "asic reset on init failed\n");
-			goto failed;
+		if (adev->gmc.xgmi.num_physical_nodes) {
+			dev_info(adev->dev, "Pending hive reset.\n");
+			adev->gmc.xgmi.pending_reset = true;
+			/* Only need to init necessary block for SMU to handle the reset */
+			for (i = 0; i < adev->num_ip_blocks; i++) {
+				if (!adev->ip_blocks[i].status.valid)
+					continue;
+				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
+					DRM_DEBUG("IP %s disabled for hw_init.\n",
+						adev->ip_blocks[i].version->funcs->name);
+					adev->ip_blocks[i].status.hw = true;
+				}
+			}
+		} else {
+			r = amdgpu_asic_reset(adev);
+			if (r) {
+				dev_err(adev->dev, "asic reset on init failed\n");
+				goto failed;
+			}
 		}
 	}
 
@@ -3541,19 +3579,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
 	 * explicit gating rather than handling it automatically.
 	 */
-	r = amdgpu_device_ip_late_init(adev);
-	if (r) {
-		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
-		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
-		goto failed;
+	if (!adev->gmc.xgmi.pending_reset) {
+		r = amdgpu_device_ip_late_init(adev);
+		if (r) {
+			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
+			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
+			goto failed;
+		}
+		/* must succeed. */
+		amdgpu_ras_resume(adev);
+		queue_delayed_work(system_wq, &adev->delayed_init_work,
+				   msecs_to_jiffies(AMDGPU_RESUME_MS));
 	}
 
-	/* must succeed. */
-	amdgpu_ras_resume(adev);
-
-	queue_delayed_work(system_wq, &adev->delayed_init_work,
-			   msecs_to_jiffies(AMDGPU_RESUME_MS));
-
 	if (amdgpu_sriov_vf(adev))
 		flush_delayed_work(&adev->delayed_init_work);
 
@@ -3570,6 +3608,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	if (amdgpu_device_cache_pci_state(adev->pdev))
 		pci_restore_state(pdev);
 
+	if (adev->gmc.xgmi.pending_reset)
+		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
+				   msecs_to_jiffies(AMDGPU_RESUME_MS));
+
 	return 0;
 
 failed:
@@ -4240,14 +4282,16 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
 }
 
 
-static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
-					struct amdgpu_job *job,
-					bool *need_full_reset_arg)
+int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+				  struct amdgpu_job *job,
+				  bool *need_full_reset_arg)
 {
 	int i, r = 0;
 	bool need_full_reset  = *need_full_reset_arg;
 
-	amdgpu_debugfs_wait_dump(adev);
+	/* no need to dump if device is not in good state during probe period */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_debugfs_wait_dump(adev);
 
 	if (amdgpu_sriov_vf(adev)) {
 		/* stop the data exchange thread */
@@ -4293,10 +4337,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	return r;
 }
 
-static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
-			       struct list_head *device_list_handle,
-			       bool *need_full_reset_arg,
-			       bool skip_hw_reset)
+int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+			  struct list_head *device_list_handle,
+			  bool *need_full_reset_arg,
+			  bool skip_hw_reset)
 {
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
@@ -4310,6 +4354,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+				tmp_adev->gmc.xgmi.pending_reset = false;
 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
 			} else
@@ -4348,10 +4393,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 		if (need_full_reset) {
 			/* post card */
-			if (amdgpu_device_asic_init(tmp_adev))
+			r = amdgpu_device_asic_init(tmp_adev);
+			if (r) {
 				dev_warn(tmp_adev->dev, "asic atom init failed!");
-
-			if (!r) {
+			} else {
 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
 				if (r)
@@ -4384,6 +4429,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 				 */
 				amdgpu_register_gpu_instance(tmp_adev);
 
+				if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
+					amdgpu_xgmi_add_device(tmp_adev);
+
 				r = amdgpu_device_ip_late_init(tmp_adev);
 				if (r)
 					goto out;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 253c59e0a100..aebe4bc561ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -44,6 +44,7 @@
 #include "amdgpu_amdkfd.h"
 
 #include "amdgpu_ras.h"
+#include "amdgpu_xgmi.h"
 
 /*
  * KMS wrapper.
@@ -167,8 +168,13 @@ uint amdgpu_freesync_vid_mode;
 int amdgpu_reset_method = -1; /* auto */
 int amdgpu_num_kcq = -1;
 
+static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
+
 struct amdgpu_mgpu_info mgpu_info = {
 	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
+	.delayed_reset_work = __DELAYED_WORK_INITIALIZER(
+			mgpu_info.delayed_reset_work,
+			amdgpu_drv_delayed_reset_work_handler, 0),
 };
 int amdgpu_ras_enable = -1;
 uint amdgpu_ras_mask = 0xffffffff;
@@ -1297,6 +1303,71 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
 	adev->mp1_state = PP_MP1_STATE_NONE;
 }
 
+/**
+ * amdgpu_drv_delayed_reset_work_handler - work handler for reset
+ *
+ * @work: work_struct.
+ */
+static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work)
+{
+	struct list_head device_list;
+	struct amdgpu_device *adev;
+	int i, r;
+	bool need_full_reset = true;
+
+	mutex_lock(&mgpu_info.mutex);
+	if (mgpu_info.pending_reset == true) {
+		mutex_unlock(&mgpu_info.mutex);
+		return;
+	}
+	mgpu_info.pending_reset = true;
+	mutex_unlock(&mgpu_info.mutex);
+
+	for (i = 0; i < mgpu_info.num_dgpu; i++) {
+		adev = mgpu_info.gpu_ins[i].adev;
+		r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
+		if (r) {
+			dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
+				r, adev_to_drm(adev)->unique);
+		}
+		if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
+			r = -EALREADY;
+	}
+	msleep(100);
+	for (i = 0; i < mgpu_info.num_dgpu; i++) {
+		adev = mgpu_info.gpu_ins[i].adev;
+		adev->gmc.xgmi.pending_reset = false;
+		flush_work(&adev->xgmi_reset_work);
+	}
+
+	msleep(100);
+	/* reset function will rebuild the xgmi hive info , clear it now */
+	for (i = 0; i < mgpu_info.num_dgpu; i++)
+		amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
+
+	INIT_LIST_HEAD(&device_list);
+
+	for (i = 0; i < mgpu_info.num_dgpu; i++)
+		list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list, &device_list);
+
+	/* unregister the GPU first, reset function will add them back */
+	list_for_each_entry(adev, &device_list, reset_list)
+		amdgpu_unregister_gpu_instance(adev);
+
+	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
+	if (r) {
+		DRM_ERROR("reinit gpus failure");
+		return;
+	}
+	for (i = 0; i < mgpu_info.num_dgpu; i++) {
+		adev = mgpu_info.gpu_ins[i].adev;
+		if (!adev->kfd.init_complete)
+			amdgpu_amdkfd_device_init(adev);
+		amdgpu_ttm_set_buffer_funcs_status(adev, true);
+	}
+	return;
+}
+
 static int amdgpu_pmops_suspend(struct device *dev)
 {
 	struct drm_device *drm_dev = dev_get_drvdata(dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index aa0c83776ce0..8c71d84a2fbe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -149,6 +149,7 @@ struct amdgpu_xgmi {
 	struct list_head head;
 	bool supported;
 	struct ras_common_if *ras_if;
+	bool pending_reset;
 };
 
 struct amdgpu_gmc {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 659b385b27b5..b459ef755ea9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 	if (!adev->gmc.xgmi.supported)
 		return 0;
 
-	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (!adev->gmc.xgmi.pending_reset &&
+	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		ret = psp_xgmi_initialize(&adev->psp);
 		if (ret) {
 			dev_err(adev->dev,
@@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 
 	task_barrier_add_task(&hive->tb);
 
-	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (!adev->gmc.xgmi.pending_reset &&
+	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			/* update node list for other device in the hive */
 			if (tmp_adev != adev) {
@@ -567,7 +569,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		}
 	}
 
-	if (!ret)
+	if (!ret && !adev->gmc.xgmi.pending_reset)
 		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
 
 exit_unlock:
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe
  2021-03-05 17:52 [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe shaoyunl
@ 2021-03-05 19:27 ` Andrey Grodzovsky
  2021-03-06  1:37   ` Liu, Shaoyun
  0 siblings, 1 reply; 12+ messages in thread
From: Andrey Grodzovsky @ 2021-03-05 19:27 UTC (permalink / raw)
  To: shaoyunl, amd-gfx



On 2021-03-05 12:52 p.m., shaoyunl wrote:
> In passthrough configuration, the hypervisor will trigger the SBR (Secondary Bus Reset) to the devices
> without syncing to each other. This could cause a device hang since, for an XGMI configuration, all the devices
> within the hive need to be reset within a limited time slot. This series of patches tries to solve this issue
> by cooperating with the new SMU, which will only do minimum housekeeping to respond to the SBR request but won't
> do the real reset job, leaving it to the driver. The driver needs to do the whole SW init and minimum HW init
> to bring up the SMU and trigger the reset (possibly BACO) on all the ASICs at the same time.
> 
> Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  13 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  71 ++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |   1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
>   5 files changed, 165 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d46d3794699e..5602c6edee97 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
>   	uint32_t			num_gpu;
>   	uint32_t			num_dgpu;
>   	uint32_t			num_apu;
> +
> +	/* delayed reset_func for XGMI configuration if necessary */
> +	struct delayed_work		delayed_reset_work;
> +	bool				pending_reset;
>   };
>   
>   #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
> @@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>   bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
>   bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
>   
> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> +				  struct amdgpu_job *job,
> +				  bool *need_full_reset_arg);
> +
> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
> +			  struct list_head *device_list_handle,
> +			  bool *need_full_reset_arg,
> +			  bool skip_hw_reset);
> +
>   int emu_soc_asic_init(struct amdgpu_device *adev);
>   
>   /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 3c35b0c1e710..5b520f70e660 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
>   		}
>   	}
>   
> +	/* Don't post if we need to reset whole hive on init */
> +	if (adev->gmc.xgmi.pending_reset)
> +		return false;
> +
>   	if (adev->has_hw_reset) {
>   		adev->has_hw_reset = false;
>   		return true;
> @@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>   			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
>   				continue;
>   
> +			if (!adev->ip_blocks[i].status.sw)
> +				continue;
> +
>   			/* no need to do the fw loading again if already done*/
>   			if (adev->ip_blocks[i].status.hw == true)
>   				break;
> @@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
>   
>   	if (adev->gmc.xgmi.num_physical_nodes > 1)
>   		amdgpu_xgmi_add_device(adev);
> -	amdgpu_amdkfd_device_init(adev);
> +
> +	/* Don't init kfd if whole hive need to be reset during init */
> +	if (!adev->gmc.xgmi.pending_reset)
> +		amdgpu_amdkfd_device_init(adev);
>   
>   	amdgpu_fru_get_product_info(adev);
>   
> @@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
>   			adev->ip_blocks[i].status.hw = false;
>   			continue;
>   		}
> +
> +		/* skip unnecessary suspend if we do not initialize them yet */
> +		if (adev->gmc.xgmi.pending_reset &&
> +		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
> +			adev->ip_blocks[i].status.hw = false;
> +			continue;
> +		}
>   		/* XXX handle errors */
>   		r = adev->ip_blocks[i].version->funcs->suspend(adev);
>   		/* XXX handle errors */
> @@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	 *  E.g., driver was not cleanly unloaded previously, etc.
>   	 */
>   	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
> -		r = amdgpu_asic_reset(adev);
> -		if (r) {
> -			dev_err(adev->dev, "asic reset on init failed\n");
> -			goto failed;
> +		if (adev->gmc.xgmi.num_physical_nodes) {
> +			dev_info(adev->dev, "Pending hive reset.\n");
> +			adev->gmc.xgmi.pending_reset = true;
> +			/* Only need to init necessary block for SMU to handle the reset */
> +			for (i = 0; i < adev->num_ip_blocks; i++) {
> +				if (!adev->ip_blocks[i].status.valid)
> +					continue;
> +				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
> +					DRM_DEBUG("IP %s disabled for hw_init.\n",
> +						adev->ip_blocks[i].version->funcs->name);
> +					adev->ip_blocks[i].status.hw = true;
> +				}
> +			}
> +		} else {
> +			r = amdgpu_asic_reset(adev);
> +			if (r) {
> +				dev_err(adev->dev, "asic reset on init failed\n");
> +				goto failed;
> +			}
>   		}
>   	}
>   
> @@ -3541,19 +3579,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	/* enable clockgating, etc. after ib tests, etc. since some blocks require
>   	 * explicit gating rather than handling it automatically.
>   	 */
> -	r = amdgpu_device_ip_late_init(adev);
> -	if (r) {
> -		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
> -		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
> -		goto failed;
> +	if (!adev->gmc.xgmi.pending_reset) {
> +		r = amdgpu_device_ip_late_init(adev);
> +		if (r) {
> +			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
> +			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
> +			goto failed;
> +		}
> +		/* must succeed. */
> +		amdgpu_ras_resume(adev);
> +		queue_delayed_work(system_wq, &adev->delayed_init_work,
> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>   	}
>   
> -	/* must succeed. */
> -	amdgpu_ras_resume(adev);
> -
> -	queue_delayed_work(system_wq, &adev->delayed_init_work,
> -			   msecs_to_jiffies(AMDGPU_RESUME_MS));
> -
>   	if (amdgpu_sriov_vf(adev))
>   		flush_delayed_work(&adev->delayed_init_work);
>   
> @@ -3570,6 +3608,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	if (amdgpu_device_cache_pci_state(adev->pdev))
>   		pci_restore_state(pdev);
>   
> +	if (adev->gmc.xgmi.pending_reset)
> +		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
> +
>   	return 0;
>   
>   failed:
> @@ -4240,14 +4282,16 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
>   }
>   
>   
> -static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> -					struct amdgpu_job *job,
> -					bool *need_full_reset_arg)
> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> +				  struct amdgpu_job *job,
> +				  bool *need_full_reset_arg)
>   {
>   	int i, r = 0;
>   	bool need_full_reset  = *need_full_reset_arg;
>   
> -	amdgpu_debugfs_wait_dump(adev);
> +	/* no need to dump if device is not in good state during probe period */
> +	if (!adev->gmc.xgmi.pending_reset)
> +		amdgpu_debugfs_wait_dump(adev);
>   
>   	if (amdgpu_sriov_vf(adev)) {
>   		/* stop the data exchange thread */
> @@ -4293,10 +4337,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   	return r;
>   }
>   
> -static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
> -			       struct list_head *device_list_handle,
> -			       bool *need_full_reset_arg,
> -			       bool skip_hw_reset)
> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
> +			  struct list_head *device_list_handle,
> +			  bool *need_full_reset_arg,
> +			  bool skip_hw_reset)
>   {
>   	struct amdgpu_device *tmp_adev = NULL;
>   	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
> @@ -4310,6 +4354,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>   			/* For XGMI run all resets in parallel to speed up the process */
>   			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
> +				tmp_adev->gmc.xgmi.pending_reset = false;
>   				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>   					r = -EALREADY;
>   			} else
> @@ -4348,10 +4393,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>   		if (need_full_reset) {
>   			/* post card */
> -			if (amdgpu_device_asic_init(tmp_adev))
> +			r = amdgpu_device_asic_init(tmp_adev);
> +			if (r) {
>   				dev_warn(tmp_adev->dev, "asic atom init failed!");
> -
> -			if (!r) {
> +			} else {
>   				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
>   				r = amdgpu_device_ip_resume_phase1(tmp_adev);
>   				if (r)
> @@ -4384,6 +4429,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   				 */
>   				amdgpu_register_gpu_instance(tmp_adev);
>   
> +				if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
> +					amdgpu_xgmi_add_device(tmp_adev);
> +
>   				r = amdgpu_device_ip_late_init(tmp_adev);
>   				if (r)
>   					goto out;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 253c59e0a100..aebe4bc561ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -44,6 +44,7 @@
>   #include "amdgpu_amdkfd.h"
>   
>   #include "amdgpu_ras.h"
> +#include "amdgpu_xgmi.h"
>   
>   /*
>    * KMS wrapper.
> @@ -167,8 +168,13 @@ uint amdgpu_freesync_vid_mode;
>   int amdgpu_reset_method = -1; /* auto */
>   int amdgpu_num_kcq = -1;
>   
> +static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
> +
>   struct amdgpu_mgpu_info mgpu_info = {
>   	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
> +	.delayed_reset_work = __DELAYED_WORK_INITIALIZER(
> +			mgpu_info.delayed_reset_work,
> +			amdgpu_drv_delayed_reset_work_handler, 0),
>   };
>   int amdgpu_ras_enable = -1;
>   uint amdgpu_ras_mask = 0xffffffff;
> @@ -1297,6 +1303,71 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
>   	adev->mp1_state = PP_MP1_STATE_NONE;
>   }
>   
> +/**
> + * amdgpu_drv_delayed_reset_work_handler - work handler for reset
> + *
> + * @work: work_struct.
> + */
> +static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work)
> +{
> +	struct list_head device_list;
> +	struct amdgpu_device *adev;
> +	int i, r;
> +	bool need_full_reset = true;
> +
> +	mutex_lock(&mgpu_info.mutex);
> +	if (mgpu_info.pending_reset == true) {
> +		mutex_unlock(&mgpu_info.mutex);
> +		return;
> +	}
> +	mgpu_info.pending_reset = true;
> +	mutex_unlock(&mgpu_info.mutex);
> +
> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
> +		adev = mgpu_info.gpu_ins[i].adev;
> +		r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);

Why amdgpu_device_pre_asic_reset is needed ?

> +		if (r) {
> +			dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
> +				r, adev_to_drm(adev)->unique);
> +		}
> +		if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
> +			r = -EALREADY;

amdgpu_do_asic_reset below will already schedule xgmi_reset_work for 
this device, what you could do instead is call amdgpu_do_asic_reset for 
each member of the hive and because there is a task barrier in 
amdgpu_device_xgmi_reset_func, it will synchronize all the resets to 
same point in time already.

> +	}
> +	msleep(100);

What is the 100 ms waiting for?

> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
> +		adev = mgpu_info.gpu_ins[i].adev;
> +		adev->gmc.xgmi.pending_reset = false;
> +		flush_work(&adev->xgmi_reset_work);
> +	}
> +
> +	msleep(100);

Same as above

> +	/* reset function will rebuild the xgmi hive info , clear it now */
> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
> +		amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
> +
> +	INIT_LIST_HEAD(&device_list);
> +
> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
> +		list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list, &device_list);
> +
> +	/* unregister the GPU first, reset function will add them back */
> +	list_for_each_entry(adev, &device_list, reset_list)
> +		amdgpu_unregister_gpu_instance(adev);
> +
> +	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
> +	if (r) {
> +		DRM_ERROR("reinit gpus failure");
> +		return;
> +	}
> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
> +		adev = mgpu_info.gpu_ins[i].adev;
> +		if (!adev->kfd.init_complete)
> +			amdgpu_amdkfd_device_init(adev);
> +		amdgpu_ttm_set_buffer_funcs_status(adev, true);
> +	}
> +	return;
> +}
> +
>   static int amdgpu_pmops_suspend(struct device *dev)
>   {
>   	struct drm_device *drm_dev = dev_get_drvdata(dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index aa0c83776ce0..8c71d84a2fbe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -149,6 +149,7 @@ struct amdgpu_xgmi {
>   	struct list_head head;
>   	bool supported;
>   	struct ras_common_if *ras_if;
> +	bool pending_reset;
>   };
>   
>   struct amdgpu_gmc {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 659b385b27b5..b459ef755ea9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   	if (!adev->gmc.xgmi.supported)
>   		return 0;
>   
> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
> +	if (!adev->gmc.xgmi.pending_reset &&
> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>   		ret = psp_xgmi_initialize(&adev->psp);
>   		if (ret) {
>   			dev_err(adev->dev,
> @@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   
>   	task_barrier_add_task(&hive->tb);
>   
> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
> +	if (!adev->gmc.xgmi.pending_reset &&
> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>   		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>   			/* update node list for other device in the hive */
>   			if (tmp_adev != adev) {
> @@ -567,7 +569,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   		}
>   	}
>   
> -	if (!ret)
> +	if (!ret && !adev->gmc.xgmi.pending_reset)
>   		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
>   
>   exit_unlock:
> 
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe
  2021-03-05 19:27 ` Andrey Grodzovsky
@ 2021-03-06  1:37   ` Liu, Shaoyun
  2021-03-06  6:09     ` Andrey Grodzovsky
  0 siblings, 1 reply; 12+ messages in thread
From: Liu, Shaoyun @ 2021-03-06  1:37 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx

[AMD Official Use Only - Internal Distribution Only]

Hi,  Andrey 
The existing reset functions (amdgpu_device_gpu_recover or amdgpu_do_asic_reset) assume the driver already has the correct hive info. But in my case, that's not true. The GPUs are in a bad state and the XGMI TA might not function properly, so the driver cannot get the hive and node info when probing the device. It means the driver doesn't even know which hive a device belongs to on a system with a multiple-hive configuration (e.g., 8 GPUs in two hives). The only solution I can think of is to let the driver trigger the reset on all GPUs at the same time after it does the minimum initialization on the HW (bring up the SMU IP), no matter whether they belong to the same hive or not, and call amdgpu_xgmi_add_device for each device after re-init.
The 100 ms delays were added after the BACO reset. I think they can be removed; let me verify it.

Regards
Shaoyun.liu 



-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> 
Sent: Friday, March 5, 2021 2:27 PM
To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe



On 2021-03-05 12:52 p.m., shaoyunl wrote:
> In passthrough configuration, hypervisior will trigger the 
> SBR(Secondary bus reset) to the devices without sync to each other. 
> This could cause device hang since for XGMI configuration, all the 
> devices within the hive need to be reset at a limit time slot. This 
> serial of patches try to solve this issue by co-operate with new SMU 
> which will only do minimum house keeping to response the SBR request 
> but don't do the real reset job and leave it to driver. Driver need to 
> do the whole sw init and minimum HW init to bring up the SMU and 
> trigger the reset(possibly BACO) on all the ASICs at the same time
> 
> Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  13 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  71 ++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |   1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
>   5 files changed, 165 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d46d3794699e..5602c6edee97 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
>   	uint32_t			num_gpu;
>   	uint32_t			num_dgpu;
>   	uint32_t			num_apu;
> +
> +	/* delayed reset_func for XGMI configuration if necessary */
> +	struct delayed_work		delayed_reset_work;
> +	bool				pending_reset;
>   };
>   
>   #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
> @@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>   bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
>   bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
>   
> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> +				  struct amdgpu_job *job,
> +				  bool *need_full_reset_arg);
> +
> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
> +			  struct list_head *device_list_handle,
> +			  bool *need_full_reset_arg,
> +			  bool skip_hw_reset);
> +
>   int emu_soc_asic_init(struct amdgpu_device *adev);
>   
>   /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 3c35b0c1e710..5b520f70e660 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
>   		}
>   	}
>   
> +	/* Don't post if we need to reset whole hive on init */
> +	if (adev->gmc.xgmi.pending_reset)
> +		return false;
> +
>   	if (adev->has_hw_reset) {
>   		adev->has_hw_reset = false;
>   		return true;
> @@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>   			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
>   				continue;
>   
> +			if (!adev->ip_blocks[i].status.sw)
> +				continue;
> +
>   			/* no need to do the fw loading again if already done*/
>   			if (adev->ip_blocks[i].status.hw == true)
>   				break;
> @@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct 
> amdgpu_device *adev)
>   
>   	if (adev->gmc.xgmi.num_physical_nodes > 1)
>   		amdgpu_xgmi_add_device(adev);
> -	amdgpu_amdkfd_device_init(adev);
> +
> +	/* Don't init kfd if whole hive need to be reset during init */
> +	if (!adev->gmc.xgmi.pending_reset)
> +		amdgpu_amdkfd_device_init(adev);
>   
>   	amdgpu_fru_get_product_info(adev);
>   
> @@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
>   			adev->ip_blocks[i].status.hw = false;
>   			continue;
>   		}
> +
> +		/* skip unnecessary suspend if we do not initialize them yet */
> +		if (adev->gmc.xgmi.pending_reset &&
> +		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
> +			adev->ip_blocks[i].status.hw = false;
> +			continue;
> +		}
>   		/* XXX handle errors */
>   		r = adev->ip_blocks[i].version->funcs->suspend(adev);
>   		/* XXX handle errors */
> @@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	 *  E.g., driver was not cleanly unloaded previously, etc.
>   	 */
>   	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
> -		r = amdgpu_asic_reset(adev);
> -		if (r) {
> -			dev_err(adev->dev, "asic reset on init failed\n");
> -			goto failed;
> +		if (adev->gmc.xgmi.num_physical_nodes) {
> +			dev_info(adev->dev, "Pending hive reset.\n");
> +			adev->gmc.xgmi.pending_reset = true;
> +			/* Only need to init necessary block for SMU to handle the reset */
> +			for (i = 0; i < adev->num_ip_blocks; i++) {
> +				if (!adev->ip_blocks[i].status.valid)
> +					continue;
> +				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
> +					DRM_DEBUG("IP %s disabed for hw_init.\n",
> +						adev->ip_blocks[i].version->funcs->name);
> +					adev->ip_blocks[i].status.hw = true;
> +				}
> +			}
> +		} else {
> +			r = amdgpu_asic_reset(adev);
> +			if (r) {
> +				dev_err(adev->dev, "asic reset on init failed\n");
> +				goto failed;
> +			}
>   		}
>   	}
>   
> @@ -3541,19 +3579,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	/* enable clockgating, etc. after ib tests, etc. since some blocks require
>   	 * explicit gating rather than handling it automatically.
>   	 */
> -	r = amdgpu_device_ip_late_init(adev);
> -	if (r) {
> -		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
> -		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
> -		goto failed;
> +	if (!adev->gmc.xgmi.pending_reset) {
> +		r = amdgpu_device_ip_late_init(adev);
> +		if (r) {
> +			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
> +			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
> +			goto failed;
> +		}
> +		/* must succeed. */
> +		amdgpu_ras_resume(adev);
> +		queue_delayed_work(system_wq, &adev->delayed_init_work,
> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>   	}
>   
> -	/* must succeed. */
> -	amdgpu_ras_resume(adev);
> -
> -	queue_delayed_work(system_wq, &adev->delayed_init_work,
> -			   msecs_to_jiffies(AMDGPU_RESUME_MS));
> -
>   	if (amdgpu_sriov_vf(adev))
>   		flush_delayed_work(&adev->delayed_init_work);
>   
> @@ -3570,6 +3608,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	if (amdgpu_device_cache_pci_state(adev->pdev))
>   		pci_restore_state(pdev);
>   
> +	if (adev->gmc.xgmi.pending_reset)
> +		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
> +
>   	return 0;
>   
>   failed:
> @@ -4240,14 +4282,16 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
>   }
>   
>   
> -static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> -					struct amdgpu_job *job,
> -					bool *need_full_reset_arg)
> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> +				  struct amdgpu_job *job,
> +				  bool *need_full_reset_arg)
>   {
>   	int i, r = 0;
>   	bool need_full_reset  = *need_full_reset_arg;
>   
> -	amdgpu_debugfs_wait_dump(adev);
> +	/* no need to dump if device is not in good state during probe period */
> +	if (!adev->gmc.xgmi.pending_reset)
> +		amdgpu_debugfs_wait_dump(adev);
>   
>   	if (amdgpu_sriov_vf(adev)) {
>   		/* stop the data exchange thread */ @@ -4293,10 +4337,10 @@ static 
> int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   	return r;
>   }
>   
> -static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
> -			       struct list_head *device_list_handle,
> -			       bool *need_full_reset_arg,
> -			       bool skip_hw_reset)
> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
> +			  struct list_head *device_list_handle,
> +			  bool *need_full_reset_arg,
> +			  bool skip_hw_reset)
>   {
>   	struct amdgpu_device *tmp_adev = NULL;
>   	bool need_full_reset = *need_full_reset_arg, vram_lost = false; @@ 
> -4310,6 +4354,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>   			/* For XGMI run all resets in parallel to speed up the process */
>   			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
> +				tmp_adev->gmc.xgmi.pending_reset = false;
>   				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>   					r = -EALREADY;
>   			} else
> @@ -4348,10 +4393,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>   		if (need_full_reset) {
>   			/* post card */
> -			if (amdgpu_device_asic_init(tmp_adev))
> +			r = amdgpu_device_asic_init(tmp_adev);
> +			if (r) {
>   				dev_warn(tmp_adev->dev, "asic atom init failed!");
> -
> -			if (!r) {
> +			} else {
>   				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
>   				r = amdgpu_device_ip_resume_phase1(tmp_adev);
>   				if (r)
> @@ -4384,6 +4429,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   				 */
>   				amdgpu_register_gpu_instance(tmp_adev);
>   
> +				if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
> +					amdgpu_xgmi_add_device(tmp_adev);
> +
>   				r = amdgpu_device_ip_late_init(tmp_adev);
>   				if (r)
>   					goto out;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 253c59e0a100..aebe4bc561ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -44,6 +44,7 @@
>   #include "amdgpu_amdkfd.h"
>   
>   #include "amdgpu_ras.h"
> +#include "amdgpu_xgmi.h"
>   
>   /*
>    * KMS wrapper.
> @@ -167,8 +168,13 @@ uint amdgpu_freesync_vid_mode;
>   int amdgpu_reset_method = -1; /* auto */
>   int amdgpu_num_kcq = -1;
>   
> +static void amdgpu_drv_delayed_reset_work_handler(struct work_struct 
> +*work);
> +
>   struct amdgpu_mgpu_info mgpu_info = {
>   	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
> +	.delayed_reset_work = __DELAYED_WORK_INITIALIZER(
> +			mgpu_info.delayed_reset_work,
> +			amdgpu_drv_delayed_reset_work_handler, 0),
>   };
>   int amdgpu_ras_enable = -1;
>   uint amdgpu_ras_mask = 0xffffffff;
> @@ -1297,6 +1303,71 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
>   	adev->mp1_state = PP_MP1_STATE_NONE;
>   }
>   
> +/**
> + * amdgpu_drv_delayed_reset_work_handler - work handler for reset
> + *
> + * @work: work_struct.
> + */
> +static void amdgpu_drv_delayed_reset_work_handler(struct work_struct 
> +*work) {
> +	struct list_head device_list;
> +	struct amdgpu_device *adev;
> +	int i, r;
> +	bool need_full_reset = true;
> +
> +	mutex_lock(&mgpu_info.mutex);
> +	if (mgpu_info.pending_reset == true) {
> +		mutex_unlock(&mgpu_info.mutex);
> +		return;
> +	}
> +	mgpu_info.pending_reset = true;
> +	mutex_unlock(&mgpu_info.mutex);
> +
> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
> +		adev = mgpu_info.gpu_ins[i].adev;
> +		r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);

Why is amdgpu_device_pre_asic_reset needed?

> +		if (r) {
> +			dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
> +				r, adev_to_drm(adev)->unique);
> +		}
> +		if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
> +			r = -EALREADY;

amdgpu_do_asic_reset below will already schedule xgmi_reset_work for this device; what you could do instead is call amdgpu_do_asic_reset for each member of the hive, and because there is a task barrier in amdgpu_device_xgmi_reset_func, it will already synchronize all the resets to the same point in time.

> +	}
> +	msleep(100);

What is the 100 ms waiting for?

> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
> +		adev = mgpu_info.gpu_ins[i].adev;
> +		adev->gmc.xgmi.pending_reset = false;
> +		flush_work(&adev->xgmi_reset_work);
> +	}
> +
> +	msleep(100);

Same as above

> +	/* reset function will rebuild the xgmi hive info , clear it now */
> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
> +		amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
> +
> +	INIT_LIST_HEAD(&device_list);
> +
> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
> +		list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list, 
> +&device_list);
> +
> +	/* unregister the GPU first, reset function will add them back */
> +	list_for_each_entry(adev, &device_list, reset_list)
> +		amdgpu_unregister_gpu_instance(adev);
> +
> +	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
> +	if (r) {
> +		DRM_ERROR("reinit gpus failure");
> +		return;
> +	}
> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
> +		adev = mgpu_info.gpu_ins[i].adev;
> +		if (!adev->kfd.init_complete)
> +			amdgpu_amdkfd_device_init(adev);
> +		amdgpu_ttm_set_buffer_funcs_status(adev, true);
> +	}
> +	return;
> +}
> +
>   static int amdgpu_pmops_suspend(struct device *dev)
>   {
>   	struct drm_device *drm_dev = dev_get_drvdata(dev); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index aa0c83776ce0..8c71d84a2fbe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -149,6 +149,7 @@ struct amdgpu_xgmi {
>   	struct list_head head;
>   	bool supported;
>   	struct ras_common_if *ras_if;
> +	bool pending_reset;
>   };
>   
>   struct amdgpu_gmc {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 659b385b27b5..b459ef755ea9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   	if (!adev->gmc.xgmi.supported)
>   		return 0;
>   
> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
> +	if (!adev->gmc.xgmi.pending_reset &&
> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>   		ret = psp_xgmi_initialize(&adev->psp);
>   		if (ret) {
>   			dev_err(adev->dev,
> @@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device 
> *adev)
>   
>   	task_barrier_add_task(&hive->tb);
>   
> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
> +	if (!adev->gmc.xgmi.pending_reset &&
> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>   		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>   			/* update node list for other device in the hive */
>   			if (tmp_adev != adev) {
> @@ -567,7 +569,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   		}
>   	}
>   
> -	if (!ret)
> +	if (!ret && !adev->gmc.xgmi.pending_reset)
>   		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
>   
>   exit_unlock:
> 
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
  2021-03-06  1:37   ` Liu, Shaoyun
@ 2021-03-06  6:09     ` Andrey Grodzovsky
  2021-03-06 20:41       ` Liu, Shaoyun
  0 siblings, 1 reply; 12+ messages in thread
From: Andrey Grodzovsky @ 2021-03-06  6:09 UTC (permalink / raw)
  To: Liu, Shaoyun, amd-gfx

Thanks for explaining this. One thing I still don't understand is why 
you schedule the reset work explicitly at the beginning of 
amdgpu_drv_delayed_reset_work_handler and then also call 
amdgpu_do_asic_reset, which will do the same thing too. It looks like the 
physical reset will execute twice for each device.
Another thing, more of an improvement suggestion - currently you 
schedule delayed_reset_work using AMDGPU_RESUME_MS - so I guess this 
should give enough time for the SMU to start? Is there maybe a way to 
instead poll for SMU start completion and then execute this - some SMU 
status registers maybe? Just to avoid relying on this arbitrary value.

Andrey

On 2021-03-05 8:37 p.m., Liu, Shaoyun wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> Hi,  Andrey
> The existing reset function (amdgpu_device_gpu_recover or amd do_asic _reset) assumed driver already have  the correct hive info . But in my case, it's  not true . The gpus are in a bad state and the XGMI TA  might not functional properly , so driver can  not  get the hive and node info when probe the device .  It means driver even don't know  the device belongs to which hive on a system with multiple hive configuration (ex, 8 gpus in  two hive). The only solution I can think of is let driver trigger the reset on all gpus at the same time after driver do the minimum initialization on the HW ( bring up the  SMU IP)  no matter they belongs to the same hive or not and call amdgpu_xgmi_add_device for each device after re-init .
> The 100 ms delay added after the baco reset . I think they can be removed . let me verify it.
> 
> Regards
> Shaoyun.liu
> 
> 
> 
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Friday, March 5, 2021 2:27 PM
> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
> 
> 
> 
> On 2021-03-05 12:52 p.m., shaoyunl wrote:
>> In passthrough configuration, hypervisior will trigger the
>> SBR(Secondary bus reset) to the devices without sync to each other.
>> This could cause device hang since for XGMI configuration, all the
>> devices within the hive need to be reset at a limit time slot. This
>> serial of patches try to solve this issue by co-operate with new SMU
>> which will only do minimum house keeping to response the SBR request
>> but don't do the real reset job and leave it to driver. Driver need to
>> do the whole sw init and minimum HW init to bring up the SMU and
>> trigger the reset(possibly BACO) on all the ASICs at the same time
>>
>> Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
>> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  13 +++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++------
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  71 ++++++++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |   1 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
>>    5 files changed, 165 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index d46d3794699e..5602c6edee97 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
>>    	uint32_t			num_gpu;
>>    	uint32_t			num_dgpu;
>>    	uint32_t			num_apu;
>> +
>> +	/* delayed reset_func for XGMI configuration if necessary */
>> +	struct delayed_work		delayed_reset_work;
>> +	bool				pending_reset;
>>    };
>>    
>>    #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
>> @@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>>    bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
>>    bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
>>    
>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>> +				  struct amdgpu_job *job,
>> +				  bool *need_full_reset_arg);
>> +
>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>> +			  struct list_head *device_list_handle,
>> +			  bool *need_full_reset_arg,
>> +			  bool skip_hw_reset);
>> +
>>    int emu_soc_asic_init(struct amdgpu_device *adev);
>>    
>>    /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 3c35b0c1e710..5b520f70e660 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
>>    		}
>>    	}
>>    
>> +	/* Don't post if we need to reset whole hive on init */
>> +	if (adev->gmc.xgmi.pending_reset)
>> +		return false;
>> +
>>    	if (adev->has_hw_reset) {
>>    		adev->has_hw_reset = false;
>>    		return true;
>> @@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>>    			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
>>    				continue;
>>    
>> +			if (!adev->ip_blocks[i].status.sw)
>> +				continue;
>> +
>>    			/* no need to do the fw loading again if already done*/
>>    			if (adev->ip_blocks[i].status.hw == true)
>>    				break;
>> @@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct
>> amdgpu_device *adev)
>>    
>>    	if (adev->gmc.xgmi.num_physical_nodes > 1)
>>    		amdgpu_xgmi_add_device(adev);
>> -	amdgpu_amdkfd_device_init(adev);
>> +
>> +	/* Don't init kfd if whole hive need to be reset during init */
>> +	if (!adev->gmc.xgmi.pending_reset)
>> +		amdgpu_amdkfd_device_init(adev);
>>    
>>    	amdgpu_fru_get_product_info(adev);
>>    
>> @@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
>>    			adev->ip_blocks[i].status.hw = false;
>>    			continue;
>>    		}
>> +
>> +		/* skip unnecessary suspend if we do not initialize them yet */
>> +		if (adev->gmc.xgmi.pending_reset &&
>> +		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
>> +			adev->ip_blocks[i].status.hw = false;
>> +			continue;
>> +		}
>>    		/* XXX handle errors */
>>    		r = adev->ip_blocks[i].version->funcs->suspend(adev);
>>    		/* XXX handle errors */
>> @@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>    	 *  E.g., driver was not cleanly unloaded previously, etc.
>>    	 */
>>    	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
>> -		r = amdgpu_asic_reset(adev);
>> -		if (r) {
>> -			dev_err(adev->dev, "asic reset on init failed\n");
>> -			goto failed;
>> +		if (adev->gmc.xgmi.num_physical_nodes) {
>> +			dev_info(adev->dev, "Pending hive reset.\n");
>> +			adev->gmc.xgmi.pending_reset = true;
>> +			/* Only need to init necessary block for SMU to handle the reset */
>> +			for (i = 0; i < adev->num_ip_blocks; i++) {
>> +				if (!adev->ip_blocks[i].status.valid)
>> +					continue;
>> +				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
>> +					DRM_DEBUG("IP %s disabed for hw_init.\n",
>> +						adev->ip_blocks[i].version->funcs->name);
>> +					adev->ip_blocks[i].status.hw = true;
>> +				}
>> +			}
>> +		} else {
>> +			r = amdgpu_asic_reset(adev);
>> +			if (r) {
>> +				dev_err(adev->dev, "asic reset on init failed\n");
>> +				goto failed;
>> +			}
>>    		}
>>    	}
>>    
>> @@ -3541,19 +3579,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>    	/* enable clockgating, etc. after ib tests, etc. since some blocks require
>>    	 * explicit gating rather than handling it automatically.
>>    	 */
>> -	r = amdgpu_device_ip_late_init(adev);
>> -	if (r) {
>> -		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>> -		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>> -		goto failed;
>> +	if (!adev->gmc.xgmi.pending_reset) {
>> +		r = amdgpu_device_ip_late_init(adev);
>> +		if (r) {
>> +			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>> +			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>> +			goto failed;
>> +		}
>> +		/* must succeed. */
>> +		amdgpu_ras_resume(adev);
>> +		queue_delayed_work(system_wq, &adev->delayed_init_work,
>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>    	}
>>    
>> -	/* must succeed. */
>> -	amdgpu_ras_resume(adev);
>> -
>> -	queue_delayed_work(system_wq, &adev->delayed_init_work,
>> -			   msecs_to_jiffies(AMDGPU_RESUME_MS));
>> -
>>    	if (amdgpu_sriov_vf(adev))
>>    		flush_delayed_work(&adev->delayed_init_work);
>>    
>> @@ -3570,6 +3608,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>    	if (amdgpu_device_cache_pci_state(adev->pdev))
>>    		pci_restore_state(pdev);
>>    
>> +	if (adev->gmc.xgmi.pending_reset)
>> +		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>> +
>>    	return 0;
>>    
>>    failed:
>> @@ -4240,14 +4282,16 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
>>    }
>>    
>>    
>> -static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>> -					struct amdgpu_job *job,
>> -					bool *need_full_reset_arg)
>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>> +				  struct amdgpu_job *job,
>> +				  bool *need_full_reset_arg)
>>    {
>>    	int i, r = 0;
>>    	bool need_full_reset  = *need_full_reset_arg;
>>    
>> -	amdgpu_debugfs_wait_dump(adev);
>> +	/* no need to dump if device is not in good state during probe period */
>> +	if (!adev->gmc.xgmi.pending_reset)
>> +		amdgpu_debugfs_wait_dump(adev);
>>    
>>    	if (amdgpu_sriov_vf(adev)) {
>>    		/* stop the data exchange thread */ @@ -4293,10 +4337,10 @@ static
>> int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>    	return r;
>>    }
>>    
>> -static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>> -			       struct list_head *device_list_handle,
>> -			       bool *need_full_reset_arg,
>> -			       bool skip_hw_reset)
>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>> +			  struct list_head *device_list_handle,
>> +			  bool *need_full_reset_arg,
>> +			  bool skip_hw_reset)
>>    {
>>    	struct amdgpu_device *tmp_adev = NULL;
>>    	bool need_full_reset = *need_full_reset_arg, vram_lost = false; @@
>> -4310,6 +4354,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>    		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>    			/* For XGMI run all resets in parallel to speed up the process */
>>    			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
>> +				tmp_adev->gmc.xgmi.pending_reset = false;
>>    				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>    					r = -EALREADY;
>>    			} else
>> @@ -4348,10 +4393,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>    	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>    		if (need_full_reset) {
>>    			/* post card */
>> -			if (amdgpu_device_asic_init(tmp_adev))
>> +			r = amdgpu_device_asic_init(tmp_adev);
>> +			if (r) {
>>    				dev_warn(tmp_adev->dev, "asic atom init failed!");
>> -
>> -			if (!r) {
>> +			} else {
>>    				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
>>    				r = amdgpu_device_ip_resume_phase1(tmp_adev);
>>    				if (r)
>> @@ -4384,6 +4429,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>    				 */
>>    				amdgpu_register_gpu_instance(tmp_adev);
>>    
>> +				if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
>> +					amdgpu_xgmi_add_device(tmp_adev);
>> +
>>    				r = amdgpu_device_ip_late_init(tmp_adev);
>>    				if (r)
>>    					goto out;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 253c59e0a100..aebe4bc561ee 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -44,6 +44,7 @@
>>    #include "amdgpu_amdkfd.h"
>>    
>>    #include "amdgpu_ras.h"
>> +#include "amdgpu_xgmi.h"
>>    
>>    /*
>>     * KMS wrapper.
>> @@ -167,8 +168,13 @@ uint amdgpu_freesync_vid_mode;
>>    int amdgpu_reset_method = -1; /* auto */
>>    int amdgpu_num_kcq = -1;
>>    
>> +static void amdgpu_drv_delayed_reset_work_handler(struct work_struct
>> +*work);
>> +
>>    struct amdgpu_mgpu_info mgpu_info = {
>>    	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
>> +	.delayed_reset_work = __DELAYED_WORK_INITIALIZER(
>> +			mgpu_info.delayed_reset_work,
>> +			amdgpu_drv_delayed_reset_work_handler, 0),
>>    };
>>    int amdgpu_ras_enable = -1;
>>    uint amdgpu_ras_mask = 0xffffffff;
>> @@ -1297,6 +1303,71 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
>>    	adev->mp1_state = PP_MP1_STATE_NONE;
>>    }
>>    
>> +/**
>> + * amdgpu_drv_delayed_reset_work_handler - work handler for reset
>> + *
>> + * @work: work_struct.
>> + */
>> +static void amdgpu_drv_delayed_reset_work_handler(struct work_struct
>> +*work) {
>> +	struct list_head device_list;
>> +	struct amdgpu_device *adev;
>> +	int i, r;
>> +	bool need_full_reset = true;
>> +
>> +	mutex_lock(&mgpu_info.mutex);
>> +	if (mgpu_info.pending_reset == true) {
>> +		mutex_unlock(&mgpu_info.mutex);
>> +		return;
>> +	}
>> +	mgpu_info.pending_reset = true;
>> +	mutex_unlock(&mgpu_info.mutex);
>> +
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>> +		adev = mgpu_info.gpu_ins[i].adev;
>> +		r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
> 
> Why is amdgpu_device_pre_asic_reset needed?
> 
>> +		if (r) {
>> +			dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
>> +				r, adev_to_drm(adev)->unique);
>> +		}
>> +		if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
>> +			r = -EALREADY;
> 
> amdgpu_do_asic_reset below will already schedule xgmi_reset_work for this device; what you could do instead is call amdgpu_do_asic_reset for each member of the hive, and because there is a task barrier in amdgpu_device_xgmi_reset_func, it will already synchronize all the resets to the same point in time.
> 
>> +	}
>> +	msleep(100);
> 
> What is the 100 ms waiting for?
> 
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>> +		adev = mgpu_info.gpu_ins[i].adev;
>> +		adev->gmc.xgmi.pending_reset = false;
>> +		flush_work(&adev->xgmi_reset_work);
>> +	}
>> +
>> +	msleep(100);
> 
> Same as above
> 
>> +	/* reset function will rebuild the xgmi hive info , clear it now */
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>> +		amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
>> +
>> +	INIT_LIST_HEAD(&device_list);
>> +
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>> +		list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list,
>> +&device_list);
>> +
>> +	/* unregister the GPU first, reset function will add them back */
>> +	list_for_each_entry(adev, &device_list, reset_list)
>> +		amdgpu_unregister_gpu_instance(adev);
>> +
>> +	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
>> +	if (r) {
>> +		DRM_ERROR("reinit gpus failure");
>> +		return;
>> +	}
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>> +		adev = mgpu_info.gpu_ins[i].adev;
>> +		if (!adev->kfd.init_complete)
>> +			amdgpu_amdkfd_device_init(adev);
>> +		amdgpu_ttm_set_buffer_funcs_status(adev, true);
>> +	}
>> +	return;
>> +}
>> +
>>    static int amdgpu_pmops_suspend(struct device *dev)
>>    {
>>    	struct drm_device *drm_dev = dev_get_drvdata(dev); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> index aa0c83776ce0..8c71d84a2fbe 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> @@ -149,6 +149,7 @@ struct amdgpu_xgmi {
>>    	struct list_head head;
>>    	bool supported;
>>    	struct ras_common_if *ras_if;
>> +	bool pending_reset;
>>    };
>>    
>>    struct amdgpu_gmc {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> index 659b385b27b5..b459ef755ea9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> @@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>    	if (!adev->gmc.xgmi.supported)
>>    		return 0;
>>    
>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>> +	if (!adev->gmc.xgmi.pending_reset &&
>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>    		ret = psp_xgmi_initialize(&adev->psp);
>>    		if (ret) {
>>    			dev_err(adev->dev,
>> @@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device
>> *adev)
>>    
>>    	task_barrier_add_task(&hive->tb);
>>    
>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>> +	if (!adev->gmc.xgmi.pending_reset &&
>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>    		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>>    			/* update node list for other device in the hive */
>>    			if (tmp_adev != adev) {
>> @@ -567,7 +569,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>    		}
>>    	}
>>    
>> -	if (!ret)
>> +	if (!ret && !adev->gmc.xgmi.pending_reset)
>>    		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
>>    
>>    exit_unlock:
>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
  2021-03-06  6:09     ` Andrey Grodzovsky
@ 2021-03-06 20:41       ` Liu, Shaoyun
  2021-03-06 21:11         ` Liu, Shaoyun
  0 siblings, 1 reply; 12+ messages in thread
From: Liu, Shaoyun @ 2021-03-06 20:41 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx

[AMD Official Use Only - Internal Distribution Only]

I call amdgpu_do_asic_reset with the parameter skip_hw_reset = true so the reset won't be executed twice. But I can probably set this parameter to true and remove the code that schedules the reset, since I now build the device_list not based on the hive. Let me try that.
Regarding scheduling the delayed work thread with AMDGPU_RESUME_MS: it is actually not waiting for the SMU to start. As I explained, I need to reset all the GPUs in the system since I don't know which GPUs belong to which hive. So this time allows the system to probe all the GPUs in the system, which means that by the time this delayed thread starts, we can assume all the devices have already been populated in mgpu_info.

Regards
Shaoyun.liu

-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> 
Sent: Saturday, March 6, 2021 1:09 AM
To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe

Thanks for explaining this. One thing I still don't understand is why you schedule the reset work explicitly at the beginning of amdgpu_drv_delayed_reset_work_handler and then also call amdgpu_do_asic_reset, which will do the same thing too. It looks like the physical reset will execute twice for each device.
Another thing, more of an improvement suggestion: currently you schedule delayed_reset_work using AMDGPU_RESUME_MS - so I guess this should give enough time for the SMU to start? Is there maybe a way to instead poll for SMU start completion and then execute this - some SMU status registers maybe? Just to avoid relying on this arbitrary value.

Andrey

On 2021-03-05 8:37 p.m., Liu, Shaoyun wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> Hi,  Andrey
> The existing reset functions (amdgpu_device_gpu_recover or amdgpu_do_asic_reset) assume the driver already has the correct hive info. But in my case, that's not true. The GPUs are in a bad state and the XGMI TA might not function properly, so the driver cannot get the hive and node info when probing the device. That means the driver doesn't even know which hive a device belongs to on a system with a multiple-hive configuration (e.g., 8 GPUs in two hives). The only solution I can think of is to let the driver trigger the reset on all GPUs at the same time after the driver does the minimum initialization on the HW (bring up the SMU IP), no matter whether they belong to the same hive or not, and call amdgpu_xgmi_add_device for each device after re-init.
> The 100 ms delays were added after the BACO reset. I think they can be removed; let me verify it.
> 
> Regards
> Shaoyun.liu
> 
> 
> 
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Friday, March 5, 2021 2:27 PM
> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI 
> hive duirng probe
> 
> 
> 
> On 2021-03-05 12:52 p.m., shaoyunl wrote:
>> In passthrough configuration, hypervisior will trigger the 
>> SBR(Secondary bus reset) to the devices without sync to each other.
>> This could cause device hang since for XGMI configuration, all the 
>> devices within the hive need to be reset at a limit time slot. This 
>> serial of patches try to solve this issue by co-operate with new SMU 
>> which will only do minimum house keeping to response the SBR request 
>> but don't do the real reset job and leave it to driver. Driver need 
>> to do the whole sw init and minimum HW init to bring up the SMU and 
>> trigger the reset(possibly BACO) on all the ASICs at the same time
>>
>> Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
>> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  13 +++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++------
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  71 ++++++++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |   1 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
>>    5 files changed, 165 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index d46d3794699e..5602c6edee97 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
>>    	uint32_t			num_gpu;
>>    	uint32_t			num_dgpu;
>>    	uint32_t			num_apu;
>> +
>> +	/* delayed reset_func for XGMI configuration if necessary */
>> +	struct delayed_work		delayed_reset_work;
>> +	bool				pending_reset;
>>    };
>>    
>>    #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
>> @@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>>    bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
>>    bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
>>    
>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>> +				  struct amdgpu_job *job,
>> +				  bool *need_full_reset_arg);
>> +
>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>> +			  struct list_head *device_list_handle,
>> +			  bool *need_full_reset_arg,
>> +			  bool skip_hw_reset);
>> +
>>    int emu_soc_asic_init(struct amdgpu_device *adev);
>>    
>>    /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 3c35b0c1e710..5b520f70e660 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
>>    		}
>>    	}
>>    
>> +	/* Don't post if we need to reset whole hive on init */
>> +	if (adev->gmc.xgmi.pending_reset)
>> +		return false;
>> +
>>    	if (adev->has_hw_reset) {
>>    		adev->has_hw_reset = false;
>>    		return true;
>> @@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>>    			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
>>    				continue;
>>    
>> +			if (!adev->ip_blocks[i].status.sw)
>> +				continue;
>> +
>>    			/* no need to do the fw loading again if already done*/
>>    			if (adev->ip_blocks[i].status.hw == true)
>>    				break;
>> @@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct 
>> amdgpu_device *adev)
>>    
>>    	if (adev->gmc.xgmi.num_physical_nodes > 1)
>>    		amdgpu_xgmi_add_device(adev);
>> -	amdgpu_amdkfd_device_init(adev);
>> +
>> +	/* Don't init kfd if whole hive need to be reset during init */
>> +	if (!adev->gmc.xgmi.pending_reset)
>> +		amdgpu_amdkfd_device_init(adev);
>>    
>>    	amdgpu_fru_get_product_info(adev);
>>    
>> @@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
>>    			adev->ip_blocks[i].status.hw = false;
>>    			continue;
>>    		}
>> +
>> +		/* skip unnecessary suspend if we do not initialize them yet */
>> +		if (adev->gmc.xgmi.pending_reset &&
>> +		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
>> +			adev->ip_blocks[i].status.hw = false;
>> +			continue;
>> +		}
>>    		/* XXX handle errors */
>>    		r = adev->ip_blocks[i].version->funcs->suspend(adev);
>>    		/* XXX handle errors */
>> @@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>    	 *  E.g., driver was not cleanly unloaded previously, etc.
>>    	 */
>>    	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
>> -		r = amdgpu_asic_reset(adev);
>> -		if (r) {
>> -			dev_err(adev->dev, "asic reset on init failed\n");
>> -			goto failed;
>> +		if (adev->gmc.xgmi.num_physical_nodes) {
>> +			dev_info(adev->dev, "Pending hive reset.\n");
>> +			adev->gmc.xgmi.pending_reset = true;
>> +			/* Only need to init necessary block for SMU to handle the reset */
>> +			for (i = 0; i < adev->num_ip_blocks; i++) {
>> +				if (!adev->ip_blocks[i].status.valid)
>> +					continue;
>> +				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
>> +					DRM_DEBUG("IP %s disabed for hw_init.\n",
>> +						adev->ip_blocks[i].version->funcs->name);
>> +					adev->ip_blocks[i].status.hw = true;
>> +				}
>> +			}
>> +		} else {
>> +			r = amdgpu_asic_reset(adev);
>> +			if (r) {
>> +				dev_err(adev->dev, "asic reset on init failed\n");
>> +				goto failed;
>> +			}
>>    		}
>>    	}
>>    
>> @@ -3541,19 +3579,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>    	/* enable clockgating, etc. after ib tests, etc. since some blocks require
>>    	 * explicit gating rather than handling it automatically.
>>    	 */
>> -	r = amdgpu_device_ip_late_init(adev);
>> -	if (r) {
>> -		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>> -		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>> -		goto failed;
>> +	if (!adev->gmc.xgmi.pending_reset) {
>> +		r = amdgpu_device_ip_late_init(adev);
>> +		if (r) {
>> +			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>> +			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>> +			goto failed;
>> +		}
>> +		/* must succeed. */
>> +		amdgpu_ras_resume(adev);
>> +		queue_delayed_work(system_wq, &adev->delayed_init_work,
>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>    	}
>>    
>> -	/* must succeed. */
>> -	amdgpu_ras_resume(adev);
>> -
>> -	queue_delayed_work(system_wq, &adev->delayed_init_work,
>> -			   msecs_to_jiffies(AMDGPU_RESUME_MS));
>> -
>>    	if (amdgpu_sriov_vf(adev))
>>    		flush_delayed_work(&adev->delayed_init_work);
>>    
>> @@ -3570,6 +3608,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>    	if (amdgpu_device_cache_pci_state(adev->pdev))
>>    		pci_restore_state(pdev);
>>    
>> +	if (adev->gmc.xgmi.pending_reset)
>> +		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>> +
>>    	return 0;
>>    
>>    failed:
>> @@ -4240,14 +4282,16 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
>>    }
>>    
>>    
>> -static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>> -					struct amdgpu_job *job,
>> -					bool *need_full_reset_arg)
>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>> +				  struct amdgpu_job *job,
>> +				  bool *need_full_reset_arg)
>>    {
>>    	int i, r = 0;
>>    	bool need_full_reset  = *need_full_reset_arg;
>>    
>> -	amdgpu_debugfs_wait_dump(adev);
>> +	/* no need to dump if device is not in good state during probe period */
>> +	if (!adev->gmc.xgmi.pending_reset)
>> +		amdgpu_debugfs_wait_dump(adev);
>>    
>>    	if (amdgpu_sriov_vf(adev)) {
>>    		/* stop the data exchange thread */ @@ -4293,10 +4337,10 @@ 
>> static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>    	return r;
>>    }
>>    
>> -static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>> -			       struct list_head *device_list_handle,
>> -			       bool *need_full_reset_arg,
>> -			       bool skip_hw_reset)
>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>> +			  struct list_head *device_list_handle,
>> +			  bool *need_full_reset_arg,
>> +			  bool skip_hw_reset)
>>    {
>>    	struct amdgpu_device *tmp_adev = NULL;
>>    	bool need_full_reset = *need_full_reset_arg, vram_lost = false; 
>> @@
>> -4310,6 +4354,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>    		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>    			/* For XGMI run all resets in parallel to speed up the process */
>>    			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
>> +				tmp_adev->gmc.xgmi.pending_reset = false;
>>    				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>    					r = -EALREADY;
>>    			} else
>> @@ -4348,10 +4393,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>    	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>    		if (need_full_reset) {
>>    			/* post card */
>> -			if (amdgpu_device_asic_init(tmp_adev))
>> +			r = amdgpu_device_asic_init(tmp_adev);
>> +			if (r) {
>>    				dev_warn(tmp_adev->dev, "asic atom init failed!");
>> -
>> -			if (!r) {
>> +			} else {
>>    				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
>>    				r = amdgpu_device_ip_resume_phase1(tmp_adev);
>>    				if (r)
>> @@ -4384,6 +4429,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>    				 */
>>    				amdgpu_register_gpu_instance(tmp_adev);
>>    
>> +				if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
>> +					amdgpu_xgmi_add_device(tmp_adev);
>> +
>>    				r = amdgpu_device_ip_late_init(tmp_adev);
>>    				if (r)
>>    					goto out;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 253c59e0a100..aebe4bc561ee 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -44,6 +44,7 @@
>>    #include "amdgpu_amdkfd.h"
>>    
>>    #include "amdgpu_ras.h"
>> +#include "amdgpu_xgmi.h"
>>    
>>    /*
>>     * KMS wrapper.
>> @@ -167,8 +168,13 @@ uint amdgpu_freesync_vid_mode;
>>    int amdgpu_reset_method = -1; /* auto */
>>    int amdgpu_num_kcq = -1;
>>    
>> +static void amdgpu_drv_delayed_reset_work_handler(struct work_struct 
>> +*work);
>> +
>>    struct amdgpu_mgpu_info mgpu_info = {
>>    	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
>> +	.delayed_reset_work = __DELAYED_WORK_INITIALIZER(
>> +			mgpu_info.delayed_reset_work,
>> +			amdgpu_drv_delayed_reset_work_handler, 0),
>>    };
>>    int amdgpu_ras_enable = -1;
>>    uint amdgpu_ras_mask = 0xffffffff; @@ -1297,6 +1303,71 @@ 
>> amdgpu_pci_shutdown(struct pci_dev *pdev)
>>    	adev->mp1_state = PP_MP1_STATE_NONE;
>>    }
>>    
>> +/**
>> + * amdgpu_drv_delayed_reset_work_handler - work handler for reset
>> + *
>> + * @work: work_struct.
>> + */
>> +static void amdgpu_drv_delayed_reset_work_handler(struct work_struct
>> +*work) {
>> +	struct list_head device_list;
>> +	struct amdgpu_device *adev;
>> +	int i, r;
>> +	bool need_full_reset = true;
>> +
>> +	mutex_lock(&mgpu_info.mutex);
>> +	if (mgpu_info.pending_reset == true) {
>> +		mutex_unlock(&mgpu_info.mutex);
>> +		return;
>> +	}
>> +	mgpu_info.pending_reset = true;
>> +	mutex_unlock(&mgpu_info.mutex);
>> +
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>> +		adev = mgpu_info.gpu_ins[i].adev;
>> +		r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
> 
> Why amdgpu_device_pre_asic_reset is needed ?
> 
>> +		if (r) {
>> +			dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
>> +				r, adev_to_drm(adev)->unique);
>> +		}
>> +		if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
>> +			r = -EALREADY;
> 
> amdgpu_do_asic_reset bellow will already schedule xgmi_reset_work for this device, what you could do instead is call amdgpu_do_asic_reset for each member of the hive and because there is a task barrier in amdgpu_device_xgmi_reset_func, it will synchronize all the resets to same point in time already.
> 
>> +	}
>> +	msleep(100);
> 
> What is the 100 ms waiting for?
> 
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>> +		adev = mgpu_info.gpu_ins[i].adev;
>> +		adev->gmc.xgmi.pending_reset = false;
>> +		flush_work(&adev->xgmi_reset_work);
>> +	}
>> +
>> +	msleep(100);
> 
> Same as above
> 
>> +	/* reset function will rebuild the xgmi hive info , clear it now */
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>> +		amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
>> +
>> +	INIT_LIST_HEAD(&device_list);
>> +
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>> +		list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list,
>> +&device_list);
>> +
>> +	/* unregister the GPU first, reset function will add them back */
>> +	list_for_each_entry(adev, &device_list, reset_list)
>> +		amdgpu_unregister_gpu_instance(adev);
>> +
>> +	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
>> +	if (r) {
>> +		DRM_ERROR("reinit gpus failure");
>> +		return;
>> +	}
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>> +		adev = mgpu_info.gpu_ins[i].adev;
>> +		if (!adev->kfd.init_complete)
>> +			amdgpu_amdkfd_device_init(adev);
>> +		amdgpu_ttm_set_buffer_funcs_status(adev, true);
>> +	}
>> +	return;
>> +}
>> +
>>    static int amdgpu_pmops_suspend(struct device *dev)
>>    {
>>    	struct drm_device *drm_dev = dev_get_drvdata(dev); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> index aa0c83776ce0..8c71d84a2fbe 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> @@ -149,6 +149,7 @@ struct amdgpu_xgmi {
>>    	struct list_head head;
>>    	bool supported;
>>    	struct ras_common_if *ras_if;
>> +	bool pending_reset;
>>    };
>>    
>>    struct amdgpu_gmc {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> index 659b385b27b5..b459ef755ea9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> @@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>    	if (!adev->gmc.xgmi.supported)
>>    		return 0;
>>    
>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>> +	if (!adev->gmc.xgmi.pending_reset &&
>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>    		ret = psp_xgmi_initialize(&adev->psp);
>>    		if (ret) {
>>    			dev_err(adev->dev,
>> @@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device
>> *adev)
>>    
>>    	task_barrier_add_task(&hive->tb);
>>    
>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>> +	if (!adev->gmc.xgmi.pending_reset &&
>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>    		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>>    			/* update node list for other device in the hive */
>>    			if (tmp_adev != adev) {
>> @@ -567,7 +569,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>    		}
>>    	}
>>    
>> -	if (!ret)
>> +	if (!ret && !adev->gmc.xgmi.pending_reset)
>>    		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
>>    
>>    exit_unlock:
>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
  2021-03-06 20:41       ` Liu, Shaoyun
@ 2021-03-06 21:11         ` Liu, Shaoyun
  2021-03-08  6:28           ` Andrey Grodzovsky
  0 siblings, 1 reply; 12+ messages in thread
From: Liu, Shaoyun @ 2021-03-06 21:11 UTC (permalink / raw)
  To: Liu, Shaoyun, Grodzovsky, Andrey, amd-gfx

[AMD Official Use Only - Internal Distribution Only]

It seems I cannot directly reuse the HW reset function inside amdgpu_do_asic_reset. The synchronization is based on hive->tb, but as explained, we actually don't know which hive a GPU belongs to, and we will rebuild the correct hive info inside the amdgpu_do_asic_reset function with amdgpu_xgmi_add_device. So I need to remove all GPUs from the hive first. This means the sync won't work, since the hive->tb will be removed as well when all GPUs are removed.

Thanks 
shaopyunliu

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Liu, Shaoyun
Sent: Saturday, March 6, 2021 3:41 PM
To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe

[AMD Official Use Only - Internal Distribution Only]

I call amdgpu_do_asic_reset with the parameter skip_hw_reset = true so the reset won't be executed twice. But I can probably set this parameter to true and remove the code that schedules the reset, since I now build the device_list not based on the hive. Let me try that.
Regarding scheduling the delayed work thread with AMDGPU_RESUME_MS: it is actually not waiting for the SMU to start. As I explained, I need to reset all the GPUs in the system since I don't know which GPUs belong to which hive. So this time allows the system to probe all the GPUs in the system, which means that by the time this delayed thread starts, we can assume all the devices have already been populated in mgpu_info.

Regards
Shaoyun.liu

-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Sent: Saturday, March 6, 2021 1:09 AM
To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe

Thanks for explaining this. One thing I still don't understand is why you schedule the reset work explicitly at the beginning of amdgpu_drv_delayed_reset_work_handler and then also call amdgpu_do_asic_reset, which will do the same thing too. It looks like the physical reset will execute twice for each device.
Another thing, more of an improvement suggestion: currently you schedule delayed_reset_work using AMDGPU_RESUME_MS - so I guess this should give enough time for the SMU to start? Is there maybe a way to instead poll for SMU start completion and then execute this - some SMU status registers maybe? Just to avoid relying on this arbitrary value.

Andrey

On 2021-03-05 8:37 p.m., Liu, Shaoyun wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> Hi,  Andrey
> The existing reset functions (amdgpu_device_gpu_recover or amdgpu_do_asic_reset) assume the driver already has the correct hive info. But in my case, that's not true. The GPUs are in a bad state and the XGMI TA might not function properly, so the driver cannot get the hive and node info when probing the device. That means the driver doesn't even know which hive a device belongs to on a system with a multiple-hive configuration (e.g., 8 GPUs in two hives). The only solution I can think of is to let the driver trigger the reset on all GPUs at the same time after the driver does the minimum initialization on the HW (bring up the SMU IP), no matter whether they belong to the same hive or not, and call amdgpu_xgmi_add_device for each device after re-init.
> The 100 ms delays were added after the BACO reset. I think they can be removed; let me verify it.
> 
> Regards
> Shaoyun.liu
> 
> 
> 
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Friday, March 5, 2021 2:27 PM
> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI 
> hive duirng probe
> 
> 
> 
> On 2021-03-05 12:52 p.m., shaoyunl wrote:
>> In passthrough configuration, hypervisior will trigger the 
>> SBR(Secondary bus reset) to the devices without sync to each other.
>> This could cause device hang since for XGMI configuration, all the 
>> devices within the hive need to be reset at a limit time slot. This 
>> serial of patches try to solve this issue by co-operate with new SMU 
>> which will only do minimum house keeping to response the SBR request 
>> but don't do the real reset job and leave it to driver. Driver need 
>> to do the whole sw init and minimum HW init to bring up the SMU and 
>> trigger the reset(possibly BACO) on all the ASICs at the same time
>>
>> Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
>> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  13 +++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++------
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  71 ++++++++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |   1 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
>>    5 files changed, 165 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index d46d3794699e..5602c6edee97 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
>>    	uint32_t			num_gpu;
>>    	uint32_t			num_dgpu;
>>    	uint32_t			num_apu;
>> +
>> +	/* delayed reset_func for XGMI configuration if necessary */
>> +	struct delayed_work		delayed_reset_work;
>> +	bool				pending_reset;
>>    };
>>    
>>    #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
>> @@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>>    bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
>>    bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
>>    
>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>> +				  struct amdgpu_job *job,
>> +				  bool *need_full_reset_arg);
>> +
>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>> +			  struct list_head *device_list_handle,
>> +			  bool *need_full_reset_arg,
>> +			  bool skip_hw_reset);
>> +
>>    int emu_soc_asic_init(struct amdgpu_device *adev);
>>    
>>    /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 3c35b0c1e710..5b520f70e660 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
>>    		}
>>    	}
>>    
>> +	/* Don't post if we need to reset whole hive on init */
>> +	if (adev->gmc.xgmi.pending_reset)
>> +		return false;
>> +
>>    	if (adev->has_hw_reset) {
>>    		adev->has_hw_reset = false;
>>    		return true;
>> @@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>>    			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
>>    				continue;
>>    
>> +			if (!adev->ip_blocks[i].status.sw)
>> +				continue;
>> +
>>    			/* no need to do the fw loading again if already done*/
>>    			if (adev->ip_blocks[i].status.hw == true)
>>    				break;
>> @@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct 
>> amdgpu_device *adev)
>>    
>>    	if (adev->gmc.xgmi.num_physical_nodes > 1)
>>    		amdgpu_xgmi_add_device(adev);
>> -	amdgpu_amdkfd_device_init(adev);
>> +
>> +	/* Don't init kfd if whole hive need to be reset during init */
>> +	if (!adev->gmc.xgmi.pending_reset)
>> +		amdgpu_amdkfd_device_init(adev);
>>    
>>    	amdgpu_fru_get_product_info(adev);
>>    
>> @@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
>>    			adev->ip_blocks[i].status.hw = false;
>>    			continue;
>>    		}
>> +
>> +		/* skip unnecessary suspend if we do not initialize them yet */
>> +		if (adev->gmc.xgmi.pending_reset &&
>> +		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
>> +			adev->ip_blocks[i].status.hw = false;
>> +			continue;
>> +		}
>>    		/* XXX handle errors */
>>    		r = adev->ip_blocks[i].version->funcs->suspend(adev);
>>    		/* XXX handle errors */
>> @@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>    	 *  E.g., driver was not cleanly unloaded previously, etc.
>>    	 */
>>    	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
>> -		r = amdgpu_asic_reset(adev);
>> -		if (r) {
>> -			dev_err(adev->dev, "asic reset on init failed\n");
>> -			goto failed;
>> +		if (adev->gmc.xgmi.num_physical_nodes) {
>> +			dev_info(adev->dev, "Pending hive reset.\n");
>> +			adev->gmc.xgmi.pending_reset = true;
>> +			/* Only need to init necessary block for SMU to handle the reset */
>> +			for (i = 0; i < adev->num_ip_blocks; i++) {
>> +				if (!adev->ip_blocks[i].status.valid)
>> +					continue;
>> +				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
>> +					DRM_DEBUG("IP %s disabed for hw_init.\n",
>> +						adev->ip_blocks[i].version->funcs->name);
>> +					adev->ip_blocks[i].status.hw = true;
>> +				}
>> +			}
>> +		} else {
>> +			r = amdgpu_asic_reset(adev);
>> +			if (r) {
>> +				dev_err(adev->dev, "asic reset on init failed\n");
>> +				goto failed;
>> +			}
>>    		}
>>    	}
>>    
>> @@ -3541,19 +3579,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>    	/* enable clockgating, etc. after ib tests, etc. since some blocks require
>>    	 * explicit gating rather than handling it automatically.
>>    	 */
>> -	r = amdgpu_device_ip_late_init(adev);
>> -	if (r) {
>> -		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>> -		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>> -		goto failed;
>> +	if (!adev->gmc.xgmi.pending_reset) {
>> +		r = amdgpu_device_ip_late_init(adev);
>> +		if (r) {
>> +			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>> +			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>> +			goto failed;
>> +		}
>> +		/* must succeed. */
>> +		amdgpu_ras_resume(adev);
>> +		queue_delayed_work(system_wq, &adev->delayed_init_work,
>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>    	}
>>    
>> -	/* must succeed. */
>> -	amdgpu_ras_resume(adev);
>> -
>> -	queue_delayed_work(system_wq, &adev->delayed_init_work,
>> -			   msecs_to_jiffies(AMDGPU_RESUME_MS));
>> -
>>    	if (amdgpu_sriov_vf(adev))
>>    		flush_delayed_work(&adev->delayed_init_work);
>>    
>> @@ -3570,6 +3608,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>    	if (amdgpu_device_cache_pci_state(adev->pdev))
>>    		pci_restore_state(pdev);
>>    
>> +	if (adev->gmc.xgmi.pending_reset)
>> +		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>> +
>>    	return 0;
>>    
>>    failed:
>> @@ -4240,14 +4282,16 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
>>    }
>>    
>>    
>> -static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>> -					struct amdgpu_job *job,
>> -					bool *need_full_reset_arg)
>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>> +				  struct amdgpu_job *job,
>> +				  bool *need_full_reset_arg)
>>    {
>>    	int i, r = 0;
>>    	bool need_full_reset  = *need_full_reset_arg;
>>    
>> -	amdgpu_debugfs_wait_dump(adev);
>> +	/* no need to dump if device is not in good state during probe period */
>> +	if (!adev->gmc.xgmi.pending_reset)
>> +		amdgpu_debugfs_wait_dump(adev);
>>    
>>    	if (amdgpu_sriov_vf(adev)) {
>>    		/* stop the data exchange thread */ @@ -4293,10 +4337,10 @@ 
>> static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>    	return r;
>>    }
>>    
>> -static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>> -			       struct list_head *device_list_handle,
>> -			       bool *need_full_reset_arg,
>> -			       bool skip_hw_reset)
>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>> +			  struct list_head *device_list_handle,
>> +			  bool *need_full_reset_arg,
>> +			  bool skip_hw_reset)
>>    {
>>    	struct amdgpu_device *tmp_adev = NULL;
>>    	bool need_full_reset = *need_full_reset_arg, vram_lost = false; 
>> @@
>> -4310,6 +4354,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>    		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>    			/* For XGMI run all resets in parallel to speed up the process */
>>    			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
>> +				tmp_adev->gmc.xgmi.pending_reset = false;
>>    				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>    					r = -EALREADY;
>>    			} else
>> @@ -4348,10 +4393,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>    	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>    		if (need_full_reset) {
>>    			/* post card */
>> -			if (amdgpu_device_asic_init(tmp_adev))
>> +			r = amdgpu_device_asic_init(tmp_adev);
>> +			if (r) {
>>    				dev_warn(tmp_adev->dev, "asic atom init failed!");
>> -
>> -			if (!r) {
>> +			} else {
>>    				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
>>    				r = amdgpu_device_ip_resume_phase1(tmp_adev);
>>    				if (r)
>> @@ -4384,6 +4429,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>    				 */
>>    				amdgpu_register_gpu_instance(tmp_adev);
>>    
>> +				if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
>> +					amdgpu_xgmi_add_device(tmp_adev);
>> +
>>    				r = amdgpu_device_ip_late_init(tmp_adev);
>>    				if (r)
>>    					goto out;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 253c59e0a100..aebe4bc561ee 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -44,6 +44,7 @@
>>    #include "amdgpu_amdkfd.h"
>>    
>>    #include "amdgpu_ras.h"
>> +#include "amdgpu_xgmi.h"
>>    
>>    /*
>>     * KMS wrapper.
>> @@ -167,8 +168,13 @@ uint amdgpu_freesync_vid_mode;
>>    int amdgpu_reset_method = -1; /* auto */
>>    int amdgpu_num_kcq = -1;
>>    
>> +static void amdgpu_drv_delayed_reset_work_handler(struct work_struct 
>> +*work);
>> +
>>    struct amdgpu_mgpu_info mgpu_info = {
>>    	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
>> +	.delayed_reset_work = __DELAYED_WORK_INITIALIZER(
>> +			mgpu_info.delayed_reset_work,
>> +			amdgpu_drv_delayed_reset_work_handler, 0),
>>    };
>>    int amdgpu_ras_enable = -1;
>>    uint amdgpu_ras_mask = 0xffffffff; @@ -1297,6 +1303,71 @@ 
>> amdgpu_pci_shutdown(struct pci_dev *pdev)
>>    	adev->mp1_state = PP_MP1_STATE_NONE;
>>    }
>>    
>> +/**
>> + * amdgpu_drv_delayed_reset_work_handler - work handler for reset
>> + *
>> + * @work: work_struct.
>> + */
>> +static void amdgpu_drv_delayed_reset_work_handler(struct work_struct
>> +*work) {
>> +	struct list_head device_list;
>> +	struct amdgpu_device *adev;
>> +	int i, r;
>> +	bool need_full_reset = true;
>> +
>> +	mutex_lock(&mgpu_info.mutex);
>> +	if (mgpu_info.pending_reset == true) {
>> +		mutex_unlock(&mgpu_info.mutex);
>> +		return;
>> +	}
>> +	mgpu_info.pending_reset = true;
>> +	mutex_unlock(&mgpu_info.mutex);
>> +
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>> +		adev = mgpu_info.gpu_ins[i].adev;
>> +		r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
> 
> Why amdgpu_device_pre_asic_reset is needed ?
> 
>> +		if (r) {
>> +			dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
>> +				r, adev_to_drm(adev)->unique);
>> +		}
>> +		if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
>> +			r = -EALREADY;
> 
> amdgpu_do_asic_reset bellow will already schedule xgmi_reset_work for this device, what you could do instead is call amdgpu_do_asic_reset for each member of the hive and because there is a task barrier in amdgpu_device_xgmi_reset_func, it will synchronize all the resets to same point in time already.
> 
>> +	}
>> +	msleep(100);
> 
> What is the 100ms waiting for?
> 
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>> +		adev = mgpu_info.gpu_ins[i].adev;
>> +		adev->gmc.xgmi.pending_reset = false;
>> +		flush_work(&adev->xgmi_reset_work);
>> +	}
>> +
>> +	msleep(100);
> 
> Same as above
> 
>> +	/* reset function will rebuild the xgmi hive info , clear it now */
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>> +		amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
>> +
>> +	INIT_LIST_HEAD(&device_list);
>> +
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>> +		list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list,
>> +&device_list);
>> +
>> +	/* unregister the GPU first, reset function will add them back */
>> +	list_for_each_entry(adev, &device_list, reset_list)
>> +		amdgpu_unregister_gpu_instance(adev);
>> +
>> +	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
>> +	if (r) {
>> +		DRM_ERROR("reinit gpus failure");
>> +		return;
>> +	}
>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>> +		adev = mgpu_info.gpu_ins[i].adev;
>> +		if (!adev->kfd.init_complete)
>> +			amdgpu_amdkfd_device_init(adev);
>> +		amdgpu_ttm_set_buffer_funcs_status(adev, true);
>> +	}
>> +	return;
>> +}
>> +
>>    static int amdgpu_pmops_suspend(struct device *dev)
>>    {
>>    	struct drm_device *drm_dev = dev_get_drvdata(dev); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> index aa0c83776ce0..8c71d84a2fbe 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> @@ -149,6 +149,7 @@ struct amdgpu_xgmi {
>>    	struct list_head head;
>>    	bool supported;
>>    	struct ras_common_if *ras_if;
>> +	bool pending_reset;
>>    };
>>    
>>    struct amdgpu_gmc {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> index 659b385b27b5..b459ef755ea9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> @@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>    	if (!adev->gmc.xgmi.supported)
>>    		return 0;
>>    
>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>> +	if (!adev->gmc.xgmi.pending_reset &&
>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>    		ret = psp_xgmi_initialize(&adev->psp);
>>    		if (ret) {
>>    			dev_err(adev->dev,
>> @@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device
>> *adev)
>>    
>>    	task_barrier_add_task(&hive->tb);
>>    
>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>> +	if (!adev->gmc.xgmi.pending_reset &&
>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>    		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>>    			/* update node list for other device in the hive */
>>    			if (tmp_adev != adev) {
>> @@ -567,7 +569,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>    		}
>>    	}
>>    
>> -	if (!ret)
>> +	if (!ret && !adev->gmc.xgmi.pending_reset)
>>    		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
>>    
>>    exit_unlock:
>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CShaoyun.Liu%40amd.com%7C73608bda6abd48a3047608d8e0e02e4f%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637506600750277165%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=2qASlLYQ08twP0Ud5EcisCsVlJ9WG0QJv5idbuhDe4o%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
  2021-03-06 21:11         ` Liu, Shaoyun
@ 2021-03-08  6:28           ` Andrey Grodzovsky
  2021-03-08 15:27             ` Liu, Shaoyun
  0 siblings, 1 reply; 12+ messages in thread
From: Andrey Grodzovsky @ 2021-03-08  6:28 UTC (permalink / raw)
  To: Liu, Shaoyun, amd-gfx

But the hive->tb object is used regardless, inside 
amdgpu_device_xgmi_reset_func currently. It means that even when you 
explicitly schedule xgmi_reset_work as you do now, the code will try to 
sync using a not-well-initialized tb object. Maybe you can define a 
global static tb object, fill it in the loop where you send 
xgmi_reset_work for all devices in the system, and use it from within
amdgpu_device_xgmi_reset_func instead of the regular per-hive tb object 
(obviously under your special use case).

Andrey

On 2021-03-06 4:11 p.m., Liu, Shaoyun wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> It  seems I can  not directly reuse the reset HW  function inside the  amdgpu_do_asic_reset,  the  synchronization is based on hive->tb,   but as explained , we actually don't know the GPU belongs to which hive and will rebuild the correct hive info inside the amdgpu_do_asic_reset function with amdgpu_xgmi_add_device .  so I need to remove  all GPUs from the hive first . This will lead to the sync don't work since the hive->tb will be removed as well when all GPUs are removed .
> 
> Thanks
> shaopyunliu
> 
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Liu, Shaoyun
> Sent: Saturday, March 6, 2021 3:41 PM
> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
> 
> [AMD Official Use Only - Internal Distribution Only]
> 
> I call the amdgpu_do_asic_reset with the parameter skip_hw_reset = true  so the reset won't be execute twice .  but probably I can  set this parameter to true and remove the code schedule for reset since now I already build the device_list not based on hive. Let me try that .
> For the  schedule delayed work thread with AMDGPU_RESUME_MS, It's actually not wait for SMU  to start. As I explained , I need to reset the all the GPUs in the system since I don't know which gpus belongs to which hive.  So this time is allow system to probe all the GPUs  in the system which means when this delayed thread starts ,  we can assume all the devices already been  populated in mgpu_info.
> 
> Regards
> Shaoyun.liu
> 
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Saturday, March 6, 2021 1:09 AM
> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
> 
> Thanks for explaining this, one thing I still don't understand is why you schedule the reset work explicilty in the begining of amdgpu_drv_delayed_reset_work_handler and then also call amdgpu_do_asic_reset which will do the same thing too. It looks like the physical reset will execute twice for each device.
> Another thing is, more of an improvement suggestion - currently you schedule delayed_reset_work using AMDGPU_RESUME_MS - so I guess this should give enough time for SMU to start? Is there maybe a way to instead poll for SMU start completion and then execute this - some SMU status registers maybe? Just to avoid relying on this arbitrary value.
> 
> Andrey
> 
> On 2021-03-05 8:37 p.m., Liu, Shaoyun wrote:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> Hi,  Andrey
>> The existing reset function (amdgpu_device_gpu_recover or amd do_asic _reset) assumed driver already have  the correct hive info . But in my case, it's  not true . The gpus are in a bad state and the XGMI TA  might not functional properly , so driver can  not  get the hive and node info when probe the device .  It means driver even don't know  the device belongs to which hive on a system with multiple hive configuration (ex, 8 gpus in  two hive). The only solution I can think of is let driver trigger the reset on all gpus at the same time after driver do the minimum initialization on the HW ( bring up the  SMU IP)  no matter they belongs to the same hive or not and call amdgpu_xgmi_add_device for each device after re-init .
>> The 100 ms delay added after the baco reset . I think they can be removed . let me verify it.
>>
>> Regards
>> Shaoyun.liu
>>
>>
>>
>> -----Original Message-----
>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> Sent: Friday, March 5, 2021 2:27 PM
>> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI
>> hive duirng probe
>>
>>
>>
>> On 2021-03-05 12:52 p.m., shaoyunl wrote:
>>> In passthrough configuration, hypervisior will trigger the
>>> SBR(Secondary bus reset) to the devices without sync to each other.
>>> This could cause device hang since for XGMI configuration, all the
>>> devices within the hive need to be reset at a limit time slot. This
>>> serial of patches try to solve this issue by co-operate with new SMU
>>> which will only do minimum house keeping to response the SBR request
>>> but don't do the real reset job and leave it to driver. Driver need
>>> to do the whole sw init and minimum HW init to bring up the SMU and
>>> trigger the reset(possibly BACO) on all the ASICs at the same time
>>>
>>> Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
>>> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  13 +++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++------
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  71 ++++++++++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |   1 +
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
>>>     5 files changed, 165 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index d46d3794699e..5602c6edee97 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
>>>     	uint32_t			num_gpu;
>>>     	uint32_t			num_dgpu;
>>>     	uint32_t			num_apu;
>>> +
>>> +	/* delayed reset_func for XGMI configuration if necessary */
>>> +	struct delayed_work		delayed_reset_work;
>>> +	bool				pending_reset;
>>>     };
>>>     
>>>     #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
>>> @@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>>>     bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
>>>     bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
>>>     
>>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>> +				  struct amdgpu_job *job,
>>> +				  bool *need_full_reset_arg);
>>> +
>>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>> +			  struct list_head *device_list_handle,
>>> +			  bool *need_full_reset_arg,
>>> +			  bool skip_hw_reset);
>>> +
>>>     int emu_soc_asic_init(struct amdgpu_device *adev);
>>>     
>>>     /*
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 3c35b0c1e710..5b520f70e660 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
>>>     		}
>>>     	}
>>>     
>>> +	/* Don't post if we need to reset whole hive on init */
>>> +	if (adev->gmc.xgmi.pending_reset)
>>> +		return false;
>>> +
>>>     	if (adev->has_hw_reset) {
>>>     		adev->has_hw_reset = false;
>>>     		return true;
>>> @@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>>>     			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
>>>     				continue;
>>>     
>>> +			if (!adev->ip_blocks[i].status.sw)
>>> +				continue;
>>> +
>>>     			/* no need to do the fw loading again if already done*/
>>>     			if (adev->ip_blocks[i].status.hw == true)
>>>     				break;
>>> @@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct
>>> amdgpu_device *adev)
>>>     
>>>     	if (adev->gmc.xgmi.num_physical_nodes > 1)
>>>     		amdgpu_xgmi_add_device(adev);
>>> -	amdgpu_amdkfd_device_init(adev);
>>> +
>>> +	/* Don't init kfd if whole hive need to be reset during init */
>>> +	if (!adev->gmc.xgmi.pending_reset)
>>> +		amdgpu_amdkfd_device_init(adev);
>>>     
>>>     	amdgpu_fru_get_product_info(adev);
>>>     
>>> @@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
>>>     			adev->ip_blocks[i].status.hw = false;
>>>     			continue;
>>>     		}
>>> +
>>> +		/* skip unnecessary suspend if we do not initialize them yet */
>>> +		if (adev->gmc.xgmi.pending_reset &&
>>> +		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
>>> +			adev->ip_blocks[i].status.hw = false;
>>> +			continue;
>>> +		}
>>>     		/* XXX handle errors */
>>>     		r = adev->ip_blocks[i].version->funcs->suspend(adev);
>>>     		/* XXX handle errors */
>>> @@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>     	 *  E.g., driver was not cleanly unloaded previously, etc.
>>>     	 */
>>>     	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
>>> -		r = amdgpu_asic_reset(adev);
>>> -		if (r) {
>>> -			dev_err(adev->dev, "asic reset on init failed\n");
>>> -			goto failed;
>>> +		if (adev->gmc.xgmi.num_physical_nodes) {
>>> +			dev_info(adev->dev, "Pending hive reset.\n");
>>> +			adev->gmc.xgmi.pending_reset = true;
>>> +			/* Only need to init necessary block for SMU to handle the reset */
>>> +			for (i = 0; i < adev->num_ip_blocks; i++) {
>>> +				if (!adev->ip_blocks[i].status.valid)
>>> +					continue;
>>> +				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
>>> +					DRM_DEBUG("IP %s disabed for hw_init.\n",
>>> +						adev->ip_blocks[i].version->funcs->name);
>>> +					adev->ip_blocks[i].status.hw = true;
>>> +				}
>>> +			}
>>> +		} else {
>>> +			r = amdgpu_asic_reset(adev);
>>> +			if (r) {
>>> +				dev_err(adev->dev, "asic reset on init failed\n");
>>> +				goto failed;
>>> +			}
>>>     		}
>>>     	}
>>>     
>>> @@ -3541,19 +3579,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>     	/* enable clockgating, etc. after ib tests, etc. since some blocks require
>>>     	 * explicit gating rather than handling it automatically.
>>>     	 */
>>> -	r = amdgpu_device_ip_late_init(adev);
>>> -	if (r) {
>>> -		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>>> -		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>>> -		goto failed;
>>> +	if (!adev->gmc.xgmi.pending_reset) {
>>> +		r = amdgpu_device_ip_late_init(adev);
>>> +		if (r) {
>>> +			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>>> +			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>>> +			goto failed;
>>> +		}
>>> +		/* must succeed. */
>>> +		amdgpu_ras_resume(adev);
>>> +		queue_delayed_work(system_wq, &adev->delayed_init_work,
>>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>>     	}
>>>     
>>> -	/* must succeed. */
>>> -	amdgpu_ras_resume(adev);
>>> -
>>> -	queue_delayed_work(system_wq, &adev->delayed_init_work,
>>> -			   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>> -
>>>     	if (amdgpu_sriov_vf(adev))
>>>     		flush_delayed_work(&adev->delayed_init_work);
>>>     
>>> @@ -3570,6 +3608,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>     	if (amdgpu_device_cache_pci_state(adev->pdev))
>>>     		pci_restore_state(pdev);
>>>     
>>> +	if (adev->gmc.xgmi.pending_reset)
>>> +		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
>>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>> +
>>>     	return 0;
>>>     
>>>     failed:
>>> @@ -4240,14 +4282,16 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
>>>     }
>>>     
>>>     
>>> -static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>> -					struct amdgpu_job *job,
>>> -					bool *need_full_reset_arg)
>>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>> +				  struct amdgpu_job *job,
>>> +				  bool *need_full_reset_arg)
>>>     {
>>>     	int i, r = 0;
>>>     	bool need_full_reset  = *need_full_reset_arg;
>>>     
>>> -	amdgpu_debugfs_wait_dump(adev);
>>> +	/* no need to dump if device is not in good state during probe period */
>>> +	if (!adev->gmc.xgmi.pending_reset)
>>> +		amdgpu_debugfs_wait_dump(adev);
>>>     
>>>     	if (amdgpu_sriov_vf(adev)) {
>>>     		/* stop the data exchange thread */ @@ -4293,10 +4337,10 @@
>>> static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>     	return r;
>>>     }
>>>     
>>> -static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>> -			       struct list_head *device_list_handle,
>>> -			       bool *need_full_reset_arg,
>>> -			       bool skip_hw_reset)
>>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>> +			  struct list_head *device_list_handle,
>>> +			  bool *need_full_reset_arg,
>>> +			  bool skip_hw_reset)
>>>     {
>>>     	struct amdgpu_device *tmp_adev = NULL;
>>>     	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
>>> @@
>>> -4310,6 +4354,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>     		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>>     			/* For XGMI run all resets in parallel to speed up the process */
>>>     			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
>>> +				tmp_adev->gmc.xgmi.pending_reset = false;
>>>     				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>>     					r = -EALREADY;
>>>     			} else
>>> @@ -4348,10 +4393,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>     	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>>     		if (need_full_reset) {
>>>     			/* post card */
>>> -			if (amdgpu_device_asic_init(tmp_adev))
>>> +			r = amdgpu_device_asic_init(tmp_adev);
>>> +			if (r) {
>>>     				dev_warn(tmp_adev->dev, "asic atom init failed!");
>>> -
>>> -			if (!r) {
>>> +			} else {
>>>     				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
>>>     				r = amdgpu_device_ip_resume_phase1(tmp_adev);
>>>     				if (r)
>>> @@ -4384,6 +4429,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>     				 */
>>>     				amdgpu_register_gpu_instance(tmp_adev);
>>>     
>>> +				if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
>>> +					amdgpu_xgmi_add_device(tmp_adev);
>>> +
>>>     				r = amdgpu_device_ip_late_init(tmp_adev);
>>>     				if (r)
>>>     					goto out;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 253c59e0a100..aebe4bc561ee 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -44,6 +44,7 @@
>>>     #include "amdgpu_amdkfd.h"
>>>     
>>>     #include "amdgpu_ras.h"
>>> +#include "amdgpu_xgmi.h"
>>>     
>>>     /*
>>>      * KMS wrapper.
>>> @@ -167,8 +168,13 @@ uint amdgpu_freesync_vid_mode;
>>>     int amdgpu_reset_method = -1; /* auto */
>>>     int amdgpu_num_kcq = -1;
>>>     
>>> +static void amdgpu_drv_delayed_reset_work_handler(struct work_struct
>>> +*work);
>>> +
>>>     struct amdgpu_mgpu_info mgpu_info = {
>>>     	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
>>> +	.delayed_reset_work = __DELAYED_WORK_INITIALIZER(
>>> +			mgpu_info.delayed_reset_work,
>>> +			amdgpu_drv_delayed_reset_work_handler, 0),
>>>     };
>>>     int amdgpu_ras_enable = -1;
>>>     uint amdgpu_ras_mask = 0xffffffff; @@ -1297,6 +1303,71 @@
>>> amdgpu_pci_shutdown(struct pci_dev *pdev)
>>>     	adev->mp1_state = PP_MP1_STATE_NONE;
>>>     }
>>>     
>>> +/**
>>> + * amdgpu_drv_delayed_reset_work_handler - work handler for reset
>>> + *
>>> + * @work: work_struct.
>>> + */
>>> +static void amdgpu_drv_delayed_reset_work_handler(struct work_struct
>>> +*work) {
>>> +	struct list_head device_list;
>>> +	struct amdgpu_device *adev;
>>> +	int i, r;
>>> +	bool need_full_reset = true;
>>> +
>>> +	mutex_lock(&mgpu_info.mutex);
>>> +	if (mgpu_info.pending_reset == true) {
>>> +		mutex_unlock(&mgpu_info.mutex);
>>> +		return;
>>> +	}
>>> +	mgpu_info.pending_reset = true;
>>> +	mutex_unlock(&mgpu_info.mutex);
>>> +
>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>> +		r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
>>
>> Why amdgpu_device_pre_asic_reset is needed ?
>>
>>> +		if (r) {
>>> +			dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
>>> +				r, adev_to_drm(adev)->unique);
>>> +		}
>>> +		if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
>>> +			r = -EALREADY;
>>
>> amdgpu_do_asic_reset bellow will already schedule xgmi_reset_work for this device, what you could do instead is call amdgpu_do_asic_reset for each member of the hive and because there is a task barrier in amdgpu_device_xgmi_reset_func, it will synchronize all the resets to same point in time already.
>>
>>> +	}
>>> +	msleep(100);
>>
>> What is the 100ms waiting for?
>>
>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>> +		adev->gmc.xgmi.pending_reset = false;
>>> +		flush_work(&adev->xgmi_reset_work);
>>> +	}
>>> +
>>> +	msleep(100);
>>
>> Same as above
>>
>>> +	/* reset function will rebuild the xgmi hive info , clear it now */
>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>>> +		amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
>>> +
>>> +	INIT_LIST_HEAD(&device_list);
>>> +
>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>>> +		list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list,
>>> +&device_list);
>>> +
>>> +	/* unregister the GPU first, reset function will add them back */
>>> +	list_for_each_entry(adev, &device_list, reset_list)
>>> +		amdgpu_unregister_gpu_instance(adev);
>>> +
>>> +	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
>>> +	if (r) {
>>> +		DRM_ERROR("reinit gpus failure");
>>> +		return;
>>> +	}
>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>> +		if (!adev->kfd.init_complete)
>>> +			amdgpu_amdkfd_device_init(adev);
>>> +		amdgpu_ttm_set_buffer_funcs_status(adev, true);
>>> +	}
>>> +	return;
>>> +}
>>> +
>>>     static int amdgpu_pmops_suspend(struct device *dev)
>>>     {
>>>     	struct drm_device *drm_dev = dev_get_drvdata(dev); diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> index aa0c83776ce0..8c71d84a2fbe 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> @@ -149,6 +149,7 @@ struct amdgpu_xgmi {
>>>     	struct list_head head;
>>>     	bool supported;
>>>     	struct ras_common_if *ras_if;
>>> +	bool pending_reset;
>>>     };
>>>     
>>>     struct amdgpu_gmc {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>> index 659b385b27b5..b459ef755ea9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>> @@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>>     	if (!adev->gmc.xgmi.supported)
>>>     		return 0;
>>>     
>>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>> +	if (!adev->gmc.xgmi.pending_reset &&
>>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>     		ret = psp_xgmi_initialize(&adev->psp);
>>>     		if (ret) {
>>>     			dev_err(adev->dev,
>>> @@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device
>>> *adev)
>>>     
>>>     	task_barrier_add_task(&hive->tb);
>>>     
>>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>> +	if (!adev->gmc.xgmi.pending_reset &&
>>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>     		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>>>     			/* update node list for other device in the hive */
>>>     			if (tmp_adev != adev) {
>>> @@ -567,7 +569,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>>     		}
>>>     	}
>>>     
>>> -	if (!ret)
>>> +	if (!ret && !adev->gmc.xgmi.pending_reset)
>>>     		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
>>>     
>>>     exit_unlock:
>>>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CShaoyun.Liu%40amd.com%7C73608bda6abd48a3047608d8e0e02e4f%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637506600750277165%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=2qASlLYQ08twP0Ud5EcisCsVlJ9WG0QJv5idbuhDe4o%3D&amp;reserved=0
> 
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
  2021-03-08  6:28           ` Andrey Grodzovsky
@ 2021-03-08 15:27             ` Liu, Shaoyun
  2021-03-08 15:53               ` Andrey Grodzovsky
  0 siblings, 1 reply; 12+ messages in thread
From: Liu, Shaoyun @ 2021-03-08 15:27 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx

[AMD Official Use Only - Internal Distribution Only]

Check the function amdgpu_xgmi_add_device: when the PSP XGMI TA is not available, the driver will assign a faked hive ID 0x10 for all GPUs, meaning all GPUs will belong to the same hive. So I can still use hive->tb to sync the reset on all GPUs. The reason I cannot use the default amdgpu_do_asic_reset function is because we want to build the correct hive and node topology for all GPUs after reset, so we need to call amdgpu_xgmi_add_device inside the amdgpu_do_asic_reset function. To make this work, we need to destroy the hive by removing the devices (call amdgpu_xgmi_remove_device) first, so when calling amdgpu_do_asic_reset, the faked hive (0x10) has already been destroyed, and the hive->tb will not work in this case. That's the reason I need to call the reset explicitly with the faked hive and then destroy the hive, then build the device_list for amdgpu_do_asic_reset without the hive.
Hope I explain it clearly . 

Thanks 
Shaoyun.liu

-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> 
Sent: Monday, March 8, 2021 1:28 AM
To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe

But the hive->tb object is used regardless, inside amdgpu_device_xgmi_reset_func currently; it means that even when you explicitly schedule xgmi_reset_work as you do now, the code will try to sync using a not-well-initialized tb object. Maybe you can define a global static tb object, fill it in the loop where you send xgmi_reset_work for all devices in the system, and use it from within amdgpu_device_xgmi_reset_func instead of the regular per-hive tb object (obviously only under your special use case).

Andrey

On 2021-03-06 4:11 p.m., Liu, Shaoyun wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> It seems I cannot directly reuse the reset HW function inside amdgpu_do_asic_reset; the synchronization is based on hive->tb, but as explained, we actually don't know which hive the GPU belongs to and will rebuild the correct hive info inside the amdgpu_do_asic_reset function with amdgpu_xgmi_add_device. So I need to remove all GPUs from the hive first. This will lead to the sync not working, since the hive->tb will be removed as well when all GPUs are removed.
> 
> Thanks
> shaopyunliu
> 
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of 
> Liu, Shaoyun
> Sent: Saturday, March 6, 2021 3:41 PM
> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; 
> amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI 
> hive during probe
> 
> [AMD Official Use Only - Internal Distribution Only]
> 
> I call amdgpu_do_asic_reset with the parameter skip_hw_reset = true so the reset won't be executed twice. But probably I can set this parameter to true and remove the code that schedules the reset, since now I already build the device_list not based on the hive. Let me try that.
> For the scheduled delayed work thread with AMDGPU_RESUME_MS, it's actually not waiting for the SMU to start. As I explained, I need to reset all the GPUs in the system since I don't know which GPUs belong to which hive. So this time allows the system to probe all the GPUs in the system, which means when this delayed thread starts, we can assume all the devices have already been populated in mgpu_info.
> 
> Regards
> Shaoyun.liu
> 
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Saturday, March 6, 2021 1:09 AM
> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI 
> hive during probe
> 
> Thanks for explaining this. One thing I still don't understand is why you schedule the reset work explicitly in the beginning of amdgpu_drv_delayed_reset_work_handler and then also call amdgpu_do_asic_reset, which will do the same thing too. It looks like the physical reset will execute twice for each device.
> Another thing is, more like improvement suggestion  - currently you schedule delayed_reset_work using AMDGPU_RESUME_MS - so i guesss this should give enough time for SMU to start ? Is there maybe a way to instead poll for SMU start completion and then execute this - some SMU status registers maybe ? Just to avoid relying on this arbitrary value.
> 
> Andrey
> 
> On 2021-03-05 8:37 p.m., Liu, Shaoyun wrote:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> Hi,  Andrey
>> The existing reset function (amdgpu_device_gpu_recover or amdgpu_do_asic_reset) assumed the driver already has the correct hive info. But in my case, it's not true. The GPUs are in a bad state and the XGMI TA might not function properly, so the driver cannot get the hive and node info when probing the device. It means the driver doesn't even know which hive the device belongs to on a system with a multiple-hive configuration (e.g., 8 GPUs in two hives). The only solution I can think of is to let the driver trigger the reset on all GPUs at the same time after the driver does the minimum initialization on the HW (bring up the SMU IP), no matter whether they belong to the same hive or not, and call amdgpu_xgmi_add_device for each device after re-init.
>> The 100 ms delay added after the baco reset . I think they can be removed . let me verify it.
>>
>> Regards
>> Shaoyun.liu
>>
>>
>>
>> -----Original Message-----
>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> Sent: Friday, March 5, 2021 2:27 PM
>> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI 
>> hive during probe
>>
>>
>>
>> On 2021-03-05 12:52 p.m., shaoyunl wrote:
>>> In passthrough configuration, hypervisior will trigger the 
>>> SBR(Secondary bus reset) to the devices without sync to each other.
>>> This could cause device hang since for XGMI configuration, all the 
>>> devices within the hive need to be reset at a limit time slot. This 
>>> serial of patches try to solve this issue by co-operate with new SMU 
>>> which will only do minimum house keeping to response the SBR request 
>>> but don't do the real reset job and leave it to driver. Driver need 
>>> to do the whole sw init and minimum HW init to bring up the SMU and 
>>> trigger the reset(possibly BACO) on all the ASICs at the same time
>>>
>>> Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
>>> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  13 +++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++------
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  71 ++++++++++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |   1 +
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
>>>     5 files changed, 165 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index d46d3794699e..5602c6edee97 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
>>>     	uint32_t			num_gpu;
>>>     	uint32_t			num_dgpu;
>>>     	uint32_t			num_apu;
>>> +
>>> +	/* delayed reset_func for XGMI configuration if necessary */
>>> +	struct delayed_work		delayed_reset_work;
>>> +	bool				pending_reset;
>>>     };
>>>     
>>>     #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
>>> @@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>>>     bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
>>>     bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
>>>     
>>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>> +				  struct amdgpu_job *job,
>>> +				  bool *need_full_reset_arg);
>>> +
>>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>> +			  struct list_head *device_list_handle,
>>> +			  bool *need_full_reset_arg,
>>> +			  bool skip_hw_reset);
>>> +
>>>     int emu_soc_asic_init(struct amdgpu_device *adev);
>>>     
>>>     /*
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 3c35b0c1e710..5b520f70e660 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
>>>     		}
>>>     	}
>>>     
>>> +	/* Don't post if we need to reset whole hive on init */
>>> +	if (adev->gmc.xgmi.pending_reset)
>>> +		return false;
>>> +
>>>     	if (adev->has_hw_reset) {
>>>     		adev->has_hw_reset = false;
>>>     		return true;
>>> @@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>>>     			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
>>>     				continue;
>>>     
>>> +			if (!adev->ip_blocks[i].status.sw)
>>> +				continue;
>>> +
>>>     			/* no need to do the fw loading again if already done*/
>>>     			if (adev->ip_blocks[i].status.hw == true)
>>>     				break;
>>> @@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct 
>>> amdgpu_device *adev)
>>>     
>>>     	if (adev->gmc.xgmi.num_physical_nodes > 1)
>>>     		amdgpu_xgmi_add_device(adev);
>>> -	amdgpu_amdkfd_device_init(adev);
>>> +
>>> +	/* Don't init kfd if whole hive need to be reset during init */
>>> +	if (!adev->gmc.xgmi.pending_reset)
>>> +		amdgpu_amdkfd_device_init(adev);
>>>     
>>>     	amdgpu_fru_get_product_info(adev);
>>>     
>>> @@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
>>>     			adev->ip_blocks[i].status.hw = false;
>>>     			continue;
>>>     		}
>>> +
>>> +		/* skip unnecessary suspend if we do not initialize them yet */
>>> +		if (adev->gmc.xgmi.pending_reset &&
>>> +		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
>>> +			adev->ip_blocks[i].status.hw = false;
>>> +			continue;
>>> +		}
>>>     		/* XXX handle errors */
>>>     		r = adev->ip_blocks[i].version->funcs->suspend(adev);
>>>     		/* XXX handle errors */
>>> @@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>     	 *  E.g., driver was not cleanly unloaded previously, etc.
>>>     	 */
>>>     	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
>>> -		r = amdgpu_asic_reset(adev);
>>> -		if (r) {
>>> -			dev_err(adev->dev, "asic reset on init failed\n");
>>> -			goto failed;
>>> +		if (adev->gmc.xgmi.num_physical_nodes) {
>>> +			dev_info(adev->dev, "Pending hive reset.\n");
>>> +			adev->gmc.xgmi.pending_reset = true;
>>> +			/* Only need to init necessary block for SMU to handle the reset */
>>> +			for (i = 0; i < adev->num_ip_blocks; i++) {
>>> +				if (!adev->ip_blocks[i].status.valid)
>>> +					continue;
>>> +				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
>>> +					DRM_DEBUG("IP %s disabed for hw_init.\n",
>>> +						adev->ip_blocks[i].version->funcs->name);
>>> +					adev->ip_blocks[i].status.hw = true;
>>> +				}
>>> +			}
>>> +		} else {
>>> +			r = amdgpu_asic_reset(adev);
>>> +			if (r) {
>>> +				dev_err(adev->dev, "asic reset on init failed\n");
>>> +				goto failed;
>>> +			}
>>>     		}
>>>     	}
>>>     
>>> @@ -3541,19 +3579,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>     	/* enable clockgating, etc. after ib tests, etc. since some blocks require
>>>     	 * explicit gating rather than handling it automatically.
>>>     	 */
>>> -	r = amdgpu_device_ip_late_init(adev);
>>> -	if (r) {
>>> -		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>>> -		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>>> -		goto failed;
>>> +	if (!adev->gmc.xgmi.pending_reset) {
>>> +		r = amdgpu_device_ip_late_init(adev);
>>> +		if (r) {
>>> +			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>>> +			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>>> +			goto failed;
>>> +		}
>>> +		/* must succeed. */
>>> +		amdgpu_ras_resume(adev);
>>> +		queue_delayed_work(system_wq, &adev->delayed_init_work,
>>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>>     	}
>>>     
>>> -	/* must succeed. */
>>> -	amdgpu_ras_resume(adev);
>>> -
>>> -	queue_delayed_work(system_wq, &adev->delayed_init_work,
>>> -			   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>> -
>>>     	if (amdgpu_sriov_vf(adev))
>>>     		flush_delayed_work(&adev->delayed_init_work);
>>>     
>>> @@ -3570,6 +3608,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>     	if (amdgpu_device_cache_pci_state(adev->pdev))
>>>     		pci_restore_state(pdev);
>>>     
>>> +	if (adev->gmc.xgmi.pending_reset)
>>> +		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
>>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>> +
>>>     	return 0;
>>>     
>>>     failed:
>>> @@ -4240,14 +4282,16 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
>>>     }
>>>     
>>>     
>>> -static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>> -					struct amdgpu_job *job,
>>> -					bool *need_full_reset_arg)
>>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>> +				  struct amdgpu_job *job,
>>> +				  bool *need_full_reset_arg)
>>>     {
>>>     	int i, r = 0;
>>>     	bool need_full_reset  = *need_full_reset_arg;
>>>     
>>> -	amdgpu_debugfs_wait_dump(adev);
>>> +	/* no need to dump if device is not in good state during probe period */
>>> +	if (!adev->gmc.xgmi.pending_reset)
>>> +		amdgpu_debugfs_wait_dump(adev);
>>>     
>>>     	if (amdgpu_sriov_vf(adev)) {
>>>     		/* stop the data exchange thread */ @@ -4293,10 +4337,10 @@ 
>>> static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>     	return r;
>>>     }
>>>     
>>> -static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>> -			       struct list_head *device_list_handle,
>>> -			       bool *need_full_reset_arg,
>>> -			       bool skip_hw_reset)
>>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>> +			  struct list_head *device_list_handle,
>>> +			  bool *need_full_reset_arg,
>>> +			  bool skip_hw_reset)
>>>     {
>>>     	struct amdgpu_device *tmp_adev = NULL;
>>>     	bool need_full_reset = *need_full_reset_arg, vram_lost = false; 
>>> @@
>>> -4310,6 +4354,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>     		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>>     			/* For XGMI run all resets in parallel to speed up the process */
>>>     			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
>>> +				tmp_adev->gmc.xgmi.pending_reset = false;
>>>     				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>>     					r = -EALREADY;
>>>     			} else
>>> @@ -4348,10 +4393,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>     	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>>     		if (need_full_reset) {
>>>     			/* post card */
>>> -			if (amdgpu_device_asic_init(tmp_adev))
>>> +			r = amdgpu_device_asic_init(tmp_adev);
>>> +			if (r) {
>>>     				dev_warn(tmp_adev->dev, "asic atom init failed!");
>>> -
>>> -			if (!r) {
>>> +			} else {
>>>     				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
>>>     				r = amdgpu_device_ip_resume_phase1(tmp_adev);
>>>     				if (r)
>>> @@ -4384,6 +4429,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>     				 */
>>>     				amdgpu_register_gpu_instance(tmp_adev);
>>>     
>>> +				if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
>>> +					amdgpu_xgmi_add_device(tmp_adev);
>>> +
>>>     				r = amdgpu_device_ip_late_init(tmp_adev);
>>>     				if (r)
>>>     					goto out;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 253c59e0a100..aebe4bc561ee 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -44,6 +44,7 @@
>>>     #include "amdgpu_amdkfd.h"
>>>     
>>>     #include "amdgpu_ras.h"
>>> +#include "amdgpu_xgmi.h"
>>>     
>>>     /*
>>>      * KMS wrapper.
>>> @@ -167,8 +168,13 @@ uint amdgpu_freesync_vid_mode;
>>>     int amdgpu_reset_method = -1; /* auto */
>>>     int amdgpu_num_kcq = -1;
>>>     
>>> +static void amdgpu_drv_delayed_reset_work_handler(struct 
>>> +work_struct *work);
>>> +
>>>     struct amdgpu_mgpu_info mgpu_info = {
>>>     	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
>>> +	.delayed_reset_work = __DELAYED_WORK_INITIALIZER(
>>> +			mgpu_info.delayed_reset_work,
>>> +			amdgpu_drv_delayed_reset_work_handler, 0),
>>>     };
>>>     int amdgpu_ras_enable = -1;
>>>     uint amdgpu_ras_mask = 0xffffffff; @@ -1297,6 +1303,71 @@ 
>>> amdgpu_pci_shutdown(struct pci_dev *pdev)
>>>     	adev->mp1_state = PP_MP1_STATE_NONE;
>>>     }
>>>     
>>> +/**
>>> + * amdgpu_drv_delayed_reset_work_handler - work handler for reset
>>> + *
>>> + * @work: work_struct.
>>> + */
>>> +static void amdgpu_drv_delayed_reset_work_handler(struct 
>>> +work_struct
>>> +*work) {
>>> +	struct list_head device_list;
>>> +	struct amdgpu_device *adev;
>>> +	int i, r;
>>> +	bool need_full_reset = true;
>>> +
>>> +	mutex_lock(&mgpu_info.mutex);
>>> +	if (mgpu_info.pending_reset == true) {
>>> +		mutex_unlock(&mgpu_info.mutex);
>>> +		return;
>>> +	}
>>> +	mgpu_info.pending_reset = true;
>>> +	mutex_unlock(&mgpu_info.mutex);
>>> +
>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>> +		r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
>>
>> Why amdgpu_device_pre_asic_reset is needed ?
>>
>>> +		if (r) {
>>> +			dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
>>> +				r, adev_to_drm(adev)->unique);
>>> +		}
>>> +		if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
>>> +			r = -EALREADY;
>>
>> amdgpu_do_asic_reset bellow will already schedule xgmi_reset_work for this device, what you could do instead is call amdgpu_do_asic_reset for each member of the hive and because there is a task barrier in amdgpu_device_xgmi_reset_func, it will synchronize all the resets to same point in time already.
>>
>>> +	}
>>> +	msleep(100);
>>
>> What's the 100ms is wiating for ?
>>
>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>> +		adev->gmc.xgmi.pending_reset = false;
>>> +		flush_work(&adev->xgmi_reset_work);
>>> +	}
>>> +
>>> +	msleep(100);
>>
>> Same as above
>>
>>> +	/* reset function will rebuild the xgmi hive info , clear it now */
>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>>> +		amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
>>> +
>>> +	INIT_LIST_HEAD(&device_list);
>>> +
>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>>> +		list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list,
>>> +&device_list);
>>> +
>>> +	/* unregister the GPU first, reset function will add them back */
>>> +	list_for_each_entry(adev, &device_list, reset_list)
>>> +		amdgpu_unregister_gpu_instance(adev);
>>> +
>>> +	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
>>> +	if (r) {
>>> +		DRM_ERROR("reinit gpus failure");
>>> +		return;
>>> +	}
>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>> +		if (!adev->kfd.init_complete)
>>> +			amdgpu_amdkfd_device_init(adev);
>>> +		amdgpu_ttm_set_buffer_funcs_status(adev, true);
>>> +	}
>>> +	return;
>>> +}
>>> +
>>>     static int amdgpu_pmops_suspend(struct device *dev)
>>>     {
>>>     	struct drm_device *drm_dev = dev_get_drvdata(dev); diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> index aa0c83776ce0..8c71d84a2fbe 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> @@ -149,6 +149,7 @@ struct amdgpu_xgmi {
>>>     	struct list_head head;
>>>     	bool supported;
>>>     	struct ras_common_if *ras_if;
>>> +	bool pending_reset;
>>>     };
>>>     
>>>     struct amdgpu_gmc {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>> index 659b385b27b5..b459ef755ea9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>> @@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>>     	if (!adev->gmc.xgmi.supported)
>>>     		return 0;
>>>     
>>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>> +	if (!adev->gmc.xgmi.pending_reset &&
>>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>     		ret = psp_xgmi_initialize(&adev->psp);
>>>     		if (ret) {
>>>     			dev_err(adev->dev,
>>> @@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device
>>> *adev)
>>>     
>>>     	task_barrier_add_task(&hive->tb);
>>>     
>>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>> +	if (!adev->gmc.xgmi.pending_reset &&
>>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>     		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>>>     			/* update node list for other device in the hive */
>>>     			if (tmp_adev != adev) {
>>> @@ -567,7 +569,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>>     		}
>>>     	}
>>>     
>>> -	if (!ret)
>>> +	if (!ret && !adev->gmc.xgmi.pending_reset)
>>>     		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
>>>     
>>>     exit_unlock:
>>>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CSh
> aoyun.Liu%40amd.com%7C73608bda6abd48a3047608d8e0e02e4f%7C3dd8961fe4884
> e608e11a82d994e183d%7C0%7C0%7C637506600750277165%7CUnknown%7CTWFpbGZsb
> 3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%
> 7C1000&amp;sdata=2qASlLYQ08twP0Ud5EcisCsVlJ9WG0QJv5idbuhDe4o%3D&amp;re
> served=0
> 
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe
  2021-03-08 15:27             ` Liu, Shaoyun
@ 2021-03-08 15:53               ` Andrey Grodzovsky
  2021-03-08 16:10                 ` Liu, Shaoyun
  0 siblings, 1 reply; 12+ messages in thread
From: Andrey Grodzovsky @ 2021-03-08 15:53 UTC (permalink / raw)
  To: Liu, Shaoyun, amd-gfx

I see, thanks for explaining.

Andrey

On 2021-03-08 10:27 a.m., Liu, Shaoyun wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> Check the function amdgpu_xgmi_add_device: when the PSP XGMI TA is not available, the driver will assign a fake hive ID 0x10 to all GPUs, which means all GPUs will belong to the same hive. So I can still use hive->tb to sync the reset on all GPUs. The reason I can not use the default amdgpu_do_asic_reset function is that we want to build the correct hive and node topology for all GPUs after reset, so we need to call amdgpu_xgmi_add_device inside the amdgpu_do_asic_reset function. To make this work, we need to destroy the hive by removing the devices (call amdgpu_xgmi_remove_device) first, so when calling amdgpu_do_asic_reset, the fake hive (0x10) has already been destroyed, and the hive->tb will not work in this case. That's the reason I need to call the reset explicitly with the fake hive and then destroy the hive, then build the device_list for amdgpu_do_asic_reset without the hive.
> Hope I explained it clearly.
> 
> Thanks
> Shaoyun.liu
> 
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Monday, March 8, 2021 1:28 AM
> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe
> 
> But the hive->tb object is used regardless, inside amdgpu_device_xgmi_reset_func currently; it means that even when you explicitly schedule xgmi_reset_work as you do now, the code will try to sync using a not-well-initialized tb object. Maybe you can define a global static tb object, fill it in the loop where you send xgmi_reset_work for all devices in the system, and use it from within amdgpu_device_xgmi_reset_func instead of the regular per-hive tb object (obviously only under your special use case).
> 
> Andrey
> 
> On 2021-03-06 4:11 p.m., Liu, Shaoyun wrote:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> It seems I can not directly reuse the reset HW function inside amdgpu_do_asic_reset; the synchronization is based on hive->tb, but as explained, we actually don't know which hive the GPU belongs to and will rebuild the correct hive info inside the amdgpu_do_asic_reset function with amdgpu_xgmi_add_device. So I need to remove all GPUs from the hive first. This will cause the sync not to work, since the hive->tb will be removed as well when all GPUs are removed.
>>
>> Thanks
>> shaopyunliu
>>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> Liu, Shaoyun
>> Sent: Saturday, March 6, 2021 3:41 PM
>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>;
>> amd-gfx@lists.freedesktop.org
>> Subject: RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI
>> hive duirng probe
>>
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> I call the amdgpu_do_asic_reset with the parameter skip_hw_reset = true so the reset won't be executed twice. But probably I can set this parameter to true and remove the code that schedules the reset, since now I already build the device_list not based on the hive. Let me try that.
>> For the scheduled delayed work thread with AMDGPU_RESUME_MS: it's actually not waiting for the SMU to start. As I explained, I need to reset all the GPUs in the system since I don't know which GPUs belong to which hive. So this time allows the system to probe all the GPUs in the system, which means that when this delayed thread starts, we can assume all the devices have already been populated in mgpu_info.
>>
>> Regards
>> Shaoyun.liu
>>
>> -----Original Message-----
>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> Sent: Saturday, March 6, 2021 1:09 AM
>> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI
>> hive duirng probe
>>
>> Thanks for explaining this. One thing I still don't understand is why you schedule the reset work explicitly at the beginning of amdgpu_drv_delayed_reset_work_handler and then also call amdgpu_do_asic_reset, which will do the same thing too. It looks like the physical reset will execute twice for each device.
>> Another thing is, more of an improvement suggestion - currently you schedule delayed_reset_work using AMDGPU_RESUME_MS - so I guess this should give enough time for the SMU to start? Is there maybe a way to instead poll for SMU start completion and then execute this - some SMU status registers maybe? Just to avoid relying on this arbitrary value.
>>
>> Andrey
>>
>> On 2021-03-05 8:37 p.m., Liu, Shaoyun wrote:
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>> Hi,  Andrey
>>> The existing reset functions (amdgpu_device_gpu_recover or amdgpu_do_asic_reset) assume the driver already has the correct hive info. But in my case, that's not true. The GPUs are in a bad state and the XGMI TA might not function properly, so the driver cannot get the hive and node info when probing the device. It means the driver doesn't even know which hive the device belongs to on a system with a multiple-hive configuration (e.g., 8 GPUs in two hives). The only solution I can think of is to let the driver trigger the reset on all GPUs at the same time after the driver does the minimal initialization on the HW (bring up the SMU IP), no matter whether they belong to the same hive or not, and call amdgpu_xgmi_add_device for each device after re-init.
>>> The 100 ms delays were added after the BACO reset. I think they can be removed; let me verify that.
>>>
>>> Regards
>>> Shaoyun.liu
>>>
>>>
>>>
>>> -----Original Message-----
>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>> Sent: Friday, March 5, 2021 2:27 PM
>>> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI
>>> hive duirng probe
>>>
>>>
>>>
>>> On 2021-03-05 12:52 p.m., shaoyunl wrote:
>>>> In a passthrough configuration, the hypervisor will trigger the
>>>> SBR (Secondary Bus Reset) to the devices without syncing with each
>>>> other. This could cause a device hang since, for an XGMI
>>>> configuration, all the devices within the hive need to be reset
>>>> within a limited time slot. This series of patches tries to solve
>>>> this issue by co-operating with the new SMU, which will only do
>>>> minimal housekeeping to respond to the SBR request but won't do the
>>>> real reset job, leaving it to the driver. The driver needs to do the
>>>> whole SW init and minimal HW init to bring up the SMU and trigger
>>>> the reset (possibly BACO) on all the ASICs at the same time.
>>>>
>>>> Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
>>>> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  13 +++
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++------
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  71 ++++++++++++++
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |   1 +
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
>>>>      5 files changed, 165 insertions(+), 30 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index d46d3794699e..5602c6edee97 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
>>>>      	uint32_t			num_gpu;
>>>>      	uint32_t			num_dgpu;
>>>>      	uint32_t			num_apu;
>>>> +
>>>> +	/* delayed reset_func for XGMI configuration if necessary */
>>>> +	struct delayed_work		delayed_reset_work;
>>>> +	bool				pending_reset;
>>>>      };
>>>>      
>>>>      #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
>>>> @@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>>>>      bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
>>>>      bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
>>>>      
>>>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>> +				  struct amdgpu_job *job,
>>>> +				  bool *need_full_reset_arg);
>>>> +
>>>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>> +			  struct list_head *device_list_handle,
>>>> +			  bool *need_full_reset_arg,
>>>> +			  bool skip_hw_reset);
>>>> +
>>>>      int emu_soc_asic_init(struct amdgpu_device *adev);
>>>>      
>>>>      /*
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 3c35b0c1e710..5b520f70e660 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
>>>>      		}
>>>>      	}
>>>>      
>>>> +	/* Don't post if we need to reset whole hive on init */
>>>> +	if (adev->gmc.xgmi.pending_reset)
>>>> +		return false;
>>>> +
>>>>      	if (adev->has_hw_reset) {
>>>>      		adev->has_hw_reset = false;
>>>>      		return true;
>>>> @@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>>>>      			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
>>>>      				continue;
>>>>      
>>>> +			if (!adev->ip_blocks[i].status.sw)
>>>> +				continue;
>>>> +
>>>>      			/* no need to do the fw loading again if already done*/
>>>>      			if (adev->ip_blocks[i].status.hw == true)
>>>>      				break;
>>>> @@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct
>>>> amdgpu_device *adev)
>>>>      
>>>>      	if (adev->gmc.xgmi.num_physical_nodes > 1)
>>>>      		amdgpu_xgmi_add_device(adev);
>>>> -	amdgpu_amdkfd_device_init(adev);
>>>> +
>>>> +	/* Don't init kfd if whole hive need to be reset during init */
>>>> +	if (!adev->gmc.xgmi.pending_reset)
>>>> +		amdgpu_amdkfd_device_init(adev);
>>>>      
>>>>      	amdgpu_fru_get_product_info(adev);
>>>>      
>>>> @@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
>>>>      			adev->ip_blocks[i].status.hw = false;
>>>>      			continue;
>>>>      		}
>>>> +
>>>> +		/* skip unnecessary suspend if we do not initialize them yet */
>>>> +		if (adev->gmc.xgmi.pending_reset &&
>>>> +		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
>>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
>>>> +			adev->ip_blocks[i].status.hw = false;
>>>> +			continue;
>>>> +		}
>>>>      		/* XXX handle errors */
>>>>      		r = adev->ip_blocks[i].version->funcs->suspend(adev);
>>>>      		/* XXX handle errors */
>>>> @@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>>      	 *  E.g., driver was not cleanly unloaded previously, etc.
>>>>      	 */
>>>>      	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
>>>> -		r = amdgpu_asic_reset(adev);
>>>> -		if (r) {
>>>> -			dev_err(adev->dev, "asic reset on init failed\n");
>>>> -			goto failed;
>>>> +		if (adev->gmc.xgmi.num_physical_nodes) {
>>>> +			dev_info(adev->dev, "Pending hive reset.\n");
>>>> +			adev->gmc.xgmi.pending_reset = true;
>>>> +			/* Only need to init necessary block for SMU to handle the reset */
>>>> +			for (i = 0; i < adev->num_ip_blocks; i++) {
>>>> +				if (!adev->ip_blocks[i].status.valid)
>>>> +					continue;
>>>> +				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
>>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
>>>> +					DRM_DEBUG("IP %s disabed for hw_init.\n",
>>>> +						adev->ip_blocks[i].version->funcs->name);
>>>> +					adev->ip_blocks[i].status.hw = true;
>>>> +				}
>>>> +			}
>>>> +		} else {
>>>> +			r = amdgpu_asic_reset(adev);
>>>> +			if (r) {
>>>> +				dev_err(adev->dev, "asic reset on init failed\n");
>>>> +				goto failed;
>>>> +			}
>>>>      		}
>>>>      	}
>>>>      
>>>> @@ -3541,19 +3579,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>>      	/* enable clockgating, etc. after ib tests, etc. since some blocks require
>>>>      	 * explicit gating rather than handling it automatically.
>>>>      	 */
>>>> -	r = amdgpu_device_ip_late_init(adev);
>>>> -	if (r) {
>>>> -		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>>>> -		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>>>> -		goto failed;
>>>> +	if (!adev->gmc.xgmi.pending_reset) {
>>>> +		r = amdgpu_device_ip_late_init(adev);
>>>> +		if (r) {
>>>> +			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>>>> +			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>>>> +			goto failed;
>>>> +		}
>>>> +		/* must succeed. */
>>>> +		amdgpu_ras_resume(adev);
>>>> +		queue_delayed_work(system_wq, &adev->delayed_init_work,
>>>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>>>      	}
>>>>      
>>>> -	/* must succeed. */
>>>> -	amdgpu_ras_resume(adev);
>>>> -
>>>> -	queue_delayed_work(system_wq, &adev->delayed_init_work,
>>>> -			   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>>> -
>>>>      	if (amdgpu_sriov_vf(adev))
>>>>      		flush_delayed_work(&adev->delayed_init_work);
>>>>      
>>>> @@ -3570,6 +3608,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>>      	if (amdgpu_device_cache_pci_state(adev->pdev))
>>>>      		pci_restore_state(pdev);
>>>>      
>>>> +	if (adev->gmc.xgmi.pending_reset)
>>>> +		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
>>>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>>> +
>>>>      	return 0;
>>>>      
>>>>      failed:
>>>> @@ -4240,14 +4282,16 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
>>>>      }
>>>>      
>>>>      
>>>> -static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>> -					struct amdgpu_job *job,
>>>> -					bool *need_full_reset_arg)
>>>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>> +				  struct amdgpu_job *job,
>>>> +				  bool *need_full_reset_arg)
>>>>      {
>>>>      	int i, r = 0;
>>>>      	bool need_full_reset  = *need_full_reset_arg;
>>>>      
>>>> -	amdgpu_debugfs_wait_dump(adev);
>>>> +	/* no need to dump if device is not in good state during probe period */
>>>> +	if (!adev->gmc.xgmi.pending_reset)
>>>> +		amdgpu_debugfs_wait_dump(adev);
>>>>      
>>>>      	if (amdgpu_sriov_vf(adev)) {
>>>>      		/* stop the data exchange thread */ @@ -4293,10 +4337,10 @@
>>>> static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>      	return r;
>>>>      }
>>>>      
>>>> -static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>> -			       struct list_head *device_list_handle,
>>>> -			       bool *need_full_reset_arg,
>>>> -			       bool skip_hw_reset)
>>>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>> +			  struct list_head *device_list_handle,
>>>> +			  bool *need_full_reset_arg,
>>>> +			  bool skip_hw_reset)
>>>>      {
>>>>      	struct amdgpu_device *tmp_adev = NULL;
>>>>      	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
>>>> @@
>>>> -4310,6 +4354,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>      		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>>>      			/* For XGMI run all resets in parallel to speed up the process */
>>>>      			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
>>>> +				tmp_adev->gmc.xgmi.pending_reset = false;
>>>>      				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>>>      					r = -EALREADY;
>>>>      			} else
>>>> @@ -4348,10 +4393,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>      	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>>>      		if (need_full_reset) {
>>>>      			/* post card */
>>>> -			if (amdgpu_device_asic_init(tmp_adev))
>>>> +			r = amdgpu_device_asic_init(tmp_adev);
>>>> +			if (r) {
>>>>      				dev_warn(tmp_adev->dev, "asic atom init failed!");
>>>> -
>>>> -			if (!r) {
>>>> +			} else {
>>>>      				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
>>>>      				r = amdgpu_device_ip_resume_phase1(tmp_adev);
>>>>      				if (r)
>>>> @@ -4384,6 +4429,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>      				 */
>>>>      				amdgpu_register_gpu_instance(tmp_adev);
>>>>      
>>>> +				if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
>>>> +					amdgpu_xgmi_add_device(tmp_adev);
>>>> +
>>>>      				r = amdgpu_device_ip_late_init(tmp_adev);
>>>>      				if (r)
>>>>      					goto out;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> index 253c59e0a100..aebe4bc561ee 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> @@ -44,6 +44,7 @@
>>>>      #include "amdgpu_amdkfd.h"
>>>>      
>>>>      #include "amdgpu_ras.h"
>>>> +#include "amdgpu_xgmi.h"
>>>>      
>>>>      /*
>>>>       * KMS wrapper.
>>>> @@ -167,8 +168,13 @@ uint amdgpu_freesync_vid_mode;
>>>>      int amdgpu_reset_method = -1; /* auto */
>>>>      int amdgpu_num_kcq = -1;
>>>>      
>>>> +static void amdgpu_drv_delayed_reset_work_handler(struct
>>>> +work_struct *work);
>>>> +
>>>>      struct amdgpu_mgpu_info mgpu_info = {
>>>>      	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
>>>> +	.delayed_reset_work = __DELAYED_WORK_INITIALIZER(
>>>> +			mgpu_info.delayed_reset_work,
>>>> +			amdgpu_drv_delayed_reset_work_handler, 0),
>>>>      };
>>>>      int amdgpu_ras_enable = -1;
>>>>      uint amdgpu_ras_mask = 0xffffffff; @@ -1297,6 +1303,71 @@
>>>> amdgpu_pci_shutdown(struct pci_dev *pdev)
>>>>      	adev->mp1_state = PP_MP1_STATE_NONE;
>>>>      }
>>>>      
>>>> +/**
>>>> + * amdgpu_drv_delayed_reset_work_handler - work handler for reset
>>>> + *
>>>> + * @work: work_struct.
>>>> + */
>>>> +static void amdgpu_drv_delayed_reset_work_handler(struct
>>>> +work_struct
>>>> +*work) {
>>>> +	struct list_head device_list;
>>>> +	struct amdgpu_device *adev;
>>>> +	int i, r;
>>>> +	bool need_full_reset = true;
>>>> +
>>>> +	mutex_lock(&mgpu_info.mutex);
>>>> +	if (mgpu_info.pending_reset == true) {
>>>> +		mutex_unlock(&mgpu_info.mutex);
>>>> +		return;
>>>> +	}
>>>> +	mgpu_info.pending_reset = true;
>>>> +	mutex_unlock(&mgpu_info.mutex);
>>>> +
>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>>> +		r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
>>>
>>> Why amdgpu_device_pre_asic_reset is needed ?
>>>
>>>> +		if (r) {
>>>> +			dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
>>>> +				r, adev_to_drm(adev)->unique);
>>>> +		}
>>>> +		if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
>>>> +			r = -EALREADY;
>>>
>>> amdgpu_do_asic_reset bellow will already schedule xgmi_reset_work for this device, what you could do instead is call amdgpu_do_asic_reset for each member of the hive and because there is a task barrier in amdgpu_device_xgmi_reset_func, it will synchronize all the resets to same point in time already.
>>>
>>>> +	}
>>>> +	msleep(100);
>>>
>>> What's the 100ms is wiating for ?
>>>
>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>>> +		adev->gmc.xgmi.pending_reset = false;
>>>> +		flush_work(&adev->xgmi_reset_work);
>>>> +	}
>>>> +
>>>> +	msleep(100);
>>>
>>> Same as above
>>>
>>>> +	/* reset function will rebuild the xgmi hive info , clear it now */
>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>>>> +		amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
>>>> +
>>>> +	INIT_LIST_HEAD(&device_list);
>>>> +
>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>>>> +		list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list,
>>>> +&device_list);
>>>> +
>>>> +	/* unregister the GPU first, reset function will add them back */
>>>> +	list_for_each_entry(adev, &device_list, reset_list)
>>>> +		amdgpu_unregister_gpu_instance(adev);
>>>> +
>>>> +	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
>>>> +	if (r) {
>>>> +		DRM_ERROR("reinit gpus failure");
>>>> +		return;
>>>> +	}
>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>>> +		if (!adev->kfd.init_complete)
>>>> +			amdgpu_amdkfd_device_init(adev);
>>>> +		amdgpu_ttm_set_buffer_funcs_status(adev, true);
>>>> +	}
>>>> +	return;
>>>> +}
>>>> +
>>>>      static int amdgpu_pmops_suspend(struct device *dev)
>>>>      {
>>>>      	struct drm_device *drm_dev = dev_get_drvdata(dev); diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> index aa0c83776ce0..8c71d84a2fbe 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> @@ -149,6 +149,7 @@ struct amdgpu_xgmi {
>>>>      	struct list_head head;
>>>>      	bool supported;
>>>>      	struct ras_common_if *ras_if;
>>>> +	bool pending_reset;
>>>>      };
>>>>      
>>>>      struct amdgpu_gmc {
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>>> index 659b385b27b5..b459ef755ea9 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>>> @@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>>>      	if (!adev->gmc.xgmi.supported)
>>>>      		return 0;
>>>>      
>>>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>> +	if (!adev->gmc.xgmi.pending_reset &&
>>>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>>      		ret = psp_xgmi_initialize(&adev->psp);
>>>>      		if (ret) {
>>>>      			dev_err(adev->dev,
>>>> @@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device
>>>> *adev)
>>>>      
>>>>      	task_barrier_add_task(&hive->tb);
>>>>      
>>>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>> +	if (!adev->gmc.xgmi.pending_reset &&
>>>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>>      		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>>>>      			/* update node list for other device in the hive */
>>>>      			if (tmp_adev != adev) {
>>>> @@ -567,7 +569,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>>>      		}
>>>>      	}
>>>>      
>>>> -	if (!ret)
>>>> +	if (!ret && !adev->gmc.xgmi.pending_reset)
>>>>      		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
>>>>      
>>>>      exit_unlock:
>>>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CSh
>> aoyun.Liu%40amd.com%7C73608bda6abd48a3047608d8e0e02e4f%7C3dd8961fe4884
>> e608e11a82d994e183d%7C0%7C0%7C637506600750277165%7CUnknown%7CTWFpbGZsb
>> 3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%
>> 7C1000&amp;sdata=2qASlLYQ08twP0Ud5EcisCsVlJ9WG0QJv5idbuhDe4o%3D&amp;re
>> served=0
>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe
  2021-03-08 15:53               ` Andrey Grodzovsky
@ 2021-03-08 16:10                 ` Liu, Shaoyun
  2021-03-08 17:26                   ` Andrey Grodzovsky
  0 siblings, 1 reply; 12+ messages in thread
From: Liu, Shaoyun @ 2021-03-08 16:10 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx

[AMD Official Use Only - Internal Distribution Only]

Hi, Andrey. 
The first 3 patches in this series have already been acked by Alex D.; can you help review the remaining two? 

Thanks
Shaoyun.liu

-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> 
Sent: Monday, March 8, 2021 10:53 AM
To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe

I see, thanks for explaining.

Andrey

On 2021-03-08 10:27 a.m., Liu, Shaoyun wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> Check the function amdgpu_xgmi_add_device: when the PSP XGMI TA is not available, the driver will assign a faked hive ID 0x10 for all GPUs, which means all GPUs will belong to the same hive. So I can still use hive->tb to sync the reset on all GPUs. The reason I cannot use the default amdgpu_do_asic_reset function is that we want to build the correct hive and node topology for all GPUs after reset, so we need to call amdgpu_xgmi_add_device inside the amdgpu_do_asic_reset function. To make this work, we need to destroy the hive by removing the devices (call amdgpu_xgmi_remove_device) first, so by the time amdgpu_do_asic_reset is called, the faked hive (0x10) has already been destroyed, and hive->tb will not work in this case. That's the reason I need to call the reset explicitly with the faked hive, then destroy the hive and build the device_list for amdgpu_do_asic_reset without the hive.
> Hope I explain it clearly .
> 
> Thanks
> Shaoyun.liu
> 
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Monday, March 8, 2021 1:28 AM
> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI 
> hive duirng probe
> 
> But the hive->tb object is used regardless, inside amdgpu_device_xgmi_reset_func currently; it means that even when you explicitly schedule xgmi_reset_work as you do now, the code will try to sync using a not-well-initialized tb object. Maybe you can define a global static tb object, fill it in the loop where you send xgmi_reset_work for all devices in the system, and use it from within amdgpu_device_xgmi_reset_func instead of the regular per-hive tb object (obviously under your special use case).
> 
> Andrey
> 
> On 2021-03-06 4:11 p.m., Liu, Shaoyun wrote:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> It seems I cannot directly reuse the reset HW function inside amdgpu_do_asic_reset; the synchronization is based on hive->tb, but as explained, we actually don't know which hive the GPU belongs to and will rebuild the correct hive info inside the amdgpu_do_asic_reset function with amdgpu_xgmi_add_device. So I need to remove all GPUs from the hive first. This will lead to the sync not working, since the hive->tb will be removed as well when all GPUs are removed.
>>
>> Thanks
>> shaopyunliu
>>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of 
>> Liu, Shaoyun
>> Sent: Saturday, March 6, 2021 3:41 PM
>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; 
>> amd-gfx@lists.freedesktop.org
>> Subject: RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI 
>> hive duirng probe
>>
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> I call amdgpu_do_asic_reset with the parameter skip_hw_reset = true so the reset won't be executed twice. But probably I can set this parameter to true and remove the code that schedules the reset, since now I already build the device_list not based on the hive. Let me try that.
>> For the scheduled delayed work thread with AMDGPU_RESUME_MS, it's actually not waiting for the SMU to start. As I explained, I need to reset all the GPUs in the system since I don't know which GPUs belong to which hive. So this time allows the system to probe all the GPUs in the system, which means when this delayed thread starts, we can assume all the devices have already been populated in mgpu_info.
>>
>> Regards
>> Shaoyun.liu
>>
>> -----Original Message-----
>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> Sent: Saturday, March 6, 2021 1:09 AM
>> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI 
>> hive duirng probe
>>
>> Thanks for explaining this; one thing I still don't understand is why you schedule the reset work explicitly in the beginning of amdgpu_drv_delayed_reset_work_handler and then also call amdgpu_do_asic_reset, which will do the same thing too. It looks like the physical reset will execute twice for each device.
>> Another thing, more of an improvement suggestion - currently you schedule delayed_reset_work using AMDGPU_RESUME_MS - so I guess this should give enough time for the SMU to start? Is there maybe a way to instead poll for SMU start completion and then execute this - some SMU status registers maybe? Just to avoid relying on this arbitrary value.
>>
>> Andrey
>>
>> On 2021-03-05 8:37 p.m., Liu, Shaoyun wrote:
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>> Hi,  Andrey
>>> The existing reset functions (amdgpu_device_gpu_recover or amdgpu_do_asic_reset) assume the driver already has the correct hive info. But in my case, that's not true. The GPUs are in a bad state and the XGMI TA might not function properly, so the driver cannot get the hive and node info when probing the device. It means the driver doesn't even know which hive the device belongs to on a system with a multiple-hive configuration (e.g., 8 GPUs in two hives). The only solution I can think of is to let the driver trigger the reset on all GPUs at the same time after it does the minimum initialization on the HW (bring up the SMU IP), no matter whether they belong to the same hive or not, and call amdgpu_xgmi_add_device for each device after re-init.
>>> The 100 ms delay added after the baco reset . I think they can be removed . let me verify it.
>>>
>>> Regards
>>> Shaoyun.liu
>>>
>>>
>>>
>>> -----Original Message-----
>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>> Sent: Friday, March 5, 2021 2:27 PM
>>> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; 
>>> amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI 
>>> hive duirng probe
>>>
>>>
>>>
>>> On 2021-03-05 12:52 p.m., shaoyunl wrote:
>>>> In passthrough configuration, hypervisior will trigger the 
>>>> SBR(Secondary bus reset) to the devices without sync to each other.
>>>> This could cause device hang since for XGMI configuration, all the 
>>>> devices within the hive need to be reset at a limit time slot. This 
>>>> serial of patches try to solve this issue by co-operate with new 
>>>> SMU which will only do minimum house keeping to response the SBR 
>>>> request but don't do the real reset job and leave it to driver. 
>>>> Driver need to do the whole sw init and minimum HW init to bring up 
>>>> the SMU and trigger the reset(possibly BACO) on all the ASICs at 
>>>> the same time
>>>>
>>>> Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
>>>> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  13 +++
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++------
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  71 ++++++++++++++
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |   1 +
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
>>>>      5 files changed, 165 insertions(+), 30 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index d46d3794699e..5602c6edee97 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
>>>>      	uint32_t			num_gpu;
>>>>      	uint32_t			num_dgpu;
>>>>      	uint32_t			num_apu;
>>>> +
>>>> +	/* delayed reset_func for XGMI configuration if necessary */
>>>> +	struct delayed_work		delayed_reset_work;
>>>> +	bool				pending_reset;
>>>>      };
>>>>      
>>>>      #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
>>>> @@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>>>>      bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
>>>>      bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
>>>>      
>>>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>> +				  struct amdgpu_job *job,
>>>> +				  bool *need_full_reset_arg);
>>>> +
>>>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>> +			  struct list_head *device_list_handle,
>>>> +			  bool *need_full_reset_arg,
>>>> +			  bool skip_hw_reset);
>>>> +
>>>>      int emu_soc_asic_init(struct amdgpu_device *adev);
>>>>      
>>>>      /*
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 3c35b0c1e710..5b520f70e660 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
>>>>      		}
>>>>      	}
>>>>      
>>>> +	/* Don't post if we need to reset whole hive on init */
>>>> +	if (adev->gmc.xgmi.pending_reset)
>>>> +		return false;
>>>> +
>>>>      	if (adev->has_hw_reset) {
>>>>      		adev->has_hw_reset = false;
>>>>      		return true;
>>>> @@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>>>>      			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
>>>>      				continue;
>>>>      
>>>> +			if (!adev->ip_blocks[i].status.sw)
>>>> +				continue;
>>>> +
>>>>      			/* no need to do the fw loading again if already done*/
>>>>      			if (adev->ip_blocks[i].status.hw == true)
>>>>      				break;
>>>> @@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct 
>>>> amdgpu_device *adev)
>>>>      
>>>>      	if (adev->gmc.xgmi.num_physical_nodes > 1)
>>>>      		amdgpu_xgmi_add_device(adev);
>>>> -	amdgpu_amdkfd_device_init(adev);
>>>> +
>>>> +	/* Don't init kfd if whole hive need to be reset during init */
>>>> +	if (!adev->gmc.xgmi.pending_reset)
>>>> +		amdgpu_amdkfd_device_init(adev);
>>>>      
>>>>      	amdgpu_fru_get_product_info(adev);
>>>>      
>>>> @@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
>>>>      			adev->ip_blocks[i].status.hw = false;
>>>>      			continue;
>>>>      		}
>>>> +
>>>> +		/* skip unnecessary suspend if we do not initialize them yet */
>>>> +		if (adev->gmc.xgmi.pending_reset &&
>>>> +		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
>>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
>>>> +			adev->ip_blocks[i].status.hw = false;
>>>> +			continue;
>>>> +		}
>>>>      		/* XXX handle errors */
>>>>      		r = adev->ip_blocks[i].version->funcs->suspend(adev);
>>>>      		/* XXX handle errors */
>>>> @@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>>      	 *  E.g., driver was not cleanly unloaded previously, etc.
>>>>      	 */
>>>>      	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
>>>> -		r = amdgpu_asic_reset(adev);
>>>> -		if (r) {
>>>> -			dev_err(adev->dev, "asic reset on init failed\n");
>>>> -			goto failed;
>>>> +		if (adev->gmc.xgmi.num_physical_nodes) {
>>>> +			dev_info(adev->dev, "Pending hive reset.\n");
>>>> +			adev->gmc.xgmi.pending_reset = true;
>>>> +			/* Only need to init necessary block for SMU to handle the reset */
>>>> +			for (i = 0; i < adev->num_ip_blocks; i++) {
>>>> +				if (!adev->ip_blocks[i].status.valid)
>>>> +					continue;
>>>> +				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
>>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
>>>> +					DRM_DEBUG("IP %s disabed for hw_init.\n",
>>>> +						adev->ip_blocks[i].version->funcs->name);
>>>> +					adev->ip_blocks[i].status.hw = true;
>>>> +				}
>>>> +			}
>>>> +		} else {
>>>> +			r = amdgpu_asic_reset(adev);
>>>> +			if (r) {
>>>> +				dev_err(adev->dev, "asic reset on init failed\n");
>>>> +				goto failed;
>>>> +			}
>>>>      		}
>>>>      	}
>>>>      
>>>> @@ -3541,19 +3579,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>>      	/* enable clockgating, etc. after ib tests, etc. since some blocks require
>>>>      	 * explicit gating rather than handling it automatically.
>>>>      	 */
>>>> -	r = amdgpu_device_ip_late_init(adev);
>>>> -	if (r) {
>>>> -		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>>>> -		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>>>> -		goto failed;
>>>> +	if (!adev->gmc.xgmi.pending_reset) {
>>>> +		r = amdgpu_device_ip_late_init(adev);
>>>> +		if (r) {
>>>> +			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>>>> +			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>>>> +			goto failed;
>>>> +		}
>>>> +		/* must succeed. */
>>>> +		amdgpu_ras_resume(adev);
>>>> +		queue_delayed_work(system_wq, &adev->delayed_init_work,
>>>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>>>      	}
>>>>      
>>>> -	/* must succeed. */
>>>> -	amdgpu_ras_resume(adev);
>>>> -
>>>> -	queue_delayed_work(system_wq, &adev->delayed_init_work,
>>>> -			   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>>> -
>>>>      	if (amdgpu_sriov_vf(adev))
>>>>      		flush_delayed_work(&adev->delayed_init_work);
>>>>      
>>>> @@ -3570,6 +3608,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>>      	if (amdgpu_device_cache_pci_state(adev->pdev))
>>>>      		pci_restore_state(pdev);
>>>>      
>>>> +	if (adev->gmc.xgmi.pending_reset)
>>>> +		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
>>>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>>> +
>>>>      	return 0;
>>>>      
>>>>      failed:
>>>> @@ -4240,14 +4282,16 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
>>>>      }
>>>>      
>>>>      
>>>> -static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>> -					struct amdgpu_job *job,
>>>> -					bool *need_full_reset_arg)
>>>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>> +				  struct amdgpu_job *job,
>>>> +				  bool *need_full_reset_arg)
>>>>      {
>>>>      	int i, r = 0;
>>>>      	bool need_full_reset  = *need_full_reset_arg;
>>>>      
>>>> -	amdgpu_debugfs_wait_dump(adev);
>>>> +	/* no need to dump if device is not in good state during probe period */
>>>> +	if (!adev->gmc.xgmi.pending_reset)
>>>> +		amdgpu_debugfs_wait_dump(adev);
>>>>      
>>>>      	if (amdgpu_sriov_vf(adev)) {
>>>>      		/* stop the data exchange thread */ @@ -4293,10 +4337,10 @@ 
>>>> static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>      	return r;
>>>>      }
>>>>      
>>>> -static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>> -			       struct list_head *device_list_handle,
>>>> -			       bool *need_full_reset_arg,
>>>> -			       bool skip_hw_reset)
>>>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>> +			  struct list_head *device_list_handle,
>>>> +			  bool *need_full_reset_arg,
>>>> +			  bool skip_hw_reset)
>>>>      {
>>>>      	struct amdgpu_device *tmp_adev = NULL;
>>>>      	bool need_full_reset = *need_full_reset_arg, vram_lost = 
>>>> false; @@
>>>> -4310,6 +4354,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>      		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>>>      			/* For XGMI run all resets in parallel to speed up the process */
>>>>      			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
>>>> +				tmp_adev->gmc.xgmi.pending_reset = false;
>>>>      				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>>>      					r = -EALREADY;
>>>>      			} else
>>>> @@ -4348,10 +4393,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>      	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>>>      		if (need_full_reset) {
>>>>      			/* post card */
>>>> -			if (amdgpu_device_asic_init(tmp_adev))
>>>> +			r = amdgpu_device_asic_init(tmp_adev);
>>>> +			if (r) {
>>>>      				dev_warn(tmp_adev->dev, "asic atom init failed!");
>>>> -
>>>> -			if (!r) {
>>>> +			} else {
>>>>      				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
>>>>      				r = amdgpu_device_ip_resume_phase1(tmp_adev);
>>>>      				if (r)
>>>> @@ -4384,6 +4429,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>      				 */
>>>>      				amdgpu_register_gpu_instance(tmp_adev);
>>>>      
>>>> +				if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
>>>> +					amdgpu_xgmi_add_device(tmp_adev);
>>>> +
>>>>      				r = amdgpu_device_ip_late_init(tmp_adev);
>>>>      				if (r)
>>>>      					goto out;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> index 253c59e0a100..aebe4bc561ee 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> @@ -44,6 +44,7 @@
>>>>      #include "amdgpu_amdkfd.h"
>>>>      
>>>>      #include "amdgpu_ras.h"
>>>> +#include "amdgpu_xgmi.h"
>>>>      
>>>>      /*
>>>>       * KMS wrapper.
>>>> @@ -167,8 +168,13 @@ uint amdgpu_freesync_vid_mode;
>>>>      int amdgpu_reset_method = -1; /* auto */
>>>>      int amdgpu_num_kcq = -1;
>>>>      
>>>> +static void amdgpu_drv_delayed_reset_work_handler(struct
>>>> +work_struct *work);
>>>> +
>>>>      struct amdgpu_mgpu_info mgpu_info = {
>>>>      	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
>>>> +	.delayed_reset_work = __DELAYED_WORK_INITIALIZER(
>>>> +			mgpu_info.delayed_reset_work,
>>>> +			amdgpu_drv_delayed_reset_work_handler, 0),
>>>>      };
>>>>      int amdgpu_ras_enable = -1;
>>>>      uint amdgpu_ras_mask = 0xffffffff; @@ -1297,6 +1303,71 @@ 
>>>> amdgpu_pci_shutdown(struct pci_dev *pdev)
>>>>      	adev->mp1_state = PP_MP1_STATE_NONE;
>>>>      }
>>>>      
>>>> +/**
>>>> + * amdgpu_drv_delayed_reset_work_handler - work handler for reset
>>>> + *
>>>> + * @work: work_struct.
>>>> + */
>>>> +static void amdgpu_drv_delayed_reset_work_handler(struct
>>>> +work_struct
>>>> +*work) {
>>>> +	struct list_head device_list;
>>>> +	struct amdgpu_device *adev;
>>>> +	int i, r;
>>>> +	bool need_full_reset = true;
>>>> +
>>>> +	mutex_lock(&mgpu_info.mutex);
>>>> +	if (mgpu_info.pending_reset == true) {
>>>> +		mutex_unlock(&mgpu_info.mutex);
>>>> +		return;
>>>> +	}
>>>> +	mgpu_info.pending_reset = true;
>>>> +	mutex_unlock(&mgpu_info.mutex);
>>>> +
>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>>> +		r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
>>>
>>> Why amdgpu_device_pre_asic_reset is needed ?
>>>
>>>> +		if (r) {
>>>> +			dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
>>>> +				r, adev_to_drm(adev)->unique);
>>>> +		}
>>>> +		if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
>>>> +			r = -EALREADY;
>>>
>>> amdgpu_do_asic_reset below will already schedule xgmi_reset_work for this device; what you could do instead is call amdgpu_do_asic_reset for each member of the hive, and because there is a task barrier in amdgpu_device_xgmi_reset_func, it will already synchronize all the resets to the same point in time.
>>>
>>>> +	}
>>>> +	msleep(100);
>>>
>>> What is the 100ms waiting for?
>>>
>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>>> +		adev->gmc.xgmi.pending_reset = false;
>>>> +		flush_work(&adev->xgmi_reset_work);
>>>> +	}
>>>> +
>>>> +	msleep(100);
>>>
>>> Same as above
>>>
>>>> +	/* reset function will rebuild the xgmi hive info , clear it now */
>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>>>> +		amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
>>>> +
>>>> +	INIT_LIST_HEAD(&device_list);
>>>> +
>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>>>> +		list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list,
>>>> +&device_list);
>>>> +
>>>> +	/* unregister the GPU first, reset function will add them back */
>>>> +	list_for_each_entry(adev, &device_list, reset_list)
>>>> +		amdgpu_unregister_gpu_instance(adev);
>>>> +
>>>> +	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
>>>> +	if (r) {
>>>> +		DRM_ERROR("reinit gpus failure");
>>>> +		return;
>>>> +	}
>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>>> +		if (!adev->kfd.init_complete)
>>>> +			amdgpu_amdkfd_device_init(adev);
>>>> +		amdgpu_ttm_set_buffer_funcs_status(adev, true);
>>>> +	}
>>>> +	return;
>>>> +}
>>>> +
>>>>      static int amdgpu_pmops_suspend(struct device *dev)
>>>>      {
>>>>      	struct drm_device *drm_dev = dev_get_drvdata(dev); diff --git 
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> index aa0c83776ce0..8c71d84a2fbe 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>> @@ -149,6 +149,7 @@ struct amdgpu_xgmi {
>>>>      	struct list_head head;
>>>>      	bool supported;
>>>>      	struct ras_common_if *ras_if;
>>>> +	bool pending_reset;
>>>>      };
>>>>      
>>>>      struct amdgpu_gmc {
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>>> index 659b385b27b5..b459ef755ea9 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>>> @@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>>>      	if (!adev->gmc.xgmi.supported)
>>>>      		return 0;
>>>>      
>>>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>> +	if (!adev->gmc.xgmi.pending_reset &&
>>>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>>      		ret = psp_xgmi_initialize(&adev->psp);
>>>>      		if (ret) {
>>>>      			dev_err(adev->dev,
>>>> @@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device
>>>> *adev)
>>>>      
>>>>      	task_barrier_add_task(&hive->tb);
>>>>      
>>>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>> +	if (!adev->gmc.xgmi.pending_reset &&
>>>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>>      		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>>>>      			/* update node list for other device in the hive */
>>>>      			if (tmp_adev != adev) {
>>>> @@ -567,7 +569,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>>>      		}
>>>>      	}
>>>>      
>>>> -	if (!ret)
>>>> +	if (!ret && !adev->gmc.xgmi.pending_reset)
>>>>      		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
>>>>      
>>>>      exit_unlock:
>>>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>> t 
>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CS
>> h
>> aoyun.Liu%40amd.com%7C73608bda6abd48a3047608d8e0e02e4f%7C3dd8961fe488
>> 4 
>> e608e11a82d994e183d%7C0%7C0%7C637506600750277165%7CUnknown%7CTWFpbGZs
>> b 
>> 3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D
>> % 
>> 7C1000&amp;sdata=2qASlLYQ08twP0Ud5EcisCsVlJ9WG0QJv5idbuhDe4o%3D&amp;r
>> e
>> served=0
>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe
  2021-03-08 16:10                 ` Liu, Shaoyun
@ 2021-03-08 17:26                   ` Andrey Grodzovsky
  0 siblings, 0 replies; 12+ messages in thread
From: Andrey Grodzovsky @ 2021-03-08 17:26 UTC (permalink / raw)
  To: Liu, Shaoyun, amd-gfx

Sure, patch 4 Reviewed-by: Andrey Grodzovsky andrey.grodzovsky@amd.com 
and patch 5 Acked-by: Andrey Grodzovsky andrey.grodzovsky@amd.com since 
I am not sure about the KFD bits.

Andrey

On 2021-03-08 11:10 a.m., Liu, Shaoyun wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> Hi, Andrey.
> The first 3 patches in this serial already been acked by Alex. D, can you help review the rest two ?
> 
> Thanks
> Shaoyun.liu
> 
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Monday, March 8, 2021 10:53 AM
> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe
> 
> I see, thanks for explaning.
> 
> Andrey
> 
> On 2021-03-08 10:27 a.m., Liu, Shaoyun wrote:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> Check the function amdgpu_xgmi_add_device: when the PSP XGMI TA is not available, the driver will assign a fake hive ID 0x10 for all GPUs, which means all GPUs will belong to one same hive. So I can still use hive->tb to sync the reset on all GPUs. The reason I cannot use the default amdgpu_do_asic_reset function is that we want to build the correct hive and node topology for all GPUs after reset, so we need to call amdgpu_xgmi_add_device inside the amdgpu_do_asic_reset function. To make this work, we need to destroy the hive by removing the devices (call amdgpu_xgmi_remove_device) first, so when calling amdgpu_do_asic_reset, the fake hive (0x10) has already been destroyed, and the hive->tb will not work in this case. That's the reason I need to call the reset explicitly with the fake hive and then destroy the hive, then build the device_list for amdgpu_do_asic_reset without the hive.
>> Hope I explain it clearly .
>>
>> Thanks
>> Shaoyun.liu
>>
>> -----Original Message-----
>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> Sent: Monday, March 8, 2021 1:28 AM
>> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI
>> hive duirng probe
>>
>> But the hive->tb object is used regardless, inside amdgpu_device_xgmi_reset_func currently; it means that even when you explicitly schedule xgmi_reset_work as you do now, the code will try to sync using a not-well-initialized tb object. Maybe you can define a global static tb object, fill it in the loop where you send xgmi_reset_work for all devices in the system, and use it from within amdgpu_device_xgmi_reset_func instead of the regular per-hive tb object (obviously only under your special use case).
>>
>> Andrey
>>
>> On 2021-03-06 4:11 p.m., Liu, Shaoyun wrote:
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>> It seems I cannot directly reuse the reset HW function inside amdgpu_do_asic_reset; the synchronization is based on hive->tb, but as explained, we actually don't know which hive the GPU belongs to and will rebuild the correct hive info inside the amdgpu_do_asic_reset function with amdgpu_xgmi_add_device. So I need to remove all GPUs from the hive first. This will lead to the sync not working, since the hive->tb will be removed as well when all GPUs are removed.
>>>
>>> Thanks
>>> shaopyunliu
>>>
>>> -----Original Message-----
>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>> Liu, Shaoyun
>>> Sent: Saturday, March 6, 2021 3:41 PM
>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>;
>>> amd-gfx@lists.freedesktop.org
>>> Subject: RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI
>>> hive duirng probe
>>>
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>> I call amdgpu_do_asic_reset with the parameter skip_hw_reset = true so the reset won't be executed twice. But probably I can set this parameter to true and remove the code that schedules the reset, since now I already build the device_list not based on the hive. Let me try that.
>>> For the delayed work thread scheduled with AMDGPU_RESUME_MS: it's actually not waiting for the SMU to start. As I explained, I need to reset all the GPUs in the system since I don't know which GPUs belong to which hive. So this time allows the system to probe all the GPUs in the system, which means that when this delayed thread starts, we can assume all the devices have already been populated in mgpu_info.
>>>
>>> Regards
>>> Shaoyun.liu
>>>
>>> -----Original Message-----
>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>> Sent: Saturday, March 6, 2021 1:09 AM
>>> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI
>>> hive duirng probe
>>>
>>> Thanks for explaining this. One thing I still don't understand is why you schedule the reset work explicitly in the beginning of amdgpu_drv_delayed_reset_work_handler and then also call amdgpu_do_asic_reset, which will do the same thing too. It looks like the physical reset will execute twice for each device.
>>> Another thing, more of an improvement suggestion - currently you schedule delayed_reset_work using AMDGPU_RESUME_MS - so I guess this should give enough time for the SMU to start? Is there maybe a way to instead poll for SMU start completion and then execute this - some SMU status registers maybe? Just to avoid relying on this arbitrary value.
>>>
>>> Andrey
>>>
>>> On 2021-03-05 8:37 p.m., Liu, Shaoyun wrote:
>>>> [AMD Official Use Only - Internal Distribution Only]
>>>>
>>>> Hi,  Andrey
>>>> The existing reset functions (amdgpu_device_gpu_recover or amdgpu_do_asic_reset) assumed the driver already has the correct hive info. But in my case, that's not true. The GPUs are in a bad state and the XGMI TA might not function properly, so the driver cannot get the hive and node info when probing the device. It means the driver doesn't even know which hive the device belongs to on a system with a multiple-hive configuration (e.g., 8 GPUs in two hives). The only solution I can think of is to let the driver trigger the reset on all GPUs at the same time after the driver does the minimum initialization on the HW (bring up the SMU IP), no matter whether they belong to the same hive or not, and call amdgpu_xgmi_add_device for each device after re-init.
>>>> The 100 ms delays were added after the BACO reset. I think they can be removed; let me verify it.
>>>>
>>>> Regards
>>>> Shaoyun.liu
>>>>
>>>>
>>>>
>>>> -----Original Message-----
>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>> Sent: Friday, March 5, 2021 2:27 PM
>>>> To: Liu, Shaoyun <Shaoyun.Liu@amd.com>;
>>>> amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI
>>>> hive duirng probe
>>>>
>>>>
>>>>
>>>> On 2021-03-05 12:52 p.m., shaoyunl wrote:
>>>>> In passthrough configuration, hypervisior will trigger the
>>>>> SBR(Secondary bus reset) to the devices without sync to each other.
>>>>> This could cause device hang since for XGMI configuration, all the
>>>>> devices within the hive need to be reset at a limit time slot. This
>>>>> serial of patches try to solve this issue by co-operate with new
>>>>> SMU which will only do minimum house keeping to response the SBR
>>>>> request but don't do the real reset job and leave it to driver.
>>>>> Driver need to do the whole sw init and minimum HW init to bring up
>>>>> the SMU and trigger the reset(possibly BACO) on all the ASICs at
>>>>> the same time
>>>>>
>>>>> Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
>>>>> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
>>>>> ---
>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  13 +++
>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++------
>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  71 ++++++++++++++
>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |   1 +
>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
>>>>>       5 files changed, 165 insertions(+), 30 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> index d46d3794699e..5602c6edee97 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> @@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
>>>>>       	uint32_t			num_gpu;
>>>>>       	uint32_t			num_dgpu;
>>>>>       	uint32_t			num_apu;
>>>>> +
>>>>> +	/* delayed reset_func for XGMI configuration if necessary */
>>>>> +	struct delayed_work		delayed_reset_work;
>>>>> +	bool				pending_reset;
>>>>>       };
>>>>>       
>>>>>       #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
>>>>> @@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>>>>>       bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
>>>>>       bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
>>>>>       
>>>>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>> +				  struct amdgpu_job *job,
>>>>> +				  bool *need_full_reset_arg);
>>>>> +
>>>>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>> +			  struct list_head *device_list_handle,
>>>>> +			  bool *need_full_reset_arg,
>>>>> +			  bool skip_hw_reset);
>>>>> +
>>>>>       int emu_soc_asic_init(struct amdgpu_device *adev);
>>>>>       
>>>>>       /*
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index 3c35b0c1e710..5b520f70e660 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
>>>>>       		}
>>>>>       	}
>>>>>       
>>>>> +	/* Don't post if we need to reset whole hive on init */
>>>>> +	if (adev->gmc.xgmi.pending_reset)
>>>>> +		return false;
>>>>> +
>>>>>       	if (adev->has_hw_reset) {
>>>>>       		adev->has_hw_reset = false;
>>>>>       		return true;
>>>>> @@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>>>>>       			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
>>>>>       				continue;
>>>>>       
>>>>> +			if (!adev->ip_blocks[i].status.sw)
>>>>> +				continue;
>>>>> +
>>>>>       			/* no need to do the fw loading again if already done*/
>>>>>       			if (adev->ip_blocks[i].status.hw == true)
>>>>>       				break;
>>>>> @@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct
>>>>> amdgpu_device *adev)
>>>>>       
>>>>>       	if (adev->gmc.xgmi.num_physical_nodes > 1)
>>>>>       		amdgpu_xgmi_add_device(adev);
>>>>> -	amdgpu_amdkfd_device_init(adev);
>>>>> +
>>>>> +	/* Don't init kfd if whole hive need to be reset during init */
>>>>> +	if (!adev->gmc.xgmi.pending_reset)
>>>>> +		amdgpu_amdkfd_device_init(adev);
>>>>>       
>>>>>       	amdgpu_fru_get_product_info(adev);
>>>>>       
>>>>> @@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
>>>>>       			adev->ip_blocks[i].status.hw = false;
>>>>>       			continue;
>>>>>       		}
>>>>> +
>>>>> +		/* skip unnecessary suspend if we do not initialize them yet */
>>>>> +		if (adev->gmc.xgmi.pending_reset &&
>>>>> +		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>>>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
>>>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>>>>> +		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
>>>>> +			adev->ip_blocks[i].status.hw = false;
>>>>> +			continue;
>>>>> +		}
>>>>>       		/* XXX handle errors */
>>>>>       		r = adev->ip_blocks[i].version->funcs->suspend(adev);
>>>>>       		/* XXX handle errors */
>>>>> @@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>>>       	 *  E.g., driver was not cleanly unloaded previously, etc.
>>>>>       	 */
>>>>>       	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
>>>>> -		r = amdgpu_asic_reset(adev);
>>>>> -		if (r) {
>>>>> -			dev_err(adev->dev, "asic reset on init failed\n");
>>>>> -			goto failed;
>>>>> +		if (adev->gmc.xgmi.num_physical_nodes) {
>>>>> +			dev_info(adev->dev, "Pending hive reset.\n");
>>>>> +			adev->gmc.xgmi.pending_reset = true;
>>>>> +			/* Only need to init necessary block for SMU to handle the reset */
>>>>> +			for (i = 0; i < adev->num_ip_blocks; i++) {
>>>>> +				if (!adev->ip_blocks[i].status.valid)
>>>>> +					continue;
>>>>> +				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>>>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
>>>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
>>>>> +				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
>>>>> +					DRM_DEBUG("IP %s disabed for hw_init.\n",
>>>>> +						adev->ip_blocks[i].version->funcs->name);
>>>>> +					adev->ip_blocks[i].status.hw = true;
>>>>> +				}
>>>>> +			}
>>>>> +		} else {
>>>>> +			r = amdgpu_asic_reset(adev);
>>>>> +			if (r) {
>>>>> +				dev_err(adev->dev, "asic reset on init failed\n");
>>>>> +				goto failed;
>>>>> +			}
>>>>>       		}
>>>>>       	}
>>>>>       
>>>>> @@ -3541,19 +3579,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>>>       	/* enable clockgating, etc. after ib tests, etc. since some blocks require
>>>>>       	 * explicit gating rather than handling it automatically.
>>>>>       	 */
>>>>> -	r = amdgpu_device_ip_late_init(adev);
>>>>> -	if (r) {
>>>>> -		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>>>>> -		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>>>>> -		goto failed;
>>>>> +	if (!adev->gmc.xgmi.pending_reset) {
>>>>> +		r = amdgpu_device_ip_late_init(adev);
>>>>> +		if (r) {
>>>>> +			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
>>>>> +			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
>>>>> +			goto failed;
>>>>> +		}
>>>>> +		/* must succeed. */
>>>>> +		amdgpu_ras_resume(adev);
>>>>> +		queue_delayed_work(system_wq, &adev->delayed_init_work,
>>>>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>>>>       	}
>>>>>       
>>>>> -	/* must succeed. */
>>>>> -	amdgpu_ras_resume(adev);
>>>>> -
>>>>> -	queue_delayed_work(system_wq, &adev->delayed_init_work,
>>>>> -			   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>>>> -
>>>>>       	if (amdgpu_sriov_vf(adev))
>>>>>       		flush_delayed_work(&adev->delayed_init_work);
>>>>>       
>>>>> @@ -3570,6 +3608,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>>>       	if (amdgpu_device_cache_pci_state(adev->pdev))
>>>>>       		pci_restore_state(pdev);
>>>>>       
>>>>> +	if (adev->gmc.xgmi.pending_reset)
>>>>> +		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
>>>>> +				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>>>>> +
>>>>>       	return 0;
>>>>>       
>>>>>       failed:
>>>>> @@ -4240,14 +4282,16 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
>>>>>       }
>>>>>       
>>>>>       
>>>>> -static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>> -					struct amdgpu_job *job,
>>>>> -					bool *need_full_reset_arg)
>>>>> +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>> +				  struct amdgpu_job *job,
>>>>> +				  bool *need_full_reset_arg)
>>>>>       {
>>>>>       	int i, r = 0;
>>>>>       	bool need_full_reset  = *need_full_reset_arg;
>>>>>       
>>>>> -	amdgpu_debugfs_wait_dump(adev);
>>>>> +	/* no need to dump if device is not in good state during probe period */
>>>>> +	if (!adev->gmc.xgmi.pending_reset)
>>>>> +		amdgpu_debugfs_wait_dump(adev);
>>>>>       
>>>>>       	if (amdgpu_sriov_vf(adev)) {
>>>>>       		/* stop the data exchange thread */ @@ -4293,10 +4337,10 @@
>>>>> static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>>       	return r;
>>>>>       }
>>>>>       
>>>>> -static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>> -			       struct list_head *device_list_handle,
>>>>> -			       bool *need_full_reset_arg,
>>>>> -			       bool skip_hw_reset)
>>>>> +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>> +			  struct list_head *device_list_handle,
>>>>> +			  bool *need_full_reset_arg,
>>>>> +			  bool skip_hw_reset)
>>>>>       {
>>>>>       	struct amdgpu_device *tmp_adev = NULL;
>>>>>       	bool need_full_reset = *need_full_reset_arg, vram_lost =
>>>>> false; @@
>>>>> -4310,6 +4354,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>>       		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>>>>       			/* For XGMI run all resets in parallel to speed up the process */
>>>>>       			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
>>>>> +				tmp_adev->gmc.xgmi.pending_reset = false;
>>>>>       				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>>>>       					r = -EALREADY;
>>>>>       			} else
>>>>> @@ -4348,10 +4393,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>>       	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>>>>>       		if (need_full_reset) {
>>>>>       			/* post card */
>>>>> -			if (amdgpu_device_asic_init(tmp_adev))
>>>>> +			r = amdgpu_device_asic_init(tmp_adev);
>>>>> +			if (r) {
>>>>>       				dev_warn(tmp_adev->dev, "asic atom init failed!");
>>>>> -
>>>>> -			if (!r) {
>>>>> +			} else {
>>>>>       				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
>>>>>       				r = amdgpu_device_ip_resume_phase1(tmp_adev);
>>>>>       				if (r)
>>>>> @@ -4384,6 +4429,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>>       				 */
>>>>>       				amdgpu_register_gpu_instance(tmp_adev);
>>>>>       
>>>>> +				if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
>>>>> +					amdgpu_xgmi_add_device(tmp_adev);
>>>>> +
>>>>>       				r = amdgpu_device_ip_late_init(tmp_adev);
>>>>>       				if (r)
>>>>>       					goto out;
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> index 253c59e0a100..aebe4bc561ee 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> @@ -44,6 +44,7 @@
>>>>>       #include "amdgpu_amdkfd.h"
>>>>>       
>>>>>       #include "amdgpu_ras.h"
>>>>> +#include "amdgpu_xgmi.h"
>>>>>       
>>>>>       /*
>>>>>        * KMS wrapper.
>>>>> @@ -167,8 +168,13 @@ uint amdgpu_freesync_vid_mode;
>>>>>       int amdgpu_reset_method = -1; /* auto */
>>>>>       int amdgpu_num_kcq = -1;
>>>>>       
>>>>> +static void amdgpu_drv_delayed_reset_work_handler(struct
>>>>> +work_struct *work);
>>>>> +
>>>>>       struct amdgpu_mgpu_info mgpu_info = {
>>>>>       	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
>>>>> +	.delayed_reset_work = __DELAYED_WORK_INITIALIZER(
>>>>> +			mgpu_info.delayed_reset_work,
>>>>> +			amdgpu_drv_delayed_reset_work_handler, 0),
>>>>>       };
>>>>>       int amdgpu_ras_enable = -1;
>>>>>       uint amdgpu_ras_mask = 0xffffffff; @@ -1297,6 +1303,71 @@
>>>>> amdgpu_pci_shutdown(struct pci_dev *pdev)
>>>>>       	adev->mp1_state = PP_MP1_STATE_NONE;
>>>>>       }
>>>>>       
>>>>> +/**
>>>>> + * amdgpu_drv_delayed_reset_work_handler - work handler for reset
>>>>> + *
>>>>> + * @work: work_struct.
>>>>> + */
>>>>> +static void amdgpu_drv_delayed_reset_work_handler(struct
>>>>> +work_struct
>>>>> +*work) {
>>>>> +	struct list_head device_list;
>>>>> +	struct amdgpu_device *adev;
>>>>> +	int i, r;
>>>>> +	bool need_full_reset = true;
>>>>> +
>>>>> +	mutex_lock(&mgpu_info.mutex);
>>>>> +	if (mgpu_info.pending_reset == true) {
>>>>> +		mutex_unlock(&mgpu_info.mutex);
>>>>> +		return;
>>>>> +	}
>>>>> +	mgpu_info.pending_reset = true;
>>>>> +	mutex_unlock(&mgpu_info.mutex);
>>>>> +
>>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>>>> +		r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
>>>>
>>>> Why amdgpu_device_pre_asic_reset is needed ?
>>>>
>>>>> +		if (r) {
>>>>> +			dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
>>>>> +				r, adev_to_drm(adev)->unique);
>>>>> +		}
>>>>> +		if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
>>>>> +			r = -EALREADY;
>>>>
>>>> amdgpu_do_asic_reset below will already schedule xgmi_reset_work for this device. What you could do instead is call amdgpu_do_asic_reset for each member of the hive, and because there is a task barrier in amdgpu_device_xgmi_reset_func, it will already synchronize all the resets to the same point in time.
>>>>
>>>>> +	}
>>>>> +	msleep(100);
>>>>
>>>> What's the 100 ms waiting for?
>>>>
>>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>>>> +		adev->gmc.xgmi.pending_reset = false;
>>>>> +		flush_work(&adev->xgmi_reset_work);
>>>>> +	}
>>>>> +
>>>>> +	msleep(100);
>>>>
>>>> Same as above
>>>>
>>>>> +	/* reset function will rebuild the xgmi hive info , clear it now */
>>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>>>>> +		amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
>>>>> +
>>>>> +	INIT_LIST_HEAD(&device_list);
>>>>> +
>>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++)
>>>>> +		list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list,
>>>>> +&device_list);
>>>>> +
>>>>> +	/* unregister the GPU first, reset function will add them back */
>>>>> +	list_for_each_entry(adev, &device_list, reset_list)
>>>>> +		amdgpu_unregister_gpu_instance(adev);
>>>>> +
>>>>> +	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
>>>>> +	if (r) {
>>>>> +		DRM_ERROR("reinit gpus failure");
>>>>> +		return;
>>>>> +	}
>>>>> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
>>>>> +		adev = mgpu_info.gpu_ins[i].adev;
>>>>> +		if (!adev->kfd.init_complete)
>>>>> +			amdgpu_amdkfd_device_init(adev);
>>>>> +		amdgpu_ttm_set_buffer_funcs_status(adev, true);
>>>>> +	}
>>>>> +	return;
>>>>> +}
>>>>> +
>>>>>       static int amdgpu_pmops_suspend(struct device *dev)
>>>>>       {
>>>>>       	struct drm_device *drm_dev = dev_get_drvdata(dev); diff --git
>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>>> index aa0c83776ce0..8c71d84a2fbe 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>>> @@ -149,6 +149,7 @@ struct amdgpu_xgmi {
>>>>>       	struct list_head head;
>>>>>       	bool supported;
>>>>>       	struct ras_common_if *ras_if;
>>>>> +	bool pending_reset;
>>>>>       };
>>>>>       
>>>>>       struct amdgpu_gmc {
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>>>> index 659b385b27b5..b459ef755ea9 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>>>> @@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>>>>       	if (!adev->gmc.xgmi.supported)
>>>>>       		return 0;
>>>>>       
>>>>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>>> +	if (!adev->gmc.xgmi.pending_reset &&
>>>>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>>>       		ret = psp_xgmi_initialize(&adev->psp);
>>>>>       		if (ret) {
>>>>>       			dev_err(adev->dev,
>>>>> @@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device
>>>>> *adev)
>>>>>       
>>>>>       	task_barrier_add_task(&hive->tb);
>>>>>       
>>>>> -	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>>> +	if (!adev->gmc.xgmi.pending_reset &&
>>>>> +	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>>>>>       		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>>>>>       			/* update node list for other device in the hive */
>>>>>       			if (tmp_adev != adev) {
>>>>> @@ -567,7 +569,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>>>>       		}
>>>>>       	}
>>>>>       
>>>>> -	if (!ret)
>>>>> +	if (!ret && !adev->gmc.xgmi.pending_reset)
>>>>>       		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
>>>>>       
>>>>>       exit_unlock:
>>>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>>> t
>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CS
>>> h
>>> aoyun.Liu%40amd.com%7C73608bda6abd48a3047608d8e0e02e4f%7C3dd8961fe488
>>> 4
>>> e608e11a82d994e183d%7C0%7C0%7C637506600750277165%7CUnknown%7CTWFpbGZs
>>> b
>>> 3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D
>>> %
>>> 7C1000&amp;sdata=2qASlLYQ08twP0Ud5EcisCsVlJ9WG0QJv5idbuhDe4o%3D&amp;r
>>> e
>>> served=0
>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe
@ 2021-03-06 21:20 shaoyunl
  0 siblings, 0 replies; 12+ messages in thread
From: shaoyunl @ 2021-03-06 21:20 UTC (permalink / raw)
  To: amd-gfx; +Cc: shaoyunl

In a passthrough configuration, the hypervisor will trigger the SBR (Secondary Bus Reset) to the devices
without syncing with each other. This could cause a device hang since, for an XGMI configuration, all the devices
within the hive need to be reset within a limited time slot. This series of patches tries to solve the issue
by cooperating with the new SMU, which will only do minimum housekeeping to respond to the SBR request but won't
do the real reset job, leaving that to the driver. The driver needs to do the whole SW init and minimum HW init
to bring up the SMU and trigger the reset (possibly BACO) on all the ASICs at the same time

Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  13 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  69 ++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
 5 files changed, 163 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d46d3794699e..5602c6edee97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
 	uint32_t			num_gpu;
 	uint32_t			num_dgpu;
 	uint32_t			num_apu;
+
+	/* delayed reset_func for XGMI configuration if necessary */
+	struct delayed_work		delayed_reset_work;
+	bool				pending_reset;
 };
 
 #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
@@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
 
+int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+				  struct amdgpu_job *job,
+				  bool *need_full_reset_arg);
+
+int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+			  struct list_head *device_list_handle,
+			  bool *need_full_reset_arg,
+			  bool skip_hw_reset);
+
 int emu_soc_asic_init(struct amdgpu_device *adev);
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3c35b0c1e710..5b520f70e660 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
 		}
 	}
 
+	/* Don't post if we need to reset whole hive on init */
+	if (adev->gmc.xgmi.pending_reset)
+		return false;
+
 	if (adev->has_hw_reset) {
 		adev->has_hw_reset = false;
 		return true;
@@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
 				continue;
 
+			if (!adev->ip_blocks[i].status.sw)
+				continue;
+
 			/* no need to do the fw loading again if already done*/
 			if (adev->ip_blocks[i].status.hw == true)
 				break;
@@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 
 	if (adev->gmc.xgmi.num_physical_nodes > 1)
 		amdgpu_xgmi_add_device(adev);
-	amdgpu_amdkfd_device_init(adev);
+
+	/* Don't init kfd if whole hive need to be reset during init */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_amdkfd_device_init(adev);
 
 	amdgpu_fru_get_product_info(adev);
 
@@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
 			adev->ip_blocks[i].status.hw = false;
 			continue;
 		}
+
+		/* skip unnecessary suspend if we do not initialize them yet */
+		if (adev->gmc.xgmi.pending_reset &&
+		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+			adev->ip_blocks[i].status.hw = false;
+			continue;
+		}
 		/* XXX handle errors */
 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
 		/* XXX handle errors */
@@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	 *  E.g., driver was not cleanly unloaded previously, etc.
 	 */
 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
-		r = amdgpu_asic_reset(adev);
-		if (r) {
-			dev_err(adev->dev, "asic reset on init failed\n");
-			goto failed;
+		if (adev->gmc.xgmi.num_physical_nodes) {
+			dev_info(adev->dev, "Pending hive reset.\n");
+			adev->gmc.xgmi.pending_reset = true;
+			/* Only need to init necessary block for SMU to handle the reset */
+			for (i = 0; i < adev->num_ip_blocks; i++) {
+				if (!adev->ip_blocks[i].status.valid)
+					continue;
+				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
+					DRM_DEBUG("IP %s disabed for hw_init.\n",
+						adev->ip_blocks[i].version->funcs->name);
+					adev->ip_blocks[i].status.hw = true;
+				}
+			}
+		} else {
+			r = amdgpu_asic_reset(adev);
+			if (r) {
+				dev_err(adev->dev, "asic reset on init failed\n");
+				goto failed;
+			}
 		}
 	}
 
@@ -3541,19 +3579,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
 	 * explicit gating rather than handling it automatically.
 	 */
-	r = amdgpu_device_ip_late_init(adev);
-	if (r) {
-		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
-		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
-		goto failed;
+	if (!adev->gmc.xgmi.pending_reset) {
+		r = amdgpu_device_ip_late_init(adev);
+		if (r) {
+			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
+			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
+			goto failed;
+		}
+		/* must succeed. */
+		amdgpu_ras_resume(adev);
+		queue_delayed_work(system_wq, &adev->delayed_init_work,
+				   msecs_to_jiffies(AMDGPU_RESUME_MS));
 	}
 
-	/* must succeed. */
-	amdgpu_ras_resume(adev);
-
-	queue_delayed_work(system_wq, &adev->delayed_init_work,
-			   msecs_to_jiffies(AMDGPU_RESUME_MS));
-
 	if (amdgpu_sriov_vf(adev))
 		flush_delayed_work(&adev->delayed_init_work);
 
@@ -3570,6 +3608,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	if (amdgpu_device_cache_pci_state(adev->pdev))
 		pci_restore_state(pdev);
 
+	if (adev->gmc.xgmi.pending_reset)
+		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
+				   msecs_to_jiffies(AMDGPU_RESUME_MS));
+
 	return 0;
 
 failed:
@@ -4240,14 +4282,16 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
 }
 
 
-static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
-					struct amdgpu_job *job,
-					bool *need_full_reset_arg)
+int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+				  struct amdgpu_job *job,
+				  bool *need_full_reset_arg)
 {
 	int i, r = 0;
 	bool need_full_reset  = *need_full_reset_arg;
 
-	amdgpu_debugfs_wait_dump(adev);
+	/* no need to dump if device is not in good state during probe period */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_debugfs_wait_dump(adev);
 
 	if (amdgpu_sriov_vf(adev)) {
 		/* stop the data exchange thread */
@@ -4293,10 +4337,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	return r;
 }
 
-static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
-			       struct list_head *device_list_handle,
-			       bool *need_full_reset_arg,
-			       bool skip_hw_reset)
+int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+			  struct list_head *device_list_handle,
+			  bool *need_full_reset_arg,
+			  bool skip_hw_reset)
 {
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
@@ -4310,6 +4354,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+				tmp_adev->gmc.xgmi.pending_reset = false;
 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
 			} else
@@ -4348,10 +4393,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 		if (need_full_reset) {
 			/* post card */
-			if (amdgpu_device_asic_init(tmp_adev))
+			r = amdgpu_device_asic_init(tmp_adev);
+			if (r) {
 				dev_warn(tmp_adev->dev, "asic atom init failed!");
-
-			if (!r) {
+			} else {
 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
 				if (r)
@@ -4384,6 +4429,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 				 */
 				amdgpu_register_gpu_instance(tmp_adev);
 
+				if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
+					amdgpu_xgmi_add_device(tmp_adev);
+
 				r = amdgpu_device_ip_late_init(tmp_adev);
 				if (r)
 					goto out;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 253c59e0a100..0bfc5f978d4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -44,6 +44,7 @@
 #include "amdgpu_amdkfd.h"
 
 #include "amdgpu_ras.h"
+#include "amdgpu_xgmi.h"
 
 /*
  * KMS wrapper.
@@ -167,8 +168,13 @@ uint amdgpu_freesync_vid_mode;
 int amdgpu_reset_method = -1; /* auto */
 int amdgpu_num_kcq = -1;
 
+static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
+
 struct amdgpu_mgpu_info mgpu_info = {
 	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
+	.delayed_reset_work = __DELAYED_WORK_INITIALIZER(
+			mgpu_info.delayed_reset_work,
+			amdgpu_drv_delayed_reset_work_handler, 0),
 };
 int amdgpu_ras_enable = -1;
 uint amdgpu_ras_mask = 0xffffffff;
@@ -1297,6 +1303,69 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
 	adev->mp1_state = PP_MP1_STATE_NONE;
 }
 
+/**
+ * amdgpu_drv_delayed_reset_work_handler - work handler to reset all devices in an XGMI hive
+ *
+ * @work: the work_struct embedded in mgpu_info.delayed_reset_work
+ */
+static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work)
+{
+	struct list_head device_list;
+	struct amdgpu_device *adev;
+	int i, r;
+	bool need_full_reset = true;
+
+	mutex_lock(&mgpu_info.mutex);
+	if (mgpu_info.pending_reset == true) {
+		mutex_unlock(&mgpu_info.mutex);
+		return;
+	}
+	mgpu_info.pending_reset = true;
+	mutex_unlock(&mgpu_info.mutex);
+
+	for (i = 0; i < mgpu_info.num_dgpu; i++) {
+		adev = mgpu_info.gpu_ins[i].adev;
+		r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
+		if (r) {
+			dev_err(adev->dev, "GPU pre asic reset failed with err %d for drm dev %s\n",
+				r, adev_to_drm(adev)->unique);
+		}
+		if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
+			r = -EALREADY;
+	}
+	for (i = 0; i < mgpu_info.num_dgpu; i++) {
+		adev = mgpu_info.gpu_ins[i].adev;
+		adev->gmc.xgmi.pending_reset = false;
+		flush_work(&adev->xgmi_reset_work);
+	}
+
+	/* The reset function will rebuild the XGMI hive info; clear it now */
+	for (i = 0; i < mgpu_info.num_dgpu; i++)
+		amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
+
+	INIT_LIST_HEAD(&device_list);
+
+	for (i = 0; i < mgpu_info.num_dgpu; i++)
+		list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list, &device_list);
+
+	/* unregister the GPU first, reset function will add them back */
+	list_for_each_entry(adev, &device_list, reset_list)
+		amdgpu_unregister_gpu_instance(adev);
+
+	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
+	if (r) {
+		DRM_ERROR("reinit gpus failure");
+		return;
+	}
+	for (i = 0; i < mgpu_info.num_dgpu; i++) {
+		adev = mgpu_info.gpu_ins[i].adev;
+		if (!adev->kfd.init_complete)
+			amdgpu_amdkfd_device_init(adev);
+		amdgpu_ttm_set_buffer_funcs_status(adev, true);
+	}
+	return;
+}
+
 static int amdgpu_pmops_suspend(struct device *dev)
 {
 	struct drm_device *drm_dev = dev_get_drvdata(dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index aa0c83776ce0..8c71d84a2fbe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -149,6 +149,7 @@ struct amdgpu_xgmi {
 	struct list_head head;
 	bool supported;
 	struct ras_common_if *ras_if;
+	bool pending_reset;
 };
 
 struct amdgpu_gmc {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 659b385b27b5..b459ef755ea9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 	if (!adev->gmc.xgmi.supported)
 		return 0;
 
-	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (!adev->gmc.xgmi.pending_reset &&
+	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		ret = psp_xgmi_initialize(&adev->psp);
 		if (ret) {
 			dev_err(adev->dev,
@@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 
 	task_barrier_add_task(&hive->tb);
 
-	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (!adev->gmc.xgmi.pending_reset &&
+	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			/* update node list for other device in the hive */
 			if (tmp_adev != adev) {
@@ -567,7 +569,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		}
 	}
 
-	if (!ret)
+	if (!ret && !adev->gmc.xgmi.pending_reset)
 		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
 
 exit_unlock:
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2021-03-08 17:26 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-05 17:52 [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe shaoyunl
2021-03-05 19:27 ` Andrey Grodzovsky
2021-03-06  1:37   ` Liu, Shaoyun
2021-03-06  6:09     ` Andrey Grodzovsky
2021-03-06 20:41       ` Liu, Shaoyun
2021-03-06 21:11         ` Liu, Shaoyun
2021-03-08  6:28           ` Andrey Grodzovsky
2021-03-08 15:27             ` Liu, Shaoyun
2021-03-08 15:53               ` Andrey Grodzovsky
2021-03-08 16:10                 ` Liu, Shaoyun
2021-03-08 17:26                   ` Andrey Grodzovsky
2021-03-06 21:20 shaoyunl

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.