[PATCH 1/2] drm/amdgpu: fix possible pstate switch race condition

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/2] drm/amdgpu: fix possible pstate switch race condition
@ 2019-11-05 10:23 ` Quan, Evan
  0 siblings, 0 replies; 20+ messages in thread
From: Quan, Evan @ 2019-11-05 10:23 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Strawbridge, Michael, Kim, Jonathan, Quan, Evan

Added lock protection so that the p-state switch will
be guarded to be sequential. Also update the hive
pstate only all device from the hive are in the same
state.

Change-Id: I165a6f44e8aec1e6da56eefa0fc49d36670e56fe
Signed-off-by: Evan Quan <evan.quan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 34 ++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 0469cc51a6fb..41cf2abd6209 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1041,6 +1041,9 @@ struct amdgpu_device {
 
 	uint64_t			unique_id;
 	uint64_t	df_perfmon_config_assign_mask[AMDGPU_MAX_DF_PERFMONS];
+
+	/* device pstate */
+	int				pstate;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 167d9fbd2c4f..de20a9a1c444 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -274,12 +274,18 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
 {
 	int ret = 0;
 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+	struct amdgpu_device *tmp_adev;
+	bool update_hive_pstate = true;
 
 	if (!hive)
 		return 0;
 
-	if (hive->pstate == pstate)
+	mutex_lock(&hive->hive_lock);
+
+	if (hive->pstate == pstate) {
+		mutex_unlock(&hive->hive_lock);
 		return 0;
+	}
 
 	dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);
 
@@ -290,11 +296,32 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
 		ret = adev->powerplay.pp_funcs->set_xgmi_pstate(adev->powerplay.pp_handle,
 								pstate);
 
-	if (ret)
+	if (ret) {
 		dev_err(adev->dev,
 			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
 			adev->gmc.xgmi.node_id,
 			adev->gmc.xgmi.hive_id, ret);
+		goto out;
+	}
+
+	/* Update device pstate */
+	adev->pstate = pstate;
+
+	/*
+	 * Update the hive pstate only all devices of the hive
+	 * are in the same pstate
+	 */
+	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+		if (tmp_adev->pstate != adev->pstate) {
+			update_hive_pstate = false;
+			break;
+		}
+	}
+	if (update_hive_pstate)
+		hive->pstate = pstate;
+
+out:
+	mutex_unlock(&hive->hive_lock);
 
 	return ret;
 }
@@ -369,6 +396,9 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		goto exit;
 	}
 
+	/* Set default device pstate */
+	adev->pstate = -1;
+
 	top_info = &adev->psp.xgmi_context.top_info;
 
 	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
-- 
2.23.0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 1/2] drm/amdgpu: fix possible pstate switch race condition
@ 2019-11-05 10:23 ` Quan, Evan
  0 siblings, 0 replies; 20+ messages in thread
From: Quan, Evan @ 2019-11-05 10:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Strawbridge, Michael, Kim, Jonathan, Quan, Evan

Added lock protection so that the p-state switch will
be guarded to be sequential. Also update the hive
pstate only all device from the hive are in the same
state.

Change-Id: I165a6f44e8aec1e6da56eefa0fc49d36670e56fe
Signed-off-by: Evan Quan <evan.quan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 34 ++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 0469cc51a6fb..41cf2abd6209 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1041,6 +1041,9 @@ struct amdgpu_device {
 
 	uint64_t			unique_id;
 	uint64_t	df_perfmon_config_assign_mask[AMDGPU_MAX_DF_PERFMONS];
+
+	/* device pstate */
+	int				pstate;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 167d9fbd2c4f..de20a9a1c444 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -274,12 +274,18 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
 {
 	int ret = 0;
 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+	struct amdgpu_device *tmp_adev;
+	bool update_hive_pstate = true;
 
 	if (!hive)
 		return 0;
 
-	if (hive->pstate == pstate)
+	mutex_lock(&hive->hive_lock);
+
+	if (hive->pstate == pstate) {
+		mutex_unlock(&hive->hive_lock);
 		return 0;
+	}
 
 	dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);
 
@@ -290,11 +296,32 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
 		ret = adev->powerplay.pp_funcs->set_xgmi_pstate(adev->powerplay.pp_handle,
 								pstate);
 
-	if (ret)
+	if (ret) {
 		dev_err(adev->dev,
 			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
 			adev->gmc.xgmi.node_id,
 			adev->gmc.xgmi.hive_id, ret);
+		goto out;
+	}
+
+	/* Update device pstate */
+	adev->pstate = pstate;
+
+	/*
+	 * Update the hive pstate only all devices of the hive
+	 * are in the same pstate
+	 */
+	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+		if (tmp_adev->pstate != adev->pstate) {
+			update_hive_pstate = false;
+			break;
+		}
+	}
+	if (update_hive_pstate)
+		hive->pstate = pstate;
+
+out:
+	mutex_unlock(&hive->hive_lock);
 
 	return ret;
 }
@@ -369,6 +396,9 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		goto exit;
 	}
 
+	/* Set default device pstate */
+	adev->pstate = -1;
+
 	top_info = &adev->psp.xgmi_context.top_info;
 
 	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
-- 
2.23.0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-05 10:23     ` Quan, Evan
  0 siblings, 0 replies; 20+ messages in thread
From: Quan, Evan @ 2019-11-05 10:23 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Strawbridge, Michael, Kim, Jonathan, Quan, Evan

P-state switch should be performed after all devices from the hive
get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
 {
+	struct amdgpu_gpu_instance *gpu_instance;
 	int i = 0, r;
 
 	for (i = 0; i < adev->num_ip_blocks; i++) {
@@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
 	if (r)
 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
 
+
+	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+		mutex_lock(&mgpu_info.mutex);
+
+		/*
+		 * Reset device p-state to low as this was booted with high.
+		 *
+		 * This should be performed only after all devices from the same
+		 * hive get initialized.
+		 *
+		 * However, it's unknown how many device in the hive in advance.
+		 * As this is counted one by one during devices initializations.
+		 *
+		 * So, we wait for all XGMI interlinked devices initialized.
+		 * This may bring some delays as those devices may come from
+		 * different hives. But that should be OK.
+		 */
+		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
+			for (i = 0; i < mgpu_info.num_gpu; i++) {
+				gpu_instance = &(mgpu_info.gpu_ins[i]);
+				if (gpu_instance->adev->flags & AMD_IS_APU)
+					continue;
+
+				r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+				if (r) {
+					DRM_ERROR("pstate setting failed (%d).\n", r);
+					break;
+				}
+			}
+		}
+
+		mutex_unlock(&mgpu_info.mutex);
+	}
+
 	return 0;
 }
 
@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
 	r = amdgpu_ib_ring_tests(adev);
 	if (r)
 		DRM_ERROR("ib ring test failed (%d).\n", r);
-
-	/*
-	 * set to low pstate by default
-	 * This should be performed after all devices from
-	 * XGMI finish their initializations. Thus it's moved
-	 * to here.
-	 * The time delay is 2S. TODO: confirm whether that
-	 * is enough for all possible XGMI setups.
-	 */
-	r = amdgpu_xgmi_set_pstate(adev, 0);
-	if (r)
-		DRM_ERROR("pstate setting failed (%d).\n", r);
 }
 
 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
-- 
2.23.0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-05 10:23     ` Quan, Evan
  0 siblings, 0 replies; 20+ messages in thread
From: Quan, Evan @ 2019-11-05 10:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Strawbridge, Michael, Kim, Jonathan, Quan, Evan

P-state switch should be performed after all devices from the hive
get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
 {
+	struct amdgpu_gpu_instance *gpu_instance;
 	int i = 0, r;
 
 	for (i = 0; i < adev->num_ip_blocks; i++) {
@@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
 	if (r)
 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
 
+
+	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+		mutex_lock(&mgpu_info.mutex);
+
+		/*
+		 * Reset device p-state to low as this was booted with high.
+		 *
+		 * This should be performed only after all devices from the same
+		 * hive get initialized.
+		 *
+		 * However, it's unknown how many device in the hive in advance.
+		 * As this is counted one by one during devices initializations.
+		 *
+		 * So, we wait for all XGMI interlinked devices initialized.
+		 * This may bring some delays as those devices may come from
+		 * different hives. But that should be OK.
+		 */
+		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
+			for (i = 0; i < mgpu_info.num_gpu; i++) {
+				gpu_instance = &(mgpu_info.gpu_ins[i]);
+				if (gpu_instance->adev->flags & AMD_IS_APU)
+					continue;
+
+				r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+				if (r) {
+					DRM_ERROR("pstate setting failed (%d).\n", r);
+					break;
+				}
+			}
+		}
+
+		mutex_unlock(&mgpu_info.mutex);
+	}
+
 	return 0;
 }
 
@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
 	r = amdgpu_ib_ring_tests(adev);
 	if (r)
 		DRM_ERROR("ib ring test failed (%d).\n", r);
-
-	/*
-	 * set to low pstate by default
-	 * This should be performed after all devices from
-	 * XGMI finish their initializations. Thus it's moved
-	 * to here.
-	 * The time delay is 2S. TODO: confirm whether that
-	 * is enough for all possible XGMI setups.
-	 */
-	r = amdgpu_xgmi_set_pstate(adev, 0);
-	if (r)
-		DRM_ERROR("pstate setting failed (%d).\n", r);
 }
 
 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
-- 
2.23.0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 1/2] drm/amdgpu: fix possible pstate switch race condition
@ 2019-11-05 11:34     ` Xu, Feifei
  0 siblings, 0 replies; 20+ messages in thread
From: Xu, Feifei @ 2019-11-05 11:34 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Strawbridge, Michael, Quan, Evan, Kim, Jonathan



Series is Reviewed-by: Feifei Xu <Feifei.Xu@amd.com>

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Quan, Evan
Sent: 2019年11月5日 18:24
To: amd-gfx@lists.freedesktop.org
Cc: Strawbridge, Michael <Michael.Strawbridge@amd.com>; Kim, Jonathan <Jonathan.Kim@amd.com>; Quan, Evan <Evan.Quan@amd.com>
Subject: [PATCH 1/2] drm/amdgpu: fix possible pstate switch race condition

Added lock protection so that the p-state switch will be guarded to be sequential. Also update the hive pstate only all device from the hive are in the same state.

Change-Id: I165a6f44e8aec1e6da56eefa0fc49d36670e56fe
Signed-off-by: Evan Quan <evan.quan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 34 ++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 0469cc51a6fb..41cf2abd6209 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1041,6 +1041,9 @@ struct amdgpu_device {
 
 	uint64_t			unique_id;
 	uint64_t	df_perfmon_config_assign_mask[AMDGPU_MAX_DF_PERFMONS];
+
+	/* device pstate */
+	int				pstate;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 167d9fbd2c4f..de20a9a1c444 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -274,12 +274,18 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)  {
 	int ret = 0;
 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+	struct amdgpu_device *tmp_adev;
+	bool update_hive_pstate = true;
 
 	if (!hive)
 		return 0;
 
-	if (hive->pstate == pstate)
+	mutex_lock(&hive->hive_lock);
+
+	if (hive->pstate == pstate) {
+		mutex_unlock(&hive->hive_lock);
 		return 0;
+	}
 
 	dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);
 
@@ -290,11 +296,32 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
 		ret = adev->powerplay.pp_funcs->set_xgmi_pstate(adev->powerplay.pp_handle,
 								pstate);
 
-	if (ret)
+	if (ret) {
 		dev_err(adev->dev,
 			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
 			adev->gmc.xgmi.node_id,
 			adev->gmc.xgmi.hive_id, ret);
+		goto out;
+	}
+
+	/* Update device pstate */
+	adev->pstate = pstate;
+
+	/*
+	 * Update the hive pstate only all devices of the hive
+	 * are in the same pstate
+	 */
+	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+		if (tmp_adev->pstate != adev->pstate) {
+			update_hive_pstate = false;
+			break;
+		}
+	}
+	if (update_hive_pstate)
+		hive->pstate = pstate;
+
+out:
+	mutex_unlock(&hive->hive_lock);
 
 	return ret;
 }
@@ -369,6 +396,9 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		goto exit;
 	}
 
+	/* Set default device pstate */
+	adev->pstate = -1;
+
 	top_info = &adev->psp.xgmi_context.top_info;
 
 	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
--
2.23.0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 1/2] drm/amdgpu: fix possible pstate switch race condition
@ 2019-11-05 11:34     ` Xu, Feifei
  0 siblings, 0 replies; 20+ messages in thread
From: Xu, Feifei @ 2019-11-05 11:34 UTC (permalink / raw)
  To: Quan, Evan, amd-gfx; +Cc: Strawbridge, Michael, Quan, Evan, Kim, Jonathan



Series is Reviewed-by: Feifei Xu <Feifei.Xu@amd.com>

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Quan, Evan
Sent: 2019年11月5日 18:24
To: amd-gfx@lists.freedesktop.org
Cc: Strawbridge, Michael <Michael.Strawbridge@amd.com>; Kim, Jonathan <Jonathan.Kim@amd.com>; Quan, Evan <Evan.Quan@amd.com>
Subject: [PATCH 1/2] drm/amdgpu: fix possible pstate switch race condition

Added lock protection so that the p-state switch will be guarded to be sequential. Also update the hive pstate only all device from the hive are in the same state.

Change-Id: I165a6f44e8aec1e6da56eefa0fc49d36670e56fe
Signed-off-by: Evan Quan <evan.quan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 34 ++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 0469cc51a6fb..41cf2abd6209 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1041,6 +1041,9 @@ struct amdgpu_device {
 
 	uint64_t			unique_id;
 	uint64_t	df_perfmon_config_assign_mask[AMDGPU_MAX_DF_PERFMONS];
+
+	/* device pstate */
+	int				pstate;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 167d9fbd2c4f..de20a9a1c444 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -274,12 +274,18 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)  {
 	int ret = 0;
 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+	struct amdgpu_device *tmp_adev;
+	bool update_hive_pstate = true;
 
 	if (!hive)
 		return 0;
 
-	if (hive->pstate == pstate)
+	mutex_lock(&hive->hive_lock);
+
+	if (hive->pstate == pstate) {
+		mutex_unlock(&hive->hive_lock);
 		return 0;
+	}
 
 	dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);
 
@@ -290,11 +296,32 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
 		ret = adev->powerplay.pp_funcs->set_xgmi_pstate(adev->powerplay.pp_handle,
 								pstate);
 
-	if (ret)
+	if (ret) {
 		dev_err(adev->dev,
 			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
 			adev->gmc.xgmi.node_id,
 			adev->gmc.xgmi.hive_id, ret);
+		goto out;
+	}
+
+	/* Update device pstate */
+	adev->pstate = pstate;
+
+	/*
+	 * Update the hive pstate only all devices of the hive
+	 * are in the same pstate
+	 */
+	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+		if (tmp_adev->pstate != adev->pstate) {
+			update_hive_pstate = false;
+			break;
+		}
+	}
+	if (update_hive_pstate)
+		hive->pstate = pstate;
+
+out:
+	mutex_unlock(&hive->hive_lock);
 
 	return ret;
 }
@@ -369,6 +396,9 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		goto exit;
 	}
 
+	/* Set default device pstate */
+	adev->pstate = -1;
+
 	top_info = &adev->psp.xgmi_context.top_info;
 
 	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
--
2.23.0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-05 16:32         ` Kim, Jonathan
  0 siblings, 0 replies; 20+ messages in thread
From: Kim, Jonathan @ 2019-11-05 16:32 UTC (permalink / raw)
  To: Quan, Evan, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Strawbridge, Michael

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan@amd.com> 
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx@lists.freedesktop.org
Cc: Kim, Jonathan <Jonathan.Kim@amd.com>; Strawbridge, Michael <Michael.Strawbridge@amd.com>; Quan, Evan <Evan.Quan@amd.com>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+	struct amdgpu_gpu_instance *gpu_instance;
 	int i = 0, r;
 
 	for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
 	if (r)
 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
 
+
+	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+		mutex_lock(&mgpu_info.mutex);
+
+		/*
+		 * Reset device p-state to low as this was booted with high.
+		 *
+		 * This should be performed only after all devices from the same
+		 * hive get initialized.
+		 *
+		 * However, it's unknown how many device in the hive in advance.
+		 * As this is counted one by one during devices initializations.
+		 *
+		 * So, we wait for all XGMI interlinked devices initialized.
+		 * This may bring some delays as those devices may come from
+		 * different hives. But that should be OK.
+		 */
+		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+			for (i = 0; i < mgpu_info.num_gpu; i++) {
+				gpu_instance = &(mgpu_info.gpu_ins[i]);
+				if (gpu_instance->adev->flags & AMD_IS_APU)
+					continue;
+
+				r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+				if (r) {
+					DRM_ERROR("pstate setting failed (%d).\n", r);
+					break;
+				}
+			}
+		}
+
+		mutex_unlock(&mgpu_info.mutex);
+	}
+
 	return 0;
 }
 
@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
 	r = amdgpu_ib_ring_tests(adev);
 	if (r)
 		DRM_ERROR("ib ring test failed (%d).\n", r);
-
-	/*
-	 * set to low pstate by default
-	 * This should be performed after all devices from
-	 * XGMI finish their initializations. Thus it's moved
-	 * to here.
-	 * The time delay is 2S. TODO: confirm whether that
-	 * is enough for all possible XGMI setups.
-	 */
-	r = amdgpu_xgmi_set_pstate(adev, 0);
-	if (r)
-		DRM_ERROR("pstate setting failed (%d).\n", r);
 }
 
 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-05 16:32         ` Kim, Jonathan
  0 siblings, 0 replies; 20+ messages in thread
From: Kim, Jonathan @ 2019-11-05 16:32 UTC (permalink / raw)
  To: Quan, Evan, amd-gfx; +Cc: Strawbridge, Michael

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan@amd.com> 
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx@lists.freedesktop.org
Cc: Kim, Jonathan <Jonathan.Kim@amd.com>; Strawbridge, Michael <Michael.Strawbridge@amd.com>; Quan, Evan <Evan.Quan@amd.com>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+	struct amdgpu_gpu_instance *gpu_instance;
 	int i = 0, r;
 
 	for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
 	if (r)
 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
 
+
+	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+		mutex_lock(&mgpu_info.mutex);
+
+		/*
+		 * Reset device p-state to low as this was booted with high.
+		 *
+		 * This should be performed only after all devices from the same
+		 * hive get initialized.
+		 *
+		 * However, it's unknown how many device in the hive in advance.
+		 * As this is counted one by one during devices initializations.
+		 *
+		 * So, we wait for all XGMI interlinked devices initialized.
+		 * This may bring some delays as those devices may come from
+		 * different hives. But that should be OK.
+		 */
+		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+			for (i = 0; i < mgpu_info.num_gpu; i++) {
+				gpu_instance = &(mgpu_info.gpu_ins[i]);
+				if (gpu_instance->adev->flags & AMD_IS_APU)
+					continue;
+
+				r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+				if (r) {
+					DRM_ERROR("pstate setting failed (%d).\n", r);
+					break;
+				}
+			}
+		}
+
+		mutex_unlock(&mgpu_info.mutex);
+	}
+
 	return 0;
 }
 
@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
 	r = amdgpu_ib_ring_tests(adev);
 	if (r)
 		DRM_ERROR("ib ring test failed (%d).\n", r);
-
-	/*
-	 * set to low pstate by default
-	 * This should be performed after all devices from
-	 * XGMI finish their initializations. Thus it's moved
-	 * to here.
-	 * The time delay is 2S. TODO: confirm whether that
-	 * is enough for all possible XGMI setups.
-	 */
-	r = amdgpu_xgmi_set_pstate(adev, 0);
-	if (r)
-		DRM_ERROR("pstate setting failed (%d).\n", r);
 }
 
 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-05 16:39             ` Strawbridge, Michael
  0 siblings, 0 replies; 20+ messages in thread
From: Strawbridge, Michael @ 2019-11-05 16:39 UTC (permalink / raw)
  To: Kim, Jonathan, Quan, Evan, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 4632 bytes --]

Hi Jon,

Does that mean we can simply use this instead?

+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) {

Thanks,
Michael
________________________________
From: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>
Sent: 05 November 2019 11:32 AM
To: Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org <amd-gfx@lists.freedesktop.org>
Cc: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org>
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>; Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan-5C7GfCeVMHo@public.gmane.org>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+       struct amdgpu_gpu_instance *gpu_instance;
         int i = 0, r;

         for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
         if (r)
                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               mutex_lock(&mgpu_info.mutex);
+
+               /*
+                * Reset device p-state to low as this was booted with high.
+                *
+                * This should be performed only after all devices from the same
+                * hive get initialized.
+                *
+                * However, it's unknown how many device in the hive in advance.
+                * As this is counted one by one during devices initializations.
+                *
+                * So, we wait for all XGMI interlinked devices initialized.
+                * This may bring some delays as those devices may come from
+                * different hives. But that should be OK.
+                */
+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+
+                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+                               if (r) {
+                                       DRM_ERROR("pstate setting failed (%d).\n", r);
+                                       break;
+                               }
+                       }
+               }
+
+               mutex_unlock(&mgpu_info.mutex);
+       }
+
         return 0;
 }

@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
         r = amdgpu_ib_ring_tests(adev);
         if (r)
                 DRM_ERROR("ib ring test failed (%d).\n", r);
-
-       /*
-        * set to low pstate by default
-        * This should be performed after all devices from
-        * XGMI finish their initializations. Thus it's moved
-        * to here.
-        * The time delay is 2S. TODO: confirm whether that
-        * is enough for all possible XGMI setups.
-        */
-       r = amdgpu_xgmi_set_pstate(adev, 0);
-       if (r)
-               DRM_ERROR("pstate setting failed (%d).\n", r);
 }

 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0


[-- Attachment #1.2: Type: text/html, Size: 11033 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-05 16:39             ` Strawbridge, Michael
  0 siblings, 0 replies; 20+ messages in thread
From: Strawbridge, Michael @ 2019-11-05 16:39 UTC (permalink / raw)
  To: Kim, Jonathan, Quan, Evan, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 4408 bytes --]

Hi Jon,

Does that mean we can simply use this instead?

+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) {

Thanks,
Michael
________________________________
From: Kim, Jonathan <Jonathan.Kim@amd.com>
Sent: 05 November 2019 11:32 AM
To: Quan, Evan <Evan.Quan@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Strawbridge, Michael <Michael.Strawbridge@amd.com>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan@amd.com>
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx@lists.freedesktop.org
Cc: Kim, Jonathan <Jonathan.Kim@amd.com>; Strawbridge, Michael <Michael.Strawbridge@amd.com>; Quan, Evan <Evan.Quan@amd.com>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+       struct amdgpu_gpu_instance *gpu_instance;
         int i = 0, r;

         for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
         if (r)
                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               mutex_lock(&mgpu_info.mutex);
+
+               /*
+                * Reset device p-state to low as this was booted with high.
+                *
+                * This should be performed only after all devices from the same
+                * hive get initialized.
+                *
+                * However, it's unknown how many device in the hive in advance.
+                * As this is counted one by one during devices initializations.
+                *
+                * So, we wait for all XGMI interlinked devices initialized.
+                * This may bring some delays as those devices may come from
+                * different hives. But that should be OK.
+                */
+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+
+                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+                               if (r) {
+                                       DRM_ERROR("pstate setting failed (%d).\n", r);
+                                       break;
+                               }
+                       }
+               }
+
+               mutex_unlock(&mgpu_info.mutex);
+       }
+
         return 0;
 }

@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
         r = amdgpu_ib_ring_tests(adev);
         if (r)
                 DRM_ERROR("ib ring test failed (%d).\n", r);
-
-       /*
-        * set to low pstate by default
-        * This should be performed after all devices from
-        * XGMI finish their initializations. Thus it's moved
-        * to here.
-        * The time delay is 2S. TODO: confirm whether that
-        * is enough for all possible XGMI setups.
-        */
-       r = amdgpu_xgmi_set_pstate(adev, 0);
-       if (r)
-               DRM_ERROR("pstate setting failed (%d).\n", r);
 }

 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0


[-- Attachment #1.2: Type: text/html, Size: 10778 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-05 16:41                 ` Kim, Jonathan
  0 siblings, 0 replies; 20+ messages in thread
From: Kim, Jonathan @ 2019-11-05 16:41 UTC (permalink / raw)
  To: Strawbridge, Michael, Quan, Evan,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 5659 bytes --]

Yes that's correct.  This should fix the issue.

Jon

From: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>
Sent: Tuesday, November 5, 2019 11:40 AM
To: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Subject: Re: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Hi Jon,

Does that mean we can simply use this instead?

+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) {

Thanks,
Michael
________________________________
From: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>
Sent: 05 November 2019 11:32 AM
To: Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> <amd-gfx-PD4FTy7X32mMSPqsTGOZug@public.gmane.orgesktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>>
Cc: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Cc: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>; Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan-5C7GfCeVMHo@public.gmane.org<mailto:evan.quan-5C7GfCeVMHo@public.gmane.org>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+       struct amdgpu_gpu_instance *gpu_instance;
         int i = 0, r;

         for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
         if (r)
                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               mutex_lock(&mgpu_info.mutex);
+
+               /*
+                * Reset device p-state to low as this was booted with high.
+                *
+                * This should be performed only after all devices from the same
+                * hive get initialized.
+                *
+                * However, it's unknown how many device in the hive in advance.
+                * As this is counted one by one during devices initializations.
+                *
+                * So, we wait for all XGMI interlinked devices initialized.
+                * This may bring some delays as those devices may come from
+                * different hives. But that should be OK.
+                */
+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+
+                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+                               if (r) {
+                                       DRM_ERROR("pstate setting failed (%d).\n", r);
+                                       break;
+                               }
+                       }
+               }
+
+               mutex_unlock(&mgpu_info.mutex);
+       }
+
         return 0;
 }

@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
         r = amdgpu_ib_ring_tests(adev);
         if (r)
                 DRM_ERROR("ib ring test failed (%d).\n", r);
-
-       /*
-        * set to low pstate by default
-        * This should be performed after all devices from
-        * XGMI finish their initializations. Thus it's moved
-        * to here.
-        * The time delay is 2S. TODO: confirm whether that
-        * is enough for all possible XGMI setups.
-        */
-       r = amdgpu_xgmi_set_pstate(adev, 0);
-       if (r)
-               DRM_ERROR("pstate setting failed (%d).\n", r);
 }

 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0

[-- Attachment #1.2: Type: text/html, Size: 14075 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-05 16:41                 ` Kim, Jonathan
  0 siblings, 0 replies; 20+ messages in thread
From: Kim, Jonathan @ 2019-11-05 16:41 UTC (permalink / raw)
  To: Strawbridge, Michael, Quan, Evan, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 5113 bytes --]

Yes that's correct.  This should fix the issue.

Jon

From: Strawbridge, Michael <Michael.Strawbridge@amd.com>
Sent: Tuesday, November 5, 2019 11:40 AM
To: Kim, Jonathan <Jonathan.Kim@amd.com>; Quan, Evan <Evan.Quan@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Hi Jon,

Does that mean we can simply use this instead?

+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) {

Thanks,
Michael
________________________________
From: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>
Sent: 05 November 2019 11:32 AM
To: Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>
Cc: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>; Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan@amd.com<mailto:evan.quan@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+       struct amdgpu_gpu_instance *gpu_instance;
         int i = 0, r;

         for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
         if (r)
                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               mutex_lock(&mgpu_info.mutex);
+
+               /*
+                * Reset device p-state to low as this was booted with high.
+                *
+                * This should be performed only after all devices from the same
+                * hive get initialized.
+                *
+                * However, it's unknown how many device in the hive in advance.
+                * As this is counted one by one during devices initializations.
+                *
+                * So, we wait for all XGMI interlinked devices initialized.
+                * This may bring some delays as those devices may come from
+                * different hives. But that should be OK.
+                */
+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+
+                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+                               if (r) {
+                                       DRM_ERROR("pstate setting failed (%d).\n", r);
+                                       break;
+                               }
+                       }
+               }
+
+               mutex_unlock(&mgpu_info.mutex);
+       }
+
         return 0;
 }

@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
         r = amdgpu_ib_ring_tests(adev);
         if (r)
                 DRM_ERROR("ib ring test failed (%d).\n", r);
-
-       /*
-        * set to low pstate by default
-        * This should be performed after all devices from
-        * XGMI finish their initializations. Thus it's moved
-        * to here.
-        * The time delay is 2S. TODO: confirm whether that
-        * is enough for all possible XGMI setups.
-        */
-       r = amdgpu_xgmi_set_pstate(adev, 0);
-       if (r)
-               DRM_ERROR("pstate setting failed (%d).\n", r);
 }

 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0

[-- Attachment #1.2: Type: text/html, Size: 13613 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-05 17:07                     ` Kim, Jonathan
  0 siblings, 0 replies; 20+ messages in thread
From: Kim, Jonathan @ 2019-11-05 17:07 UTC (permalink / raw)
  To: Strawbridge, Michael, Quan, Evan,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 6895 bytes --]

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
Also while num_physical_nodes should be used here instead.  It doesn't make sense to have a pre-condition check i.e. (if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) ) against the total number of nodes while the loop checks for the current num gpus from mgpu_info.  This will fail to set the last node to low pstate for the same 0-indexing issue once the pre-condition passes.

+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 11:42 AM
To: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Yes that's correct.  This should fix the issue.

Jon

From: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>
Sent: Tuesday, November 5, 2019 11:40 AM
To: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx-PD4FTy7X32mqWrfYKbYh0A@public.gmane.orgktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Subject: Re: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Hi Jon,

Does that mean we can simply use this instead?

+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) {

Thanks,
Michael
________________________________
From: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>
Sent: 05 November 2019 11:32 AM
To: Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> <amd-gfx-PD4FTy7X32mMSPqsTGOZug@public.gmane.orgesktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>>
Cc: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Cc: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>; Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan-5C7GfCeVMHo@public.gmane.org<mailto:evan.quan-5C7GfCeVMHo@public.gmane.org>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+       struct amdgpu_gpu_instance *gpu_instance;
         int i = 0, r;

         for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
         if (r)
                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               mutex_lock(&mgpu_info.mutex);
+
+               /*
+                * Reset device p-state to low as this was booted with high.
+                *
+                * This should be performed only after all devices from the same
+                * hive get initialized.
+                *
+                * However, it's unknown how many device in the hive in advance.
+                * As this is counted one by one during devices initializations.
+                *
+                * So, we wait for all XGMI interlinked devices initialized.
+                * This may bring some delays as those devices may come from
+                * different hives. But that should be OK.
+                */
+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+
+                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+                               if (r) {
+                                       DRM_ERROR("pstate setting failed (%d).\n", r);
+                                       break;
+                               }
+                       }
+               }
+
+               mutex_unlock(&mgpu_info.mutex);
+       }
+
         return 0;
 }

@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
         r = amdgpu_ib_ring_tests(adev);
         if (r)
                 DRM_ERROR("ib ring test failed (%d).\n", r);
-
-       /*
-        * set to low pstate by default
-        * This should be performed after all devices from
-        * XGMI finish their initializations. Thus it's moved
-        * to here.
-        * The time delay is 2S. TODO: confirm whether that
-        * is enough for all possible XGMI setups.
-        */
-       r = amdgpu_xgmi_set_pstate(adev, 0);
-       if (r)
-               DRM_ERROR("pstate setting failed (%d).\n", r);
 }

 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0

[-- Attachment #1.2: Type: text/html, Size: 16899 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-05 17:07                     ` Kim, Jonathan
  0 siblings, 0 replies; 20+ messages in thread
From: Kim, Jonathan @ 2019-11-05 17:07 UTC (permalink / raw)
  To: Strawbridge, Michael, Quan, Evan, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 6190 bytes --]

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
Also while num_physical_nodes should be used here instead.  It doesn't make sense to have a pre-condition check i.e. (if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) ) against the total number of nodes while the loop checks for the current num gpus from mgpu_info.  This will fail to set the last node to low pstate for the same 0-indexing issue once the pre-condition passes.

+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 11:42 AM
To: Strawbridge, Michael <Michael.Strawbridge@amd.com>; Quan, Evan <Evan.Quan@amd.com>; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Yes that's correct.  This should fix the issue.

Jon

From: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>
Sent: Tuesday, November 5, 2019 11:40 AM
To: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Hi Jon,

Does that mean we can simply use this instead?

+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) {

Thanks,
Michael
________________________________
From: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>
Sent: 05 November 2019 11:32 AM
To: Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>
Cc: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>; Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan@amd.com<mailto:evan.quan@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+       struct amdgpu_gpu_instance *gpu_instance;
         int i = 0, r;

         for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
         if (r)
                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               mutex_lock(&mgpu_info.mutex);
+
+               /*
+                * Reset device p-state to low as this was booted with high.
+                *
+                * This should be performed only after all devices from the same
+                * hive get initialized.
+                *
+                * However, it's unknown how many device in the hive in advance.
+                * As this is counted one by one during devices initializations.
+                *
+                * So, we wait for all XGMI interlinked devices initialized.
+                * This may bring some delays as those devices may come from
+                * different hives. But that should be OK.
+                */
+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+
+                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+                               if (r) {
+                                       DRM_ERROR("pstate setting failed (%d).\n", r);
+                                       break;
+                               }
+                       }
+               }
+
+               mutex_unlock(&mgpu_info.mutex);
+       }
+
         return 0;
 }

@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
         r = amdgpu_ib_ring_tests(adev);
         if (r)
                 DRM_ERROR("ib ring test failed (%d).\n", r);
-
-       /*
-        * set to low pstate by default
-        * This should be performed after all devices from
-        * XGMI finish their initializations. Thus it's moved
-        * to here.
-        * The time delay is 2S. TODO: confirm whether that
-        * is enough for all possible XGMI setups.
-        */
-       r = amdgpu_xgmi_set_pstate(adev, 0);
-       if (r)
-               DRM_ERROR("pstate setting failed (%d).\n", r);
 }

 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0

[-- Attachment #1.2: Type: text/html, Size: 16185 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-06  2:17                         ` Kim, Jonathan
  0 siblings, 0 replies; 20+ messages in thread
From: Kim, Jonathan @ 2019-11-06  2:17 UTC (permalink / raw)
  To: Strawbridge, Michael, Quan, Evan,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 7773 bytes --]

Taking a look at this again, it's not an indexing issue, it's a placement problem.  I don't think this solution will work if we expect to pstate switch all gpus.

>From amdgpu_kms.c,  amdgpu_register_gpu_instance comes after amdgpu_device_init.  This means we'll never reach mgpu_info.num_gpu == adev->gmc.xgmi.num_physical_nodes until in this code path.

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 12:07 PM
To: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
Also while num_physical_nodes should be used here instead.  It doesn't make sense to have a pre-condition check i.e. (if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) ) against the total number of nodes while the loop checks for the current num gpus from mgpu_info.  This will fail to set the last node to low pstate for the same 0-indexing issue once the pre-condition passes.

+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 11:42 AM
To: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Yes that's correct.  This should fix the issue.

Jon

From: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>
Sent: Tuesday, November 5, 2019 11:40 AM
To: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx-PD4FTy7X32mqWrfYKbYh0A@public.gmane.orgktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Subject: Re: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Hi Jon,

Does that mean we can simply use this instead?

+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) {

Thanks,
Michael
________________________________
From: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>
Sent: 05 November 2019 11:32 AM
To: Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> <amd-gfx-PD4FTy7X32mMSPqsTGOZug@public.gmane.orgesktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>>
Cc: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Cc: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>; Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan-5C7GfCeVMHo@public.gmane.org<mailto:evan.quan-5C7GfCeVMHo@public.gmane.org>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+       struct amdgpu_gpu_instance *gpu_instance;
         int i = 0, r;

         for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
         if (r)
                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               mutex_lock(&mgpu_info.mutex);
+
+               /*
+                * Reset device p-state to low as this was booted with high.
+                *
+                * This should be performed only after all devices from the same
+                * hive get initialized.
+                *
+                * However, it's unknown how many device in the hive in advance.
+                * As this is counted one by one during devices initializations.
+                *
+                * So, we wait for all XGMI interlinked devices initialized.
+                * This may bring some delays as those devices may come from
+                * different hives. But that should be OK.
+                */
+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+
+                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+                               if (r) {
+                                       DRM_ERROR("pstate setting failed (%d).\n", r);
+                                       break;
+                               }
+                       }
+               }
+
+               mutex_unlock(&mgpu_info.mutex);
+       }
+
         return 0;
 }

@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
         r = amdgpu_ib_ring_tests(adev);
         if (r)
                 DRM_ERROR("ib ring test failed (%d).\n", r);
-
-       /*
-        * set to low pstate by default
-        * This should be performed after all devices from
-        * XGMI finish their initializations. Thus it's moved
-        * to here.
-        * The time delay is 2S. TODO: confirm whether that
-        * is enough for all possible XGMI setups.
-        */
-       r = amdgpu_xgmi_set_pstate(adev, 0);
-       if (r)
-               DRM_ERROR("pstate setting failed (%d).\n", r);
 }

 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0

[-- Attachment #1.2: Type: text/html, Size: 18338 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-06  2:17                         ` Kim, Jonathan
  0 siblings, 0 replies; 20+ messages in thread
From: Kim, Jonathan @ 2019-11-06  2:17 UTC (permalink / raw)
  To: Strawbridge, Michael, Quan, Evan, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 6927 bytes --]

Taking a look at this again, it's not an indexing issue, it's a placement problem.  I don't think this solution will work if we expect to pstate switch all gpus.

From amdgpu_kms.c,  amdgpu_register_gpu_instance comes after amdgpu_device_init.  This means we'll never reach mgpu_info.num_gpu == adev->gmc.xgmi.num_physical_nodes until in this code path.

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 12:07 PM
To: Strawbridge, Michael <Michael.Strawbridge@amd.com>; Quan, Evan <Evan.Quan@amd.com>; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
Also while num_physical_nodes should be used here instead.  It doesn't make sense to have a pre-condition check i.e. (if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) ) against the total number of nodes while the loop checks for the current num gpus from mgpu_info.  This will fail to set the last node to low pstate for the same 0-indexing issue once the pre-condition passes.

+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 11:42 AM
To: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Yes that's correct.  This should fix the issue.

Jon

From: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>
Sent: Tuesday, November 5, 2019 11:40 AM
To: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Hi Jon,

Does that mean we can simply use this instead?

+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) {

Thanks,
Michael
________________________________
From: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>
Sent: 05 November 2019 11:32 AM
To: Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>
Cc: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>; Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan@amd.com<mailto:evan.quan@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+       struct amdgpu_gpu_instance *gpu_instance;
         int i = 0, r;

         for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
         if (r)
                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               mutex_lock(&mgpu_info.mutex);
+
+               /*
+                * Reset device p-state to low as this was booted with high.
+                *
+                * This should be performed only after all devices from the same
+                * hive get initialized.
+                *
+                * However, it's unknown how many device in the hive in advance.
+                * As this is counted one by one during devices initializations.
+                *
+                * So, we wait for all XGMI interlinked devices initialized.
+                * This may bring some delays as those devices may come from
+                * different hives. But that should be OK.
+                */
+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+
+                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+                               if (r) {
+                                       DRM_ERROR("pstate setting failed (%d).\n", r);
+                                       break;
+                               }
+                       }
+               }
+
+               mutex_unlock(&mgpu_info.mutex);
+       }
+
         return 0;
 }

@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
         r = amdgpu_ib_ring_tests(adev);
         if (r)
                 DRM_ERROR("ib ring test failed (%d).\n", r);
-
-       /*
-        * set to low pstate by default
-        * This should be performed after all devices from
-        * XGMI finish their initializations. Thus it's moved
-        * to here.
-        * The time delay is 2S. TODO: confirm whether that
-        * is enough for all possible XGMI setups.
-        */
-       r = amdgpu_xgmi_set_pstate(adev, 0);
-       if (r)
-               DRM_ERROR("pstate setting failed (%d).\n", r);
 }

 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0

[-- Attachment #1.2: Type: text/html, Size: 17502 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-06  3:10                             ` Quan, Evan
  0 siblings, 0 replies; 20+ messages in thread
From: Quan, Evan @ 2019-11-06  3:10 UTC (permalink / raw)
  To: Kim, Jonathan, Strawbridge, Michael,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 8573 bytes --]

Thanks Jon.
Per discussed in another mail thread, this should be applied after the patch below
https://lists.freedesktop.org/archives/amd-gfx/2019-November/042160.html
Sorry for missing this important information.

Regards,
Evan
From: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>
Sent: Wednesday, November 6, 2019 10:18 AM
To: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Taking a look at this again, it's not an indexing issue, it's a placement problem.  I don't think this solution will work if we expect to pstate switch all gpus.

>From amdgpu_kms.c,  amdgpu_register_gpu_instance comes after amdgpu_device_init.  This means we'll never reach mgpu_info.num_gpu == adev->gmc.xgmi.num_physical_nodes until in this code path.

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 12:07 PM
To: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
Also while num_physical_nodes should be used here instead.  It doesn't make sense to have a pre-condition check i.e. (if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) ) against the total number of nodes while the loop checks for the current num gpus from mgpu_info.  This will fail to set the last node to low pstate for the same 0-indexing issue once the pre-condition passes.

+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 11:42 AM
To: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Yes that's correct.  This should fix the issue.

Jon

From: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>
Sent: Tuesday, November 5, 2019 11:40 AM
To: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx-PD4FTy7X32mqWrfYKbYh0A@public.gmane.orgktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Subject: Re: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Hi Jon,

Does that mean we can simply use this instead?

+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) {

Thanks,
Michael
________________________________
From: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>
Sent: 05 November 2019 11:32 AM
To: Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> <amd-gfx-PD4FTy7X32mMSPqsTGOZug@public.gmane.orgesktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>>
Cc: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Cc: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>; Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan-5C7GfCeVMHo@public.gmane.org<mailto:evan.quan-5C7GfCeVMHo@public.gmane.org>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+       struct amdgpu_gpu_instance *gpu_instance;
         int i = 0, r;

         for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
         if (r)
                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               mutex_lock(&mgpu_info.mutex);
+
+               /*
+                * Reset device p-state to low as this was booted with high.
+                *
+                * This should be performed only after all devices from the same
+                * hive get initialized.
+                *
+                * However, it's unknown how many device in the hive in advance.
+                * As this is counted one by one during devices initializations.
+                *
+                * So, we wait for all XGMI interlinked devices initialized.
+                * This may bring some delays as those devices may come from
+                * different hives. But that should be OK.
+                */
+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+
+                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+                               if (r) {
+                                       DRM_ERROR("pstate setting failed (%d).\n", r);
+                                       break;
+                               }
+                       }
+               }
+
+               mutex_unlock(&mgpu_info.mutex);
+       }
+
         return 0;
 }

@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
         r = amdgpu_ib_ring_tests(adev);
         if (r)
                 DRM_ERROR("ib ring test failed (%d).\n", r);
-
-       /*
-        * set to low pstate by default
-        * This should be performed after all devices from
-        * XGMI finish their initializations. Thus it's moved
-        * to here.
-        * The time delay is 2S. TODO: confirm whether that
-        * is enough for all possible XGMI setups.
-        */
-       r = amdgpu_xgmi_set_pstate(adev, 0);
-       if (r)
-               DRM_ERROR("pstate setting failed (%d).\n", r);
 }

 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0

[-- Attachment #1.2: Type: text/html, Size: 20083 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-06  3:10                             ` Quan, Evan
  0 siblings, 0 replies; 20+ messages in thread
From: Quan, Evan @ 2019-11-06  3:10 UTC (permalink / raw)
  To: Kim, Jonathan, Strawbridge, Michael, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 7566 bytes --]

Thanks Jon.
Per discussed in another mail thread, this should be applied after the patch below
https://lists.freedesktop.org/archives/amd-gfx/2019-November/042160.html
Sorry for missing this important information.

Regards,
Evan
From: Kim, Jonathan <Jonathan.Kim@amd.com>
Sent: Wednesday, November 6, 2019 10:18 AM
To: Strawbridge, Michael <Michael.Strawbridge@amd.com>; Quan, Evan <Evan.Quan@amd.com>; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Taking a look at this again, it's not an indexing issue, it's a placement problem.  I don't think this solution will work if we expect to pstate switch all gpus.

From amdgpu_kms.c,  amdgpu_register_gpu_instance comes after amdgpu_device_init.  This means we'll never reach mgpu_info.num_gpu == adev->gmc.xgmi.num_physical_nodes until in this code path.

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 12:07 PM
To: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
Also while num_physical_nodes should be used here instead.  It doesn't make sense to have a pre-condition check i.e. (if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) ) against the total number of nodes while the loop checks for the current num gpus from mgpu_info.  This will fail to set the last node to low pstate for the same 0-indexing issue once the pre-condition passes.

+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 11:42 AM
To: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Yes that's correct.  This should fix the issue.

Jon

From: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>
Sent: Tuesday, November 5, 2019 11:40 AM
To: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Hi Jon,

Does that mean we can simply use this instead?

+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) {

Thanks,
Michael
________________________________
From: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>
Sent: 05 November 2019 11:32 AM
To: Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>
Cc: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>; Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan@amd.com<mailto:evan.quan@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+       struct amdgpu_gpu_instance *gpu_instance;
         int i = 0, r;

         for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
         if (r)
                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               mutex_lock(&mgpu_info.mutex);
+
+               /*
+                * Reset device p-state to low as this was booted with high.
+                *
+                * This should be performed only after all devices from the same
+                * hive get initialized.
+                *
+                * However, it's unknown how many device in the hive in advance.
+                * As this is counted one by one during devices initializations.
+                *
+                * So, we wait for all XGMI interlinked devices initialized.
+                * This may bring some delays as those devices may come from
+                * different hives. But that should be OK.
+                */
+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+
+                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+                               if (r) {
+                                       DRM_ERROR("pstate setting failed (%d).\n", r);
+                                       break;
+                               }
+                       }
+               }
+
+               mutex_unlock(&mgpu_info.mutex);
+       }
+
         return 0;
 }

@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
         r = amdgpu_ib_ring_tests(adev);
         if (r)
                 DRM_ERROR("ib ring test failed (%d).\n", r);
-
-       /*
-        * set to low pstate by default
-        * This should be performed after all devices from
-        * XGMI finish their initializations. Thus it's moved
-        * to here.
-        * The time delay is 2S. TODO: confirm whether that
-        * is enough for all possible XGMI setups.
-        */
-       r = amdgpu_xgmi_set_pstate(adev, 0);
-       if (r)
-               DRM_ERROR("pstate setting failed (%d).\n", r);
 }

 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0

[-- Attachment #1.2: Type: text/html, Size: 19104 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-06  3:49                                 ` Kim, Jonathan
  0 siblings, 0 replies; 20+ messages in thread
From: Kim, Jonathan @ 2019-11-06  3:49 UTC (permalink / raw)
  To: Quan, Evan, Strawbridge, Michael,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 9375 bytes --]

No problem.  With the incoming patch referenced below, this patch should be ok since it doesn't break the build.

Reviewed-by:  Jonathan Kim <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>

From: Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org>
Sent: Tuesday, November 5, 2019 10:11 PM
To: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>; Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Thanks Jon.
Per discussed in another mail thread, this should be applied after the patch below
https://lists.freedesktop.org/archives/amd-gfx/2019-November/042160.html
Sorry for missing this important information.

Regards,
Evan
From: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>
Sent: Wednesday, November 6, 2019 10:18 AM
To: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Taking a look at this again, it's not an indexing issue, it's a placement problem.  I don't think this solution will work if we expect to pstate switch all gpus.

>From amdgpu_kms.c,  amdgpu_register_gpu_instance comes after amdgpu_device_init.  This means we'll never reach mgpu_info.num_gpu == adev->gmc.xgmi.num_physical_nodes until in this code path.

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 12:07 PM
To: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
Also while num_physical_nodes should be used here instead.  It doesn't make sense to have a pre-condition check i.e. (if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) ) against the total number of nodes while the loop checks for the current num gpus from mgpu_info.  This will fail to set the last node to low pstate for the same 0-indexing issue once the pre-condition passes.

+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 11:42 AM
To: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Yes that's correct.  This should fix the issue.

Jon

From: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>
Sent: Tuesday, November 5, 2019 11:40 AM
To: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx-PD4FTy7X32mqWrfYKbYh0A@public.gmane.orgktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Subject: Re: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Hi Jon,

Does that mean we can simply use this instead?

+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) {

Thanks,
Michael
________________________________
From: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>
Sent: 05 November 2019 11:32 AM
To: Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> <amd-gfx-PD4FTy7X32mMSPqsTGOZug@public.gmane.orgesktop.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>>
Cc: Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org>>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org<mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Cc: Kim, Jonathan <Jonathan.Kim-5C7GfCeVMHo@public.gmane.org<mailto:Jonathan.Kim-5C7GfCeVMHo@public.gmane.org>>; Strawbridge, Michael <Michael.Strawbridge-5C7GfCeVMHo@public.gmane.org<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan-5C7GfCeVMHo@public.gmane.org<mailto:Evan.Quan-5C7GfCeVMHo@public.gmane.org>>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan-5C7GfCeVMHo@public.gmane.org<mailto:evan.quan-5C7GfCeVMHo@public.gmane.org>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+       struct amdgpu_gpu_instance *gpu_instance;
         int i = 0, r;

         for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
         if (r)
                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               mutex_lock(&mgpu_info.mutex);
+
+               /*
+                * Reset device p-state to low as this was booted with high.
+                *
+                * This should be performed only after all devices from the same
+                * hive get initialized.
+                *
+                * However, it's unknown how many device in the hive in advance.
+                * As this is counted one by one during devices initializations.
+                *
+                * So, we wait for all XGMI interlinked devices initialized.
+                * This may bring some delays as those devices may come from
+                * different hives. But that should be OK.
+                */
+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+
+                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+                               if (r) {
+                                       DRM_ERROR("pstate setting failed (%d).\n", r);
+                                       break;
+                               }
+                       }
+               }
+
+               mutex_unlock(&mgpu_info.mutex);
+       }
+
         return 0;
 }

@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
         r = amdgpu_ib_ring_tests(adev);
         if (r)
                 DRM_ERROR("ib ring test failed (%d).\n", r);
-
-       /*
-        * set to low pstate by default
-        * This should be performed after all devices from
-        * XGMI finish their initializations. Thus it's moved
-        * to here.
-        * The time delay is 2S. TODO: confirm whether that
-        * is enough for all possible XGMI setups.
-        */
-       r = amdgpu_xgmi_set_pstate(adev, 0);
-       if (r)
-               DRM_ERROR("pstate setting failed (%d).\n", r);
 }

 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0

[-- Attachment #1.2: Type: text/html, Size: 21392 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized
@ 2019-11-06  3:49                                 ` Kim, Jonathan
  0 siblings, 0 replies; 20+ messages in thread
From: Kim, Jonathan @ 2019-11-06  3:49 UTC (permalink / raw)
  To: Quan, Evan, Strawbridge, Michael, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 8165 bytes --]

No problem.  With the incoming patch referenced below, this patch should be ok since it doesn't break the build.

Reviewed-by:  Jonathan Kim <Jonathan.Kim@amd.com>

From: Quan, Evan <Evan.Quan@amd.com>
Sent: Tuesday, November 5, 2019 10:11 PM
To: Kim, Jonathan <Jonathan.Kim@amd.com>; Strawbridge, Michael <Michael.Strawbridge@amd.com>; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Thanks Jon.
Per discussed in another mail thread, this should be applied after the patch below
https://lists.freedesktop.org/archives/amd-gfx/2019-November/042160.html
Sorry for missing this important information.

Regards,
Evan
From: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>
Sent: Wednesday, November 6, 2019 10:18 AM
To: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Taking a look at this again, it's not an indexing issue, it's a placement problem.  I don't think this solution will work if we expect to pstate switch all gpus.

From amdgpu_kms.c,  amdgpu_register_gpu_instance comes after amdgpu_device_init.  This means we'll never reach mgpu_info.num_gpu == adev->gmc.xgmi.num_physical_nodes until in this code path.

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 12:07 PM
To: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
Also while num_physical_nodes should be used here instead.  It doesn't make sense to have a pre-condition check i.e. (if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) ) against the total number of nodes while the loop checks for the current num gpus from mgpu_info.  This will fail to set the last node to low pstate for the same 0-indexing issue once the pre-condition passes.

+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+

From: Kim, Jonathan
Sent: Tuesday, November 5, 2019 11:42 AM
To: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Yes that's correct.  This should fix the issue.

Jon

From: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>
Sent: Tuesday, November 5, 2019 11:40 AM
To: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Hi Jon,

Does that mean we can simply use this instead?

+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes - 1) {

Thanks,
Michael
________________________________
From: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>
Sent: 05 November 2019 11:32 AM
To: Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>
Cc: Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>
Subject: RE: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

Please see inline.

Jon

-----Original Message-----
From: Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>
Sent: Tuesday, November 5, 2019 5:24 AM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Kim, Jonathan <Jonathan.Kim@amd.com<mailto:Jonathan.Kim@amd.com>>; Strawbridge, Michael <Michael.Strawbridge@amd.com<mailto:Michael.Strawbridge@amd.com>>; Quan, Evan <Evan.Quan@amd.com<mailto:Evan.Quan@amd.com>>
Subject: [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized

P-state switch should be performed after all devices from the hive get initialized.

Change-Id: Ifc7cac9ef0cf250447d2a412da35d601e2ac79ec
Signed-off-by: Evan Quan <evan.quan@amd.com<mailto:evan.quan@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949670e5..2d72d206cead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2090,6 +2090,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
  */
 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)  {
+       struct amdgpu_gpu_instance *gpu_instance;
         int i = 0, r;

         for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2115,6 +2116,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
         if (r)
                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               mutex_lock(&mgpu_info.mutex);
+
+               /*
+                * Reset device p-state to low as this was booted with high.
+                *
+                * This should be performed only after all devices from the same
+                * hive get initialized.
+                *
+                * However, it's unknown how many device in the hive in advance.
+                * As this is counted one by one during devices initializations.
+                *
+                * So, we wait for all XGMI interlinked devices initialized.
+                * This may bring some delays as those devices may come from
+                * different hives. But that should be OK.
+                */
+               if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
[JK] This condition will never succeed.  mgpu_info.num_dgpu is 0-indexed while num_physical_nodes give total nodes.

+                       for (i = 0; i < mgpu_info.num_gpu; i++) {
+                               gpu_instance = &(mgpu_info.gpu_ins[i]);
+                               if (gpu_instance->adev->flags & AMD_IS_APU)
+                                       continue;
+
+                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+                               if (r) {
+                                       DRM_ERROR("pstate setting failed (%d).\n", r);
+                                       break;
+                               }
+                       }
+               }
+
+               mutex_unlock(&mgpu_info.mutex);
+       }
+
         return 0;
 }

@@ -2227,18 +2262,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
         r = amdgpu_ib_ring_tests(adev);
         if (r)
                 DRM_ERROR("ib ring test failed (%d).\n", r);
-
-       /*
-        * set to low pstate by default
-        * This should be performed after all devices from
-        * XGMI finish their initializations. Thus it's moved
-        * to here.
-        * The time delay is 2S. TODO: confirm whether that
-        * is enough for all possible XGMI setups.
-        */
-       r = amdgpu_xgmi_set_pstate(adev, 0);
-       if (r)
-               DRM_ERROR("pstate setting failed (%d).\n", r);
 }

 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
--
2.23.0

[-- Attachment #1.2: Type: text/html, Size: 20249 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2019-11-06  3:49 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-05 10:23 [PATCH 1/2] drm/amdgpu: fix possible pstate switch race condition Quan, Evan
2019-11-05 10:23 ` Quan, Evan
     [not found] ` <20191105102310.16657-1-evan.quan-5C7GfCeVMHo@public.gmane.org>
2019-11-05 10:23   ` [PATCH 2/2] drm/amdgpu: perform p-state switch after the whole hive initialized Quan, Evan
2019-11-05 10:23     ` Quan, Evan
     [not found]     ` <20191105102310.16657-2-evan.quan-5C7GfCeVMHo@public.gmane.org>
2019-11-05 16:32       ` Kim, Jonathan
2019-11-05 16:32         ` Kim, Jonathan
     [not found]         ` <CH2PR12MB3831CEABDD4CBA2FEB8DEFEB857E0-rV3HgkLYx2Wb1l0FfFgDpQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-05 16:39           ` Strawbridge, Michael
2019-11-05 16:39             ` Strawbridge, Michael
     [not found]             ` <DM5PR12MB248696FDB2D0E563221E7CDFE87E0-2J9CzHegvk8hnZ7s9K7NnwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-05 16:41               ` Kim, Jonathan
2019-11-05 16:41                 ` Kim, Jonathan
     [not found]                 ` <CH2PR12MB3831D16D352FA009002800CC857E0-rV3HgkLYx2Wb1l0FfFgDpQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-05 17:07                   ` Kim, Jonathan
2019-11-05 17:07                     ` Kim, Jonathan
     [not found]                     ` <CH2PR12MB3831868F2A7B4A8BCCBA24F7857E0-rV3HgkLYx2Wb1l0FfFgDpQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-06  2:17                       ` Kim, Jonathan
2019-11-06  2:17                         ` Kim, Jonathan
     [not found]                         ` <CH2PR12MB383150DB7FA13B7E5472A37085790-rV3HgkLYx2Wb1l0FfFgDpQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-06  3:10                           ` Quan, Evan
2019-11-06  3:10                             ` Quan, Evan
     [not found]                             ` <MN2PR12MB3344BF9A70FE63F2D5E4AE63E4790-rweVpJHSKToDMgCC8P//OwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-06  3:49                               ` Kim, Jonathan
2019-11-06  3:49                                 ` Kim, Jonathan
2019-11-05 11:34   ` [PATCH 1/2] drm/amdgpu: fix possible pstate switch race condition Xu, Feifei
2019-11-05 11:34     ` Xu, Feifei

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.