All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
@ 2021-07-12 15:30 Luben Tuikov
  2021-07-13  2:56 ` Lazar, Lijo
  2021-07-13  7:07 ` Quan, Evan
  0 siblings, 2 replies; 11+ messages in thread
From: Luben Tuikov @ 2021-07-12 15:30 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher, Luben Tuikov, Evan Quan

This fixes a bug which if we probe a non-existing
I2C device, and the SMU returns 0xFF, from then on
we can never communicate with the SMU, because the
code before this patch reads and interprets 0xFF
as a terminal error, and thus we never write 0
into register 90 to clear the status (and
subsequently send a new command to the SMU.)

It is not an error that the SMU returns status
0xFF. This means that the SMU executed the last
command successfully (execution status), but the
command result is an error of some sort (execution
result), depending on what the command was.

When doing a status check of the SMU, before we
send a new command, the only status which
precludes us from sending a new command is 0--the
SMU hasn't finished executing a previous command,
and 0xFC--the SMU is busy.

This bug was seen as the following line in the
kernel log,

amdgpu: Msg issuing pre-check failed(0xff) and SMU may be not in the right state!

when subsequent SMU commands, not necessarily
related to I2C, were sent to the SMU.

This patch fixes this bug.

Cc: Alex Deucher <Alexander.Deucher@amd.com>
Cc: Evan Quan <evan.quan@amd.com>
Fixes: fcb1fe9c9e0031 ("drm/amd/powerplay: pre-check the SMU state before issuing message")
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 196 +++++++++++++++++++------
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h |   3 +-
 2 files changed, 152 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index c902fdf322c1be..775eb50a2e49a6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -55,7 +55,7 @@
 
 #undef __SMU_DUMMY_MAP
 #define __SMU_DUMMY_MAP(type)	#type
-static const char* __smu_message_names[] = {
+static const char * const __smu_message_names[] = {
 	SMU_MESSAGE_TYPES
 };
 
@@ -76,46 +76,161 @@ static void smu_cmn_read_arg(struct smu_context *smu,
 	*arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
 }
 
-int smu_cmn_wait_for_response(struct smu_context *smu)
+/**
+ * __smu_cmn_poll_stat -- poll for a status from the SMU
+ * smu: a pointer to SMU context
+ *
+ * Returns the status of the SMU, which could be,
+ * 0, the SMU is busy with your previous command;
+ * 1,    execution status: success, execution result: success;
+ * 0xFF, execution status: success, execution result: failure;
+ * 0xFE, unknown command;
+ * 0xFD, valid command, but bad (command) prerequisites;
+ * 0xFC, the command was rejected as the SMU is busy;
+ * 0xFB, "SMC_Result_DebugDataDumpEnd".
+ */
+static u32 __smu_cmn_poll_stat(struct smu_context *smu)
 {
 	struct amdgpu_device *adev = smu->adev;
-	uint32_t cur_value, i, timeout = adev->usec_timeout * 20;
+	int timeout = adev->usec_timeout * 20;
+	u32 reg;
 
-	for (i = 0; i < timeout; i++) {
-		cur_value = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
-		if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
-			return cur_value;
+	for ( ; timeout > 0; timeout--) {
+		reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
+		if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
+			break;
 
 		udelay(1);
 	}
 
-	/* timeout means wrong logic */
-	if (i == timeout)
-		return -ETIME;
-
-	return RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
+	return reg;
 }
 
-int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
-				     uint16_t msg, uint32_t param)
+static void __smu_cmn_reg_print_error(struct smu_context *smu,
+				      u32 reg_c2pmsg_90,
+				      int msg_index,
+				      u32 param,
+				      enum smu_message_type msg)
 {
 	struct amdgpu_device *adev = smu->adev;
-	int ret;
+	const char *message = smu_get_message_name(smu, msg);
 
-	ret = smu_cmn_wait_for_response(smu);
-	if (ret != 0x1) {
-		dev_err(adev->dev, "Msg issuing pre-check failed(0x%x) and "
-		       "SMU may be not in the right state!\n", ret);
-		if (ret != -ETIME)
-			ret = -EIO;
-		return ret;
+	switch (reg_c2pmsg_90) {
+	case 0:
+		dev_err_ratelimited(adev->dev,
+				    "SMU: I'm not done with your previous command!");
+		break;
+	case 1:
+		/* The SMU executed the command. It completed with a
+		 * successful result.
+		 */
+		break;
+	case 0xFF:
+		/* The SMU executed the command. It completed with a
+		 * unsuccessful result.
+		 */
+		break;
+	case 0xFE:
+		dev_err_ratelimited(adev->dev,
+				    "SMU: unknown command: index:%d param:0x%08X message:%s",
+				    msg_index, param, message);
+		break;
+	case 0xFD:
+		dev_err_ratelimited(adev->dev,
+				    "SMU: valid command, bad prerequisites: index:%d param:0x%08X message:%s",
+				    msg_index, param, message);
+		break;
+	case 0xFC:
+		dev_err_ratelimited(adev->dev,
+				    "SMU: I'm very busy for your command: index:%d param:0x%08X message:%s",
+				    msg_index, param, message);
+		break;
+	case 0xFB:
+		dev_err_ratelimited(adev->dev,
+				    "SMU: I'm debugging!");
+		break;
+	default:
+		dev_err_ratelimited(adev->dev,
+				    "SMU: response:0x%08X for index:%d param:0x%08X message:%s?",
+				    reg_c2pmsg_90, msg_index, param, message);
+		break;
+	}
+}
+
+static int __smu_cmn_reg2errno(struct smu_context *smu, u32 reg_c2pmsg_90)
+{
+	int res;
+
+	switch (reg_c2pmsg_90) {
+	case 0:
+		res = -ETIME;
+		break;
+	case 1:
+		res = 0;
+		break;
+	case 0xFF:
+		res = -EIO;
+		break;
+	case 0xFE:
+		res = -EOPNOTSUPP;
+		break;
+	case 0xFD:
+		res = -EIO;
+		break;
+	case 0xFC:
+		res = -EBUSY;
+		break;
+	case 0xFB:
+		res = -EIO;
+		break;
+	default:
+		res = -EIO;
+		break;
 	}
 
+	return res;
+}
+
+static void __smu_cmn_send_msg(struct smu_context *smu,
+			       u16 msg,
+			       u32 param)
+{
+	struct amdgpu_device *adev = smu->adev;
+
 	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90, 0);
 	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82, param);
 	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
+}
 
-	return 0;
+int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
+				     uint16_t msg_index,
+				     uint32_t param)
+{
+	u32 reg;
+	int res;
+
+	if (smu->adev->in_pci_err_recovery)
+		return 0;
+
+	mutex_lock(&smu->message_lock);
+	reg = __smu_cmn_poll_stat(smu);
+	if (reg == 0 || reg == 0xFC) {
+		res = __smu_cmn_reg2errno(smu, reg);
+		goto Out;
+	}
+	__smu_cmn_send_msg(smu, msg_index, param);
+	res = 0;
+Out:
+	mutex_unlock(&smu->message_lock);
+	return res;
+}
+
+int smu_cmn_wait_for_response(struct smu_context *smu)
+{
+	u32 reg;
+
+	reg = __smu_cmn_poll_stat(smu);
+	return __smu_cmn_reg2errno(smu, reg);
 }
 
 int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
@@ -123,8 +238,8 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
 				    uint32_t param,
 				    uint32_t *read_arg)
 {
-	struct amdgpu_device *adev = smu->adev;
-	int ret = 0, index = 0;
+	int res, index;
+	u32 reg;
 
 	if (smu->adev->in_pci_err_recovery)
 		return 0;
@@ -136,31 +251,20 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
 		return index == -EACCES ? 0 : index;
 
 	mutex_lock(&smu->message_lock);
-	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, param);
-	if (ret)
-		goto out;
-
-	ret = smu_cmn_wait_for_response(smu);
-	if (ret != 0x1) {
-		if (ret == -ETIME) {
-			dev_err(adev->dev, "message: %15s (%d) \tparam: 0x%08x is timeout (no response)\n",
-				smu_get_message_name(smu, msg), index, param);
-		} else {
-			dev_err(adev->dev, "failed send message: %15s (%d) \tparam: 0x%08x response %#x\n",
-				smu_get_message_name(smu, msg), index, param,
-				ret);
-			ret = -EIO;
-		}
-		goto out;
+	reg = __smu_cmn_poll_stat(smu);
+	if (reg == 0 || reg == 0xFC) {
+		res = __smu_cmn_reg2errno(smu, reg);
+		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
+		goto Out;
 	}
-
+	__smu_cmn_send_msg(smu, (uint16_t) index, param);
+	reg = __smu_cmn_poll_stat(smu);
+	res = __smu_cmn_reg2errno(smu, reg);
 	if (read_arg)
 		smu_cmn_read_arg(smu, read_arg);
-
-	ret = 0; /* 0 as driver return value */
-out:
+Out:
 	mutex_unlock(&smu->message_lock);
-	return ret;
+	return res;
 }
 
 int smu_cmn_send_smc_msg(struct smu_context *smu,
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
index 9add5f16ff562a..16993daa2ae042 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
@@ -27,7 +27,8 @@
 
 #if defined(SWSMU_CODE_LAYER_L2) || defined(SWSMU_CODE_LAYER_L3) || defined(SWSMU_CODE_LAYER_L4)
 int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
-				     uint16_t msg, uint32_t param);
+				     uint16_t msg_index,
+				     uint32_t param);
 int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
 				    enum smu_message_type msg,
 				    uint32_t param,
-- 
2.32.0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
  2021-07-12 15:30 [PATCH] drm/amd/pm: Fix a bug communicating with the SMU Luben Tuikov
@ 2021-07-13  2:56 ` Lazar, Lijo
  2021-07-13 17:18   ` Luben Tuikov
  2021-07-14 15:19   ` Alex Deucher
  2021-07-13  7:07 ` Quan, Evan
  1 sibling, 2 replies; 11+ messages in thread
From: Lazar, Lijo @ 2021-07-13  2:56 UTC (permalink / raw)
  To: Luben Tuikov, amd-gfx; +Cc: Alex Deucher, Evan Quan



On 7/12/2021 9:00 PM, Luben Tuikov wrote:
> This fixes a bug which if we probe a non-existing
> I2C device, and the SMU returns 0xFF, from then on
> we can never communicate with the SMU, because the
> code before this patch reads and interprets 0xFF
> as a terminal error, and thus we never write 0
> into register 90 to clear the status (and
> subsequently send a new command to the SMU.)
> 
> It is not an error that the SMU returns status
> 0xFF. This means that the SMU executed the last
> command successfully (execution status), but the
> command result is an error of some sort (execution
> result), depending on what the command was.
> 
> When doing a status check of the SMU, before we
> send a new command, the only status which
> precludes us from sending a new command is 0--the
> SMU hasn't finished executing a previous command,
> and 0xFC--the SMU is busy.
> 
> This bug was seen as the following line in the
> kernel log,
> 
> amdgpu: Msg issuing pre-check failed(0xff) and SMU may be not in the right state!
> 
> when subsequent SMU commands, not necessarily
> related to I2C, were sent to the SMU.
> 
> This patch fixes this bug.
> 
> Cc: Alex Deucher <Alexander.Deucher@amd.com>
> Cc: Evan Quan <evan.quan@amd.com>
> Fixes: fcb1fe9c9e0031 ("drm/amd/powerplay: pre-check the SMU state before issuing message")
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 196 +++++++++++++++++++------
>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h |   3 +-
>   2 files changed, 152 insertions(+), 47 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index c902fdf322c1be..775eb50a2e49a6 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -55,7 +55,7 @@
>   
>   #undef __SMU_DUMMY_MAP
>   #define __SMU_DUMMY_MAP(type)	#type
> -static const char* __smu_message_names[] = {
> +static const char * const __smu_message_names[] = {
>   	SMU_MESSAGE_TYPES
>   };
>   
> @@ -76,46 +76,161 @@ static void smu_cmn_read_arg(struct smu_context *smu,
>   	*arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
>   }
>   
> -int smu_cmn_wait_for_response(struct smu_context *smu)
> +/**
> + * __smu_cmn_poll_stat -- poll for a status from the SMU
> + * smu: a pointer to SMU context
> + *
> + * Returns the status of the SMU, which could be,
> + * 0, the SMU is busy with your previous command;
> + * 1,    execution status: success, execution result: success;
> + * 0xFF, execution status: success, execution result: failure;
> + * 0xFE, unknown command;
> + * 0xFD, valid command, but bad (command) prerequisites;
> + * 0xFC, the command was rejected as the SMU is busy;
> + * 0xFB, "SMC_Result_DebugDataDumpEnd".
> + */

These are the response codes defined in header (0xFB is somehow missing)
// SMU Response Codes:
#define PPSMC_Result_OK                    0x1
#define PPSMC_Result_Failed                0xFF
#define PPSMC_Result_UnknownCmd            0xFE
#define PPSMC_Result_CmdRejectedPrereq     0xFD
#define PPSMC_Result_CmdRejectedBusy       0xFC

It's better to use #defines for these, usually we follow a convention 
like SMU_

Ex:
	#define SMU_RESP_RESULT_OK 0x1
	

> +static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>   {
>   	struct amdgpu_device *adev = smu->adev;
> -	uint32_t cur_value, i, timeout = adev->usec_timeout * 20;
> +	int timeout = adev->usec_timeout * 20;
> +	u32 reg;
>   
> -	for (i = 0; i < timeout; i++) {
> -		cur_value = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> -		if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
> -			return cur_value;
> +	for ( ; timeout > 0; timeout--) {
> +		reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> +		if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
> +			break;
>   
>   		udelay(1);
>   	}
>   
> -	/* timeout means wrong logic */
> -	if (i == timeout)
> -		return -ETIME;
> -
> -	return RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> +	return reg;
>   }
>   
> -int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> -				     uint16_t msg, uint32_t param)
> +static void __smu_cmn_reg_print_error(struct smu_context *smu,
> +				      u32 reg_c2pmsg_90,

Instead of using reg/regname in function, it would be better to name it 
as smu_cmn_resp/smu_resp or similar to make it clear that we are 
decoding smu response.

> +				      int msg_index,
> +				      u32 param,
> +				      enum smu_message_type msg)
>   {
>   	struct amdgpu_device *adev = smu->adev;
> -	int ret;
> +	const char *message = smu_get_message_name(smu, msg);
>   
> -	ret = smu_cmn_wait_for_response(smu);
> -	if (ret != 0x1) {
> -		dev_err(adev->dev, "Msg issuing pre-check failed(0x%x) and "
> -		       "SMU may be not in the right state!\n", ret);
> -		if (ret != -ETIME)
> -			ret = -EIO;
> -		return ret;
> +	switch (reg_c2pmsg_90) {
> +	case 0:
> +		dev_err_ratelimited(adev->dev,
> +				    "SMU: I'm not done with your previous command!");
> +		break;
> +	case 1:
> +		/* The SMU executed the command. It completed with a
> +		 * successful result.
> +		 */
> +		break;
> +	case 0xFF:
> +		/* The SMU executed the command. It completed with a
> +		 * unsuccessful result.
> +		 */
> +		break;
> +	case 0xFE:
> +		dev_err_ratelimited(adev->dev,
> +				    "SMU: unknown command: index:%d param:0x%08X message:%s",
> +				    msg_index, param, message);
> +		break;
> +	case 0xFD:
> +		dev_err_ratelimited(adev->dev,
> +				    "SMU: valid command, bad prerequisites: index:%d param:0x%08X message:%s",
> +				    msg_index, param, message);
> +		break;
> +	case 0xFC:
> +		dev_err_ratelimited(adev->dev,
> +				    "SMU: I'm very busy for your command: index:%d param:0x%08X message:%s",
> +				    msg_index, param, message);
> +		break;
> +	case 0xFB:
> +		dev_err_ratelimited(adev->dev,
> +				    "SMU: I'm debugging!");
> +		break;
> +	default:
> +		dev_err_ratelimited(adev->dev,
> +				    "SMU: response:0x%08X for index:%d param:0x%08X message:%s?",
> +				    reg_c2pmsg_90, msg_index, param, message);
> +		break;
> +	}
> +}
> +
> +static int __smu_cmn_reg2errno(struct smu_context *smu, u32 reg_c2pmsg_90)

Same comment on naming - resp2errno?
> +{
> +	int res;
> +
> +	switch (reg_c2pmsg_90) {
> +	case 0:
> +		res = -ETIME;
> +		break;
> +	case 1:
> +		res = 0;
> +		break;
> +	case 0xFF:
> +		res = -EIO;
> +		break;
> +	case 0xFE:
> +		res = -EOPNOTSUPP;
> +		break;
> +	case 0xFD:
> +		res = -EIO;
> +		break;
> +	case 0xFC:
> +		res = -EBUSY;
> +		break;
> +	case 0xFB:
> +		res = -EIO;
> +		break;
> +	default:
> +		res = -EIO;
> +		break;
>   	}
>   
> +	return res;
> +}
> +
> +static void __smu_cmn_send_msg(struct smu_context *smu,
> +			       u16 msg,
> +			       u32 param)
> +{
> +	struct amdgpu_device *adev = smu->adev;
> +
>   	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90, 0);
>   	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82, param);
>   	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
> +}
>   
> -	return 0;
> +int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> +				     uint16_t msg_index,
> +				     uint32_t param)
> +{
> +	u32 reg;
> +	int res;
> +
> +	if (smu->adev->in_pci_err_recovery)
> +		return 0;
> +
> +	mutex_lock(&smu->message_lock);
> +	reg = __smu_cmn_poll_stat(smu);
> +	if (reg == 0 || reg == 0xFC) {

The problem with 0xFC check is it could be the response of a previous 
message. It could mean that FW was busy when the prev message was sent, 
not now.

There is a default case (value not in any of the predefined error 
codes), that should be considered here also. That happens sometimes and 
usually that means FW is in undefined state.


> +		res = __smu_cmn_reg2errno(smu, reg);
> +		goto Out;

Label naming style, lower case?.

> +	}
> +	__smu_cmn_send_msg(smu, msg_index, param);
> +	res = 0;
> +Out:
> +	mutex_unlock(&smu->message_lock);
> +	return res;
> +}
> +
> +int smu_cmn_wait_for_response(struct smu_context *smu)
> +{
> +	u32 reg;
> +
> +	reg = __smu_cmn_poll_stat(smu);
> +	return __smu_cmn_reg2errno(smu, reg);
>   }
>   
>   int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> @@ -123,8 +238,8 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>   				    uint32_t param,
>   				    uint32_t *read_arg)
>   {
> -	struct amdgpu_device *adev = smu->adev;
> -	int ret = 0, index = 0;
> +	int res, index;
> +	u32 reg;
>   
>   	if (smu->adev->in_pci_err_recovery)
>   		return 0;
> @@ -136,31 +251,20 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>   		return index == -EACCES ? 0 : index;
>   
>   	mutex_lock(&smu->message_lock);
> -	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, param);
> -	if (ret)
> -		goto out;
> -
> -	ret = smu_cmn_wait_for_response(smu);
> -	if (ret != 0x1) {
> -		if (ret == -ETIME) {
> -			dev_err(adev->dev, "message: %15s (%d) \tparam: 0x%08x is timeout (no response)\n",
> -				smu_get_message_name(smu, msg), index, param);
> -		} else {
> -			dev_err(adev->dev, "failed send message: %15s (%d) \tparam: 0x%08x response %#x\n",
> -				smu_get_message_name(smu, msg), index, param,
> -				ret);
> -			ret = -EIO;
> -		}
> -		goto out;
> +	reg = __smu_cmn_poll_stat(smu);
> +	if (reg == 0 || reg == 0xFC) {

Same comments as for without_waiting case.

> +		res = __smu_cmn_reg2errno(smu, reg);
> +		__smu_cmn_reg_print_error(smu, reg, index, param, msg);

This precheck fail print is missing in without_waiting message.

> +		goto Out; >   	}
> -
> +	__smu_cmn_send_msg(smu, (uint16_t) index, param);
> +	reg = __smu_cmn_poll_stat(smu);
> +	res = __smu_cmn_reg2errno(smu, reg);

Using smu_cmn_wait_for_response instead of these two makes the intent 
clearer - that we are waiting for the response.

We need a print here as well if the message has failed.

Thanks,
Lijo

>   	if (read_arg)
>   		smu_cmn_read_arg(smu, read_arg);
> -
> -	ret = 0; /* 0 as driver return value */
> -out:
> +Out:
>   	mutex_unlock(&smu->message_lock);
> -	return ret;
> +	return res;
>   }
>   
>   int smu_cmn_send_smc_msg(struct smu_context *smu,
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> index 9add5f16ff562a..16993daa2ae042 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> @@ -27,7 +27,8 @@
>   
>   #if defined(SWSMU_CODE_LAYER_L2) || defined(SWSMU_CODE_LAYER_L3) || defined(SWSMU_CODE_LAYER_L4)
>   int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> -				     uint16_t msg, uint32_t param);
> +				     uint16_t msg_index,
> +				     uint32_t param);
>   int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>   				    enum smu_message_type msg,
>   				    uint32_t param,
> 
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
  2021-07-12 15:30 [PATCH] drm/amd/pm: Fix a bug communicating with the SMU Luben Tuikov
  2021-07-13  2:56 ` Lazar, Lijo
@ 2021-07-13  7:07 ` Quan, Evan
  2021-07-13 17:35   ` Luben Tuikov
  1 sibling, 1 reply; 11+ messages in thread
From: Quan, Evan @ 2021-07-13  7:07 UTC (permalink / raw)
  To: Tuikov, Luben, amd-gfx; +Cc: Deucher, Alexander

[AMD Official Use Only]



> -----Original Message-----
> From: Tuikov, Luben <Luben.Tuikov@amd.com>
> Sent: Monday, July 12, 2021 11:31 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Deucher, Alexander
> <Alexander.Deucher@amd.com>; Quan, Evan <Evan.Quan@amd.com>
> Subject: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
> 
> This fixes a bug which if we probe a non-existing
> I2C device, and the SMU returns 0xFF, 
[Quan, Evan] Do we have to probe I2C device via issuing commands to SMU and check existence via SMU response?
Is there other more friendly way?
>from then on
> we can never communicate with the SMU, because the
> code before this patch reads and interprets 0xFF
> as a terminal error, and thus we never write 0
> into register 90 to clear the status (and
> subsequently send a new command to the SMU.)
> 
> It is not an error that the SMU returns status
> 0xFF. This means that the SMU executed the last
> command successfully (execution status), but the
> command result is an error of some sort (execution
> result), depending on what the command was.
> 
> When doing a status check of the SMU, before we
> send a new command, the only status which
> precludes us from sending a new command is 0--the
> SMU hasn't finished executing a previous command,
> and 0xFC--the SMU is busy.
> 
> This bug was seen as the following line in the
> kernel log,
> 
> amdgpu: Msg issuing pre-check failed(0xff) and SMU may be not in the right
> state!
[Quan, Evan] This was designed to prevent failure scenario from ruin.
Via this, we can prevent those SMU command/response related registers overwritten. 
Then PMFW team can known which command invoked the first error.

BR
Evan
> 
> when subsequent SMU commands, not necessarily
> related to I2C, were sent to the SMU.
> 
> This patch fixes this bug.
> 
> Cc: Alex Deucher <Alexander.Deucher@amd.com>
> Cc: Evan Quan <evan.quan@amd.com>
> Fixes: fcb1fe9c9e0031 ("drm/amd/powerplay: pre-check the SMU state
> before issuing message")
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 196
> +++++++++++++++++++------
>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h |   3 +-
>  2 files changed, 152 insertions(+), 47 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index c902fdf322c1be..775eb50a2e49a6 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -55,7 +55,7 @@
> 
>  #undef __SMU_DUMMY_MAP
>  #define __SMU_DUMMY_MAP(type)	#type
> -static const char* __smu_message_names[] = {
> +static const char * const __smu_message_names[] = {
>  	SMU_MESSAGE_TYPES
>  };
> 
> @@ -76,46 +76,161 @@ static void smu_cmn_read_arg(struct smu_context
> *smu,
>  	*arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
>  }
> 
> -int smu_cmn_wait_for_response(struct smu_context *smu)
> +/**
> + * __smu_cmn_poll_stat -- poll for a status from the SMU
> + * smu: a pointer to SMU context
> + *
> + * Returns the status of the SMU, which could be,
> + * 0, the SMU is busy with your previous command;
> + * 1,    execution status: success, execution result: success;
> + * 0xFF, execution status: success, execution result: failure;
> + * 0xFE, unknown command;
> + * 0xFD, valid command, but bad (command) prerequisites;
> + * 0xFC, the command was rejected as the SMU is busy;
> + * 0xFB, "SMC_Result_DebugDataDumpEnd".
> + */
> +static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>  {
>  	struct amdgpu_device *adev = smu->adev;
> -	uint32_t cur_value, i, timeout = adev->usec_timeout * 20;
> +	int timeout = adev->usec_timeout * 20;
> +	u32 reg;
> 
> -	for (i = 0; i < timeout; i++) {
> -		cur_value = RREG32_SOC15(MP1, 0,
> mmMP1_SMN_C2PMSG_90);
> -		if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
> -			return cur_value;
> +	for ( ; timeout > 0; timeout--) {
> +		reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> +		if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
> +			break;
> 
>  		udelay(1);
>  	}
> 
> -	/* timeout means wrong logic */
> -	if (i == timeout)
> -		return -ETIME;
> -
> -	return RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> +	return reg;
>  }
> 
> -int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> -				     uint16_t msg, uint32_t param)
> +static void __smu_cmn_reg_print_error(struct smu_context *smu,
> +				      u32 reg_c2pmsg_90,
> +				      int msg_index,
> +				      u32 param,
> +				      enum smu_message_type msg)
>  {
>  	struct amdgpu_device *adev = smu->adev;
> -	int ret;
> +	const char *message = smu_get_message_name(smu, msg);
> 
> -	ret = smu_cmn_wait_for_response(smu);
> -	if (ret != 0x1) {
> -		dev_err(adev->dev, "Msg issuing pre-check failed(0x%x) and
> "
> -		       "SMU may be not in the right state!\n", ret);
> -		if (ret != -ETIME)
> -			ret = -EIO;
> -		return ret;
> +	switch (reg_c2pmsg_90) {
> +	case 0:
> +		dev_err_ratelimited(adev->dev,
> +				    "SMU: I'm not done with your previous
> command!");
> +		break;
> +	case 1:
> +		/* The SMU executed the command. It completed with a
> +		 * successful result.
> +		 */
> +		break;
> +	case 0xFF:
> +		/* The SMU executed the command. It completed with a
> +		 * unsuccessful result.
> +		 */
> +		break;
> +	case 0xFE:
> +		dev_err_ratelimited(adev->dev,
> +				    "SMU: unknown command: index:%d
> param:0x%08X message:%s",
> +				    msg_index, param, message);
> +		break;
> +	case 0xFD:
> +		dev_err_ratelimited(adev->dev,
> +				    "SMU: valid command, bad prerequisites:
> index:%d param:0x%08X message:%s",
> +				    msg_index, param, message);
> +		break;
> +	case 0xFC:
> +		dev_err_ratelimited(adev->dev,
> +				    "SMU: I'm very busy for your command:
> index:%d param:0x%08X message:%s",
> +				    msg_index, param, message);
> +		break;
> +	case 0xFB:
> +		dev_err_ratelimited(adev->dev,
> +				    "SMU: I'm debugging!");
> +		break;
> +	default:
> +		dev_err_ratelimited(adev->dev,
> +				    "SMU: response:0x%08X for index:%d
> param:0x%08X message:%s?",
> +				    reg_c2pmsg_90, msg_index, param,
> message);
> +		break;
> +	}
> +}
> +
> +static int __smu_cmn_reg2errno(struct smu_context *smu, u32
> reg_c2pmsg_90)
> +{
> +	int res;
> +
> +	switch (reg_c2pmsg_90) {
> +	case 0:
> +		res = -ETIME;
> +		break;
> +	case 1:
> +		res = 0;
> +		break;
> +	case 0xFF:
> +		res = -EIO;
> +		break;
> +	case 0xFE:
> +		res = -EOPNOTSUPP;
> +		break;
> +	case 0xFD:
> +		res = -EIO;
> +		break;
> +	case 0xFC:
> +		res = -EBUSY;
> +		break;
> +	case 0xFB:
> +		res = -EIO;
> +		break;
> +	default:
> +		res = -EIO;
> +		break;
>  	}
> 
> +	return res;
> +}
> +
> +static void __smu_cmn_send_msg(struct smu_context *smu,
> +			       u16 msg,
> +			       u32 param)
> +{
> +	struct amdgpu_device *adev = smu->adev;
> +
>  	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90, 0);
>  	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82, param);
>  	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
> +}
> 
> -	return 0;
> +int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> +				     uint16_t msg_index,
> +				     uint32_t param)
> +{
> +	u32 reg;
> +	int res;
> +
> +	if (smu->adev->in_pci_err_recovery)
> +		return 0;
> +
> +	mutex_lock(&smu->message_lock);
> +	reg = __smu_cmn_poll_stat(smu);
> +	if (reg == 0 || reg == 0xFC) {
> +		res = __smu_cmn_reg2errno(smu, reg);
> +		goto Out;
> +	}
> +	__smu_cmn_send_msg(smu, msg_index, param);
> +	res = 0;
> +Out:
> +	mutex_unlock(&smu->message_lock);
> +	return res;
> +}
> +
> +int smu_cmn_wait_for_response(struct smu_context *smu)
> +{
> +	u32 reg;
> +
> +	reg = __smu_cmn_poll_stat(smu);
> +	return __smu_cmn_reg2errno(smu, reg);
>  }
> 
>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> @@ -123,8 +238,8 @@ int smu_cmn_send_smc_msg_with_param(struct
> smu_context *smu,
>  				    uint32_t param,
>  				    uint32_t *read_arg)
>  {
> -	struct amdgpu_device *adev = smu->adev;
> -	int ret = 0, index = 0;
> +	int res, index;
> +	u32 reg;
> 
>  	if (smu->adev->in_pci_err_recovery)
>  		return 0;
> @@ -136,31 +251,20 @@ int smu_cmn_send_smc_msg_with_param(struct
> smu_context *smu,
>  		return index == -EACCES ? 0 : index;
> 
>  	mutex_lock(&smu->message_lock);
> -	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
> param);
> -	if (ret)
> -		goto out;
> -
> -	ret = smu_cmn_wait_for_response(smu);
> -	if (ret != 0x1) {
> -		if (ret == -ETIME) {
> -			dev_err(adev->dev, "message: %15s (%d) \tparam:
> 0x%08x is timeout (no response)\n",
> -				smu_get_message_name(smu, msg), index,
> param);
> -		} else {
> -			dev_err(adev->dev, "failed send message: %15s (%d)
> \tparam: 0x%08x response %#x\n",
> -				smu_get_message_name(smu, msg), index,
> param,
> -				ret);
> -			ret = -EIO;
> -		}
> -		goto out;
> +	reg = __smu_cmn_poll_stat(smu);
> +	if (reg == 0 || reg == 0xFC) {
> +		res = __smu_cmn_reg2errno(smu, reg);
> +		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
> +		goto Out;
>  	}
> -
> +	__smu_cmn_send_msg(smu, (uint16_t) index, param);
> +	reg = __smu_cmn_poll_stat(smu);
> +	res = __smu_cmn_reg2errno(smu, reg);
>  	if (read_arg)
>  		smu_cmn_read_arg(smu, read_arg);
> -
> -	ret = 0; /* 0 as driver return value */
> -out:
> +Out:
>  	mutex_unlock(&smu->message_lock);
> -	return ret;
> +	return res;
>  }
> 
>  int smu_cmn_send_smc_msg(struct smu_context *smu,
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> index 9add5f16ff562a..16993daa2ae042 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> @@ -27,7 +27,8 @@
> 
>  #if defined(SWSMU_CODE_LAYER_L2) || defined(SWSMU_CODE_LAYER_L3)
> || defined(SWSMU_CODE_LAYER_L4)
>  int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> -				     uint16_t msg, uint32_t param);
> +				     uint16_t msg_index,
> +				     uint32_t param);
>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>  				    enum smu_message_type msg,
>  				    uint32_t param,
> --
> 2.32.0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
  2021-07-13  2:56 ` Lazar, Lijo
@ 2021-07-13 17:18   ` Luben Tuikov
  2021-07-14 15:19   ` Alex Deucher
  1 sibling, 0 replies; 11+ messages in thread
From: Luben Tuikov @ 2021-07-13 17:18 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx
  Cc: Alex Deucher, Evan Quan, aaron.baluja, Alimucaj, Andi

On 2021-07-12 10:56 p.m., Lazar, Lijo wrote:
>
> On 7/12/2021 9:00 PM, Luben Tuikov wrote:
>> This fixes a bug which if we probe a non-existing
>> I2C device, and the SMU returns 0xFF, from then on
>> we can never communicate with the SMU, because the
>> code before this patch reads and interprets 0xFF
>> as a terminal error, and thus we never write 0
>> into register 90 to clear the status (and
>> subsequently send a new command to the SMU.)
>>
>> It is not an error that the SMU returns status
>> 0xFF. This means that the SMU executed the last
>> command successfully (execution status), but the
>> command result is an error of some sort (execution
>> result), depending on what the command was.
>>
>> When doing a status check of the SMU, before we
>> send a new command, the only status which
>> precludes us from sending a new command is 0--the
>> SMU hasn't finished executing a previous command,
>> and 0xFC--the SMU is busy.
>>
>> This bug was seen as the following line in the
>> kernel log,
>>
>> amdgpu: Msg issuing pre-check failed(0xff) and SMU may be not in the right state!
>>
>> when subsequent SMU commands, not necessarily
>> related to I2C, were sent to the SMU.
>>
>> This patch fixes this bug.
>>
>> Cc: Alex Deucher <Alexander.Deucher@amd.com>
>> Cc: Evan Quan <evan.quan@amd.com>
>> Fixes: fcb1fe9c9e0031 ("drm/amd/powerplay: pre-check the SMU state before issuing message")
>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>> ---
>>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 196 +++++++++++++++++++------
>>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h |   3 +-
>>   2 files changed, 152 insertions(+), 47 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> index c902fdf322c1be..775eb50a2e49a6 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> @@ -55,7 +55,7 @@
>>   
>>   #undef __SMU_DUMMY_MAP
>>   #define __SMU_DUMMY_MAP(type)	#type
>> -static const char* __smu_message_names[] = {
>> +static const char * const __smu_message_names[] = {
>>   	SMU_MESSAGE_TYPES
>>   };
>>   
>> @@ -76,46 +76,161 @@ static void smu_cmn_read_arg(struct smu_context *smu,
>>   	*arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
>>   }
>>   
>> -int smu_cmn_wait_for_response(struct smu_context *smu)
>> +/**
>> + * __smu_cmn_poll_stat -- poll for a status from the SMU
>> + * smu: a pointer to SMU context
>> + *
>> + * Returns the status of the SMU, which could be,
>> + * 0, the SMU is busy with your previous command;
>> + * 1,    execution status: success, execution result: success;
>> + * 0xFF, execution status: success, execution result: failure;
>> + * 0xFE, unknown command;
>> + * 0xFD, valid command, but bad (command) prerequisites;
>> + * 0xFC, the command was rejected as the SMU is busy;
>> + * 0xFB, "SMC_Result_DebugDataDumpEnd".
>> + */
> These are the response codes defined in header (0xFB is somehow missing)
> // SMU Response Codes:
> #define PPSMC_Result_OK                    0x1
> #define PPSMC_Result_Failed                0xFF
> #define PPSMC_Result_UnknownCmd            0xFE
> #define PPSMC_Result_CmdRejectedPrereq     0xFD
> #define PPSMC_Result_CmdRejectedBusy       0xFC
>
> It's better to use #defines for these, usually we follow a convention 
> like SMU_
>
> Ex:
> 	#define SMU_RESP_RESULT_OK 0x1

I did not define these as macros (they're called "macros" not "defines"), _on_purpose_,
because they are already defined by the SMU, but we're not using header files which define them,
and for this reason I don't want to define them YET AGAIN. So, no, no macros for these constants,
unless we include a header file which is also used by the SMU team.

Furthermore, I can see that these constants are defined for each ASIC, yet we're a single
driver for all ASICs and I don't see how this can be reconciled, unless the SMU team moves
to a single header file defining those for all ASICs.

Thus, no macros.

> 	
>
>> +static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>>   {
>>   	struct amdgpu_device *adev = smu->adev;
>> -	uint32_t cur_value, i, timeout = adev->usec_timeout * 20;
>> +	int timeout = adev->usec_timeout * 20;
>> +	u32 reg;
>>   
>> -	for (i = 0; i < timeout; i++) {
>> -		cur_value = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>> -		if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>> -			return cur_value;
>> +	for ( ; timeout > 0; timeout--) {
>> +		reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>> +		if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>> +			break;
>>   
>>   		udelay(1);
>>   	}
>>   
>> -	/* timeout means wrong logic */
>> -	if (i == timeout)
>> -		return -ETIME;
>> -
>> -	return RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>> +	return reg;
>>   }
>>   
>> -int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>> -				     uint16_t msg, uint32_t param)
>> +static void __smu_cmn_reg_print_error(struct smu_context *smu,
>> +				      u32 reg_c2pmsg_90,
> Instead of using reg/regname in function, it would be better to name it 
> as smu_cmn_resp/smu_resp or similar to make it clear that we are 
> decoding smu response.

No. And here's the reason: I had about two hour meeting with the leads of the SMU team,
and it was clear that there is a distinction between a command execution status,
and command execution result. The former describes whether the SMU executed the command
or not, and the latter describes the result of the action executed by the command.

Unfortunately, it is easy to confuse those as was done in buggy commit fcb1fe9c9e0031, prior
to which it worked fine (!).

So, I don't want to give a stamp to register 90 as a "response". We may change this to be
"status" instead and response to be returned in the argument register, unless the function
returns an argument in which case 90 will be 0, or something like that--we've not fleshed it out yet.

In any case, I don't want to label 90 as "response" and this is why the name of this function
says EXACTLY what it is doing and NO MORE: decode the code in register 90, that's it.

>
>> +				      int msg_index,
>> +				      u32 param,
>> +				      enum smu_message_type msg)
>>   {
>>   	struct amdgpu_device *adev = smu->adev;
>> -	int ret;
>> +	const char *message = smu_get_message_name(smu, msg);
>>   
>> -	ret = smu_cmn_wait_for_response(smu);
>> -	if (ret != 0x1) {
>> -		dev_err(adev->dev, "Msg issuing pre-check failed(0x%x) and "
>> -		       "SMU may be not in the right state!\n", ret);
>> -		if (ret != -ETIME)
>> -			ret = -EIO;
>> -		return ret;
>> +	switch (reg_c2pmsg_90) {
>> +	case 0:
>> +		dev_err_ratelimited(adev->dev,
>> +				    "SMU: I'm not done with your previous command!");
>> +		break;
>> +	case 1:
>> +		/* The SMU executed the command. It completed with a
>> +		 * successful result.
>> +		 */
>> +		break;
>> +	case 0xFF:
>> +		/* The SMU executed the command. It completed with a
>> +		 * unsuccessful result.
>> +		 */
>> +		break;
>> +	case 0xFE:
>> +		dev_err_ratelimited(adev->dev,
>> +				    "SMU: unknown command: index:%d param:0x%08X message:%s",
>> +				    msg_index, param, message);
>> +		break;
>> +	case 0xFD:
>> +		dev_err_ratelimited(adev->dev,
>> +				    "SMU: valid command, bad prerequisites: index:%d param:0x%08X message:%s",
>> +				    msg_index, param, message);
>> +		break;
>> +	case 0xFC:
>> +		dev_err_ratelimited(adev->dev,
>> +				    "SMU: I'm very busy for your command: index:%d param:0x%08X message:%s",
>> +				    msg_index, param, message);
>> +		break;
>> +	case 0xFB:
>> +		dev_err_ratelimited(adev->dev,
>> +				    "SMU: I'm debugging!");
>> +		break;
>> +	default:
>> +		dev_err_ratelimited(adev->dev,
>> +				    "SMU: response:0x%08X for index:%d param:0x%08X message:%s?",
>> +				    reg_c2pmsg_90, msg_index, param, message);
>> +		break;
>> +	}
>> +}
>> +
>> +static int __smu_cmn_reg2errno(struct smu_context *smu, u32 reg_c2pmsg_90)
> Same comment on naming - resp2errno?

Nope! See above.
No renaming 90 to response, no overloading meaning. See my comment above.


>> +{
>> +	int res;
>> +
>> +	switch (reg_c2pmsg_90) {
>> +	case 0:
>> +		res = -ETIME;
>> +		break;
>> +	case 1:
>> +		res = 0;
>> +		break;
>> +	case 0xFF:
>> +		res = -EIO;
>> +		break;
>> +	case 0xFE:
>> +		res = -EOPNOTSUPP;
>> +		break;
>> +	case 0xFD:
>> +		res = -EIO;
>> +		break;
>> +	case 0xFC:
>> +		res = -EBUSY;
>> +		break;
>> +	case 0xFB:
>> +		res = -EIO;
>> +		break;
>> +	default:
>> +		res = -EIO;
>> +		break;
>>   	}
>>   
>> +	return res;
>> +}
>> +
>> +static void __smu_cmn_send_msg(struct smu_context *smu,
>> +			       u16 msg,
>> +			       u32 param)
>> +{
>> +	struct amdgpu_device *adev = smu->adev;
>> +
>>   	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90, 0);
>>   	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82, param);
>>   	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
>> +}
>>   
>> -	return 0;
>> +int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>> +				     uint16_t msg_index,
>> +				     uint32_t param)
>> +{
>> +	u32 reg;
>> +	int res;
>> +
>> +	if (smu->adev->in_pci_err_recovery)
>> +		return 0;
>> +
>> +	mutex_lock(&smu->message_lock);
>> +	reg = __smu_cmn_poll_stat(smu);
>> +	if (reg == 0 || reg == 0xFC) {
> The problem with 0xFC check is it could be the response of a previous 
> message. It could mean that FW was busy when the prev message was sent, 
> not now.

It was explained to me by the SMU lead, that these codes mean that the SMU FW is busy NOW.

>
> There is a default case (value not in any of the predefined error 
> codes), that should be considered here also. That happens sometimes and 
> usually that means FW is in undefined state.

Is this "default value" used to be returned (exposed) to the driver by the SMU FW?
Will the driver actually see this value in 90 returned by the FW?
What is this value?

>
>
>> +		res = __smu_cmn_reg2errno(smu, reg);
>> +		goto Out;
> Label naming style, lower case?.

Absolutely NOT!
It's a paragraph label, not a variable.

>
>> +	}
>> +	__smu_cmn_send_msg(smu, msg_index, param);
>> +	res = 0;
>> +Out:
>> +	mutex_unlock(&smu->message_lock);
>> +	return res;
>> +}
>> +
>> +int smu_cmn_wait_for_response(struct smu_context *smu)
>> +{
>> +	u32 reg;
>> +
>> +	reg = __smu_cmn_poll_stat(smu);
>> +	return __smu_cmn_reg2errno(smu, reg);
>>   }
>>   
>>   int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>> @@ -123,8 +238,8 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>>   				    uint32_t param,
>>   				    uint32_t *read_arg)
>>   {
>> -	struct amdgpu_device *adev = smu->adev;
>> -	int ret = 0, index = 0;
>> +	int res, index;
>> +	u32 reg;
>>   
>>   	if (smu->adev->in_pci_err_recovery)
>>   		return 0;
>> @@ -136,31 +251,20 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>>   		return index == -EACCES ? 0 : index;
>>   
>>   	mutex_lock(&smu->message_lock);
>> -	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, param);
>> -	if (ret)
>> -		goto out;
>> -
>> -	ret = smu_cmn_wait_for_response(smu);
>> -	if (ret != 0x1) {
>> -		if (ret == -ETIME) {
>> -			dev_err(adev->dev, "message: %15s (%d) \tparam: 0x%08x is timeout (no response)\n",
>> -				smu_get_message_name(smu, msg), index, param);
>> -		} else {
>> -			dev_err(adev->dev, "failed send message: %15s (%d) \tparam: 0x%08x response %#x\n",
>> -				smu_get_message_name(smu, msg), index, param,
>> -				ret);
>> -			ret = -EIO;
>> -		}
>> -		goto out;
>> +	reg = __smu_cmn_poll_stat(smu);
>> +	if (reg == 0 || reg == 0xFC) {
> Same comments as for without_waiting case.

Same response.

>
>> +		res = __smu_cmn_reg2errno(smu, reg);
>> +		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
> This precheck fail print is missing in without_waiting message.

And that is ON PURPOSE, if you study how "without_waiting" is used.
It's ON PURPOSE.

>
>> +		goto Out; >   	}
>> -
>> +	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>> +	reg = __smu_cmn_poll_stat(smu);
>> +	res = __smu_cmn_reg2errno(smu, reg);
> Using smu_cmn_wait_for_response instead of these two makes the intent 
> clearer - that we are waiting for the response.
>
> We need a print here as well if the message has failed.

Nope--that'll be the naive thing to do. You don't want a print here. It was done this way on purpose.
You're merely sending something to the SMU, maybe a reset.....................
You want to know if it was sent. If it is not a reset, the SMU is allowed to take it's time to complete it,
and also the same for a reset. So what if the SMU takes one 1us over YOUR timeout period? But
if the SMU hung, you'll see this on the NEXT command submission to the SMU.

Regards,
Luben

>
> Thanks,
> Lijo
>
>>   	if (read_arg)
>>   		smu_cmn_read_arg(smu, read_arg);
>> -
>> -	ret = 0; /* 0 as driver return value */
>> -out:
>> +Out:
>>   	mutex_unlock(&smu->message_lock);
>> -	return ret;
>> +	return res;
>>   }
>>   
>>   int smu_cmn_send_smc_msg(struct smu_context *smu,
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>> index 9add5f16ff562a..16993daa2ae042 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>> @@ -27,7 +27,8 @@
>>   
>>   #if defined(SWSMU_CODE_LAYER_L2) || defined(SWSMU_CODE_LAYER_L3) || defined(SWSMU_CODE_LAYER_L4)
>>   int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>> -				     uint16_t msg, uint32_t param);
>> +				     uint16_t msg_index,
>> +				     uint32_t param);
>>   int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>>   				    enum smu_message_type msg,
>>   				    uint32_t param,
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
  2021-07-13  7:07 ` Quan, Evan
@ 2021-07-13 17:35   ` Luben Tuikov
  2021-07-14  2:01     ` Quan, Evan
  0 siblings, 1 reply; 11+ messages in thread
From: Luben Tuikov @ 2021-07-13 17:35 UTC (permalink / raw)
  To: Quan, Evan, amd-gfx; +Cc: Deucher, Alexander, aaron.baluja, Alimucaj, Andi

On 2021-07-13 3:07 a.m., Quan, Evan wrote:
> [AMD Official Use Only]
>
>
>
>> -----Original Message-----
>> From: Tuikov, Luben <Luben.Tuikov@amd.com>
>> Sent: Monday, July 12, 2021 11:31 PM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Deucher, Alexander
>> <Alexander.Deucher@amd.com>; Quan, Evan <Evan.Quan@amd.com>
>> Subject: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
>>
>> This fixes a bug which if we probe a non-existing
>> I2C device, and the SMU returns 0xFF, 
> [Quan, Evan] Do we have to probe I2C device via issuing commands to SMU and check existence via SMU response?

Yes, yes we do.

> Is there other more friendly way?

No, there isn't.

> >from then on
>> we can never communicate with the SMU, because the
>> code before this patch reads and interprets 0xFF
>> as a terminal error, and thus we never write 0
>> into register 90 to clear the status (and
>> subsequently send a new command to the SMU.)
>>
>> It is not an error that the SMU returns status
>> 0xFF. This means that the SMU executed the last
>> command successfully (execution status), but the
>> command result is an error of some sort (execution
>> result), depending on what the command was.
>>
>> When doing a status check of the SMU, before we
>> send a new command, the only status which
>> precludes us from sending a new command is 0--the
>> SMU hasn't finished executing a previous command,
>> and 0xFC--the SMU is busy.
>>
>> This bug was seen as the following line in the
>> kernel log,
>>
>> amdgpu: Msg issuing pre-check failed(0xff) and SMU may be not in the right
>> state!
> [Quan, Evan] This was designed to prevent failure scenario from ruin.
> Via this, we can prevent those SMU command/response related registers overwritten. 
> Then PMFW team can known which command invoked the first error.

Sorry, this is not correct.

The interface cannot block valid access to the SMU firmware, just because a command executed successfully,
but with a failed result, i.e. set a clock frequency to such-and-such frequency was executed successfully by the SMU,
but the frequency wasn't able to be set as required--the SMU IS NOT BLOCKED, but can execute more commands.

If the PMFW team wants to know which command invoked "the first error", they can check this explicitly,
they can call smu_cmn_wait_for_response(), just like the reset code does--see the reset code.

The way commit fcb1fe9c9e0031 modified the code, it blocks access to the SMU for various other users
of the SMU, not least of which is the I2C.

Regards,
Luben

>
> BR
> Evan
>> when subsequent SMU commands, not necessarily
>> related to I2C, were sent to the SMU.
>>
>> This patch fixes this bug.
>>
>> Cc: Alex Deucher <Alexander.Deucher@amd.com>
>> Cc: Evan Quan <evan.quan@amd.com>
>> Fixes: fcb1fe9c9e0031 ("drm/amd/powerplay: pre-check the SMU state
>> before issuing message")
>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>> ---
>>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 196
>> +++++++++++++++++++------
>>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h |   3 +-
>>  2 files changed, 152 insertions(+), 47 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> index c902fdf322c1be..775eb50a2e49a6 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> @@ -55,7 +55,7 @@
>>
>>  #undef __SMU_DUMMY_MAP
>>  #define __SMU_DUMMY_MAP(type)	#type
>> -static const char* __smu_message_names[] = {
>> +static const char * const __smu_message_names[] = {
>>  	SMU_MESSAGE_TYPES
>>  };
>>
>> @@ -76,46 +76,161 @@ static void smu_cmn_read_arg(struct smu_context
>> *smu,
>>  	*arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
>>  }
>>
>> -int smu_cmn_wait_for_response(struct smu_context *smu)
>> +/**
>> + * __smu_cmn_poll_stat -- poll for a status from the SMU
>> + * smu: a pointer to SMU context
>> + *
>> + * Returns the status of the SMU, which could be,
>> + * 0, the SMU is busy with your previous command;
>> + * 1,    execution status: success, execution result: success;
>> + * 0xFF, execution status: success, execution result: failure;
>> + * 0xFE, unknown command;
>> + * 0xFD, valid command, but bad (command) prerequisites;
>> + * 0xFC, the command was rejected as the SMU is busy;
>> + * 0xFB, "SMC_Result_DebugDataDumpEnd".
>> + */
>> +static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>>  {
>>  	struct amdgpu_device *adev = smu->adev;
>> -	uint32_t cur_value, i, timeout = adev->usec_timeout * 20;
>> +	int timeout = adev->usec_timeout * 20;
>> +	u32 reg;
>>
>> -	for (i = 0; i < timeout; i++) {
>> -		cur_value = RREG32_SOC15(MP1, 0,
>> mmMP1_SMN_C2PMSG_90);
>> -		if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>> -			return cur_value;
>> +	for ( ; timeout > 0; timeout--) {
>> +		reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>> +		if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>> +			break;
>>
>>  		udelay(1);
>>  	}
>>
>> -	/* timeout means wrong logic */
>> -	if (i == timeout)
>> -		return -ETIME;
>> -
>> -	return RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>> +	return reg;
>>  }
>>
>> -int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>> -				     uint16_t msg, uint32_t param)
>> +static void __smu_cmn_reg_print_error(struct smu_context *smu,
>> +				      u32 reg_c2pmsg_90,
>> +				      int msg_index,
>> +				      u32 param,
>> +				      enum smu_message_type msg)
>>  {
>>  	struct amdgpu_device *adev = smu->adev;
>> -	int ret;
>> +	const char *message = smu_get_message_name(smu, msg);
>>
>> -	ret = smu_cmn_wait_for_response(smu);
>> -	if (ret != 0x1) {
>> -		dev_err(adev->dev, "Msg issuing pre-check failed(0x%x) and
>> "
>> -		       "SMU may be not in the right state!\n", ret);
>> -		if (ret != -ETIME)
>> -			ret = -EIO;
>> -		return ret;
>> +	switch (reg_c2pmsg_90) {
>> +	case 0:
>> +		dev_err_ratelimited(adev->dev,
>> +				    "SMU: I'm not done with your previous
>> command!");
>> +		break;
>> +	case 1:
>> +		/* The SMU executed the command. It completed with a
>> +		 * successful result.
>> +		 */
>> +		break;
>> +	case 0xFF:
>> +		/* The SMU executed the command. It completed with a
>> +		 * unsuccessful result.
>> +		 */
>> +		break;
>> +	case 0xFE:
>> +		dev_err_ratelimited(adev->dev,
>> +				    "SMU: unknown command: index:%d
>> param:0x%08X message:%s",
>> +				    msg_index, param, message);
>> +		break;
>> +	case 0xFD:
>> +		dev_err_ratelimited(adev->dev,
>> +				    "SMU: valid command, bad prerequisites:
>> index:%d param:0x%08X message:%s",
>> +				    msg_index, param, message);
>> +		break;
>> +	case 0xFC:
>> +		dev_err_ratelimited(adev->dev,
>> +				    "SMU: I'm very busy for your command:
>> index:%d param:0x%08X message:%s",
>> +				    msg_index, param, message);
>> +		break;
>> +	case 0xFB:
>> +		dev_err_ratelimited(adev->dev,
>> +				    "SMU: I'm debugging!");
>> +		break;
>> +	default:
>> +		dev_err_ratelimited(adev->dev,
>> +				    "SMU: response:0x%08X for index:%d
>> param:0x%08X message:%s?",
>> +				    reg_c2pmsg_90, msg_index, param,
>> message);
>> +		break;
>> +	}
>> +}
>> +
>> +static int __smu_cmn_reg2errno(struct smu_context *smu, u32
>> reg_c2pmsg_90)
>> +{
>> +	int res;
>> +
>> +	switch (reg_c2pmsg_90) {
>> +	case 0:
>> +		res = -ETIME;
>> +		break;
>> +	case 1:
>> +		res = 0;
>> +		break;
>> +	case 0xFF:
>> +		res = -EIO;
>> +		break;
>> +	case 0xFE:
>> +		res = -EOPNOTSUPP;
>> +		break;
>> +	case 0xFD:
>> +		res = -EIO;
>> +		break;
>> +	case 0xFC:
>> +		res = -EBUSY;
>> +		break;
>> +	case 0xFB:
>> +		res = -EIO;
>> +		break;
>> +	default:
>> +		res = -EIO;
>> +		break;
>>  	}
>>
>> +	return res;
>> +}
>> +
>> +static void __smu_cmn_send_msg(struct smu_context *smu,
>> +			       u16 msg,
>> +			       u32 param)
>> +{
>> +	struct amdgpu_device *adev = smu->adev;
>> +
>>  	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90, 0);
>>  	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82, param);
>>  	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
>> +}
>>
>> -	return 0;
>> +int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>> +				     uint16_t msg_index,
>> +				     uint32_t param)
>> +{
>> +	u32 reg;
>> +	int res;
>> +
>> +	if (smu->adev->in_pci_err_recovery)
>> +		return 0;
>> +
>> +	mutex_lock(&smu->message_lock);
>> +	reg = __smu_cmn_poll_stat(smu);
>> +	if (reg == 0 || reg == 0xFC) {
>> +		res = __smu_cmn_reg2errno(smu, reg);
>> +		goto Out;
>> +	}
>> +	__smu_cmn_send_msg(smu, msg_index, param);
>> +	res = 0;
>> +Out:
>> +	mutex_unlock(&smu->message_lock);
>> +	return res;
>> +}
>> +
>> +int smu_cmn_wait_for_response(struct smu_context *smu)
>> +{
>> +	u32 reg;
>> +
>> +	reg = __smu_cmn_poll_stat(smu);
>> +	return __smu_cmn_reg2errno(smu, reg);
>>  }
>>
>>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>> @@ -123,8 +238,8 @@ int smu_cmn_send_smc_msg_with_param(struct
>> smu_context *smu,
>>  				    uint32_t param,
>>  				    uint32_t *read_arg)
>>  {
>> -	struct amdgpu_device *adev = smu->adev;
>> -	int ret = 0, index = 0;
>> +	int res, index;
>> +	u32 reg;
>>
>>  	if (smu->adev->in_pci_err_recovery)
>>  		return 0;
>> @@ -136,31 +251,20 @@ int smu_cmn_send_smc_msg_with_param(struct
>> smu_context *smu,
>>  		return index == -EACCES ? 0 : index;
>>
>>  	mutex_lock(&smu->message_lock);
>> -	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
>> param);
>> -	if (ret)
>> -		goto out;
>> -
>> -	ret = smu_cmn_wait_for_response(smu);
>> -	if (ret != 0x1) {
>> -		if (ret == -ETIME) {
>> -			dev_err(adev->dev, "message: %15s (%d) \tparam:
>> 0x%08x is timeout (no response)\n",
>> -				smu_get_message_name(smu, msg), index,
>> param);
>> -		} else {
>> -			dev_err(adev->dev, "failed send message: %15s (%d)
>> \tparam: 0x%08x response %#x\n",
>> -				smu_get_message_name(smu, msg), index,
>> param,
>> -				ret);
>> -			ret = -EIO;
>> -		}
>> -		goto out;
>> +	reg = __smu_cmn_poll_stat(smu);
>> +	if (reg == 0 || reg == 0xFC) {
>> +		res = __smu_cmn_reg2errno(smu, reg);
>> +		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>> +		goto Out;
>>  	}
>> -
>> +	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>> +	reg = __smu_cmn_poll_stat(smu);
>> +	res = __smu_cmn_reg2errno(smu, reg);
>>  	if (read_arg)
>>  		smu_cmn_read_arg(smu, read_arg);
>> -
>> -	ret = 0; /* 0 as driver return value */
>> -out:
>> +Out:
>>  	mutex_unlock(&smu->message_lock);
>> -	return ret;
>> +	return res;
>>  }
>>
>>  int smu_cmn_send_smc_msg(struct smu_context *smu,
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>> index 9add5f16ff562a..16993daa2ae042 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>> @@ -27,7 +27,8 @@
>>
>>  #if defined(SWSMU_CODE_LAYER_L2) || defined(SWSMU_CODE_LAYER_L3)
>> || defined(SWSMU_CODE_LAYER_L4)
>>  int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>> -				     uint16_t msg, uint32_t param);
>> +				     uint16_t msg_index,
>> +				     uint32_t param);
>>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>>  				    enum smu_message_type msg,
>>  				    uint32_t param,
>> --
>> 2.32.0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
  2021-07-13 17:35   ` Luben Tuikov
@ 2021-07-14  2:01     ` Quan, Evan
  2021-07-14  4:04       ` Luben Tuikov
  2021-07-14 15:22       ` Alex Deucher
  0 siblings, 2 replies; 11+ messages in thread
From: Quan, Evan @ 2021-07-14  2:01 UTC (permalink / raw)
  To: Tuikov, Luben, amd-gfx; +Cc: Deucher, Alexander, Baluja, Aaron, Alimucaj, Andi

[AMD Official Use Only]



> -----Original Message-----
> From: Tuikov, Luben <Luben.Tuikov@amd.com>
> Sent: Wednesday, July 14, 2021 1:36 AM
> To: Quan, Evan <Evan.Quan@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Alimucaj, Andi
> <Anduil.Alimucaj@amd.com>; Baluja, Aaron <Aaron.Baluja@amd.com>
> Subject: Re: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
> 
> On 2021-07-13 3:07 a.m., Quan, Evan wrote:
> > [AMD Official Use Only]
> >
> >
> >
> >> -----Original Message-----
> >> From: Tuikov, Luben <Luben.Tuikov@amd.com>
> >> Sent: Monday, July 12, 2021 11:31 PM
> >> To: amd-gfx@lists.freedesktop.org
> >> Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Deucher, Alexander
> >> <Alexander.Deucher@amd.com>; Quan, Evan <Evan.Quan@amd.com>
> >> Subject: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
> >>
> >> This fixes a bug which if we probe a non-existing I2C device, and the
> >> SMU returns 0xFF,
> > [Quan, Evan] Do we have to probe I2C device via issuing commands to SMU
> and check existence via SMU response?
> 
> Yes, yes we do.
> 
> > Is there other more friendly way?
> 
> No, there isn't.
> 
> > >from then on
> >> we can never communicate with the SMU, because the code before this
> >> patch reads and interprets 0xFF as a terminal error, and thus we
> >> never write 0 into register 90 to clear the status (and subsequently
> >> send a new command to the SMU.)
> >>
> >> It is not an error that the SMU returns status 0xFF. This means that
> >> the SMU executed the last command successfully (execution status),
> >> but the command result is an error of some sort (execution result),
> >> depending on what the command was.
> >>
> >> When doing a status check of the SMU, before we send a new command,
> >> the only status which precludes us from sending a new command is
> >> 0--the SMU hasn't finished executing a previous command, and
> >> 0xFC--the SMU is busy.
> >>
> >> This bug was seen as the following line in the kernel log,
> >>
> >> amdgpu: Msg issuing pre-check failed(0xff) and SMU may be not in the
> >> right state!
> > [Quan, Evan] This was designed to prevent failure scenario from ruin.
> > Via this, we can prevent those SMU command/response related registers
> overwritten.
> > Then PMFW team can known which command invoked the first error.
> 
> Sorry, this is not correct.
> 
> The interface cannot block valid access to the SMU firmware, just because a
> command executed successfully, but with a failed result, i.e. set a clock
> frequency to such-and-such frequency was executed successfully by the
> SMU, but the frequency wasn't able to be set as required--the SMU IS NOT
> BLOCKED, but can execute more commands.
[Quan, Evan] 
1. First of all, if the response from SMU is not 1, it means SMU must detected something wrong.
We(driver) should properly handle that. I do not think it's a good idea to silently ignore that and proceed more commands.
2. Please be noticed that many commands(SMU messages) have dependence with each other. It means even if the further command
can be executed "successfully", it may be not executed in the expected way.
> 
> If the PMFW team wants to know which command invoked "the first error",
> they can check this explicitly, they can call smu_cmn_wait_for_response(),
> just like the reset code does--see the reset code.
[Quan, Evan] Per my knowledge gained during co-work with PMFW team, they expect no further driver-smu interaction(driver stops issuing command)
 when something went wrong. As they highly rely on SMU internal statistics for their debugging and further interaction may ruin them.
> 
> The way commit fcb1fe9c9e0031 modified the code, it blocks access to the
> SMU for various other users of the SMU, not least of which is the I2C.
[Quan, Evan] I'm wondering can we just list the I2C case as an exception. I means for the SMU command related with I2C we always assume the response is 1.
For other commands, we just leave them as they are.

BR
Evan
> 
> Regards,
> Luben
> 
> >
> > BR
> > Evan
> >> when subsequent SMU commands, not necessarily related to I2C, were
> >> sent to the SMU.
> >>
> >> This patch fixes this bug.
> >>
> >> Cc: Alex Deucher <Alexander.Deucher@amd.com>
> >> Cc: Evan Quan <evan.quan@amd.com>
> >> Fixes: fcb1fe9c9e0031 ("drm/amd/powerplay: pre-check the SMU state
> >> before issuing message")
> >> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> >> ---
> >>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 196
> >> +++++++++++++++++++------
> >>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h |   3 +-
> >>  2 files changed, 152 insertions(+), 47 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> >> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> >> index c902fdf322c1be..775eb50a2e49a6 100644
> >> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> >> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> >> @@ -55,7 +55,7 @@
> >>
> >>  #undef __SMU_DUMMY_MAP
> >>  #define __SMU_DUMMY_MAP(type)	#type
> >> -static const char* __smu_message_names[] = {
> >> +static const char * const __smu_message_names[] = {
> >>  	SMU_MESSAGE_TYPES
> >>  };
> >>
> >> @@ -76,46 +76,161 @@ static void smu_cmn_read_arg(struct
> smu_context
> >> *smu,
> >>  	*arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);  }
> >>
> >> -int smu_cmn_wait_for_response(struct smu_context *smu)
> >> +/**
> >> + * __smu_cmn_poll_stat -- poll for a status from the SMU
> >> + * smu: a pointer to SMU context
> >> + *
> >> + * Returns the status of the SMU, which could be,
> >> + * 0, the SMU is busy with your previous command;
> >> + * 1,    execution status: success, execution result: success;
> >> + * 0xFF, execution status: success, execution result: failure;
> >> + * 0xFE, unknown command;
> >> + * 0xFD, valid command, but bad (command) prerequisites;
> >> + * 0xFC, the command was rejected as the SMU is busy;
> >> + * 0xFB, "SMC_Result_DebugDataDumpEnd".
> >> + */
> >> +static u32 __smu_cmn_poll_stat(struct smu_context *smu)
> >>  {
> >>  	struct amdgpu_device *adev = smu->adev;
> >> -	uint32_t cur_value, i, timeout = adev->usec_timeout * 20;
> >> +	int timeout = adev->usec_timeout * 20;
> >> +	u32 reg;
> >>
> >> -	for (i = 0; i < timeout; i++) {
> >> -		cur_value = RREG32_SOC15(MP1, 0,
> >> mmMP1_SMN_C2PMSG_90);
> >> -		if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
> >> -			return cur_value;
> >> +	for ( ; timeout > 0; timeout--) {
> >> +		reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> >> +		if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
> >> +			break;
> >>
> >>  		udelay(1);
> >>  	}
> >>
> >> -	/* timeout means wrong logic */
> >> -	if (i == timeout)
> >> -		return -ETIME;
> >> -
> >> -	return RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> >> +	return reg;
> >>  }
> >>
> >> -int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> >> -				     uint16_t msg, uint32_t param)
> >> +static void __smu_cmn_reg_print_error(struct smu_context *smu,
> >> +				      u32 reg_c2pmsg_90,
> >> +				      int msg_index,
> >> +				      u32 param,
> >> +				      enum smu_message_type msg)
> >>  {
> >>  	struct amdgpu_device *adev = smu->adev;
> >> -	int ret;
> >> +	const char *message = smu_get_message_name(smu, msg);
> >>
> >> -	ret = smu_cmn_wait_for_response(smu);
> >> -	if (ret != 0x1) {
> >> -		dev_err(adev->dev, "Msg issuing pre-check failed(0x%x) and
> >> "
> >> -		       "SMU may be not in the right state!\n", ret);
> >> -		if (ret != -ETIME)
> >> -			ret = -EIO;
> >> -		return ret;
> >> +	switch (reg_c2pmsg_90) {
> >> +	case 0:
> >> +		dev_err_ratelimited(adev->dev,
> >> +				    "SMU: I'm not done with your previous
> >> command!");
> >> +		break;
> >> +	case 1:
> >> +		/* The SMU executed the command. It completed with a
> >> +		 * successful result.
> >> +		 */
> >> +		break;
> >> +	case 0xFF:
> >> +		/* The SMU executed the command. It completed with a
> >> +		 * unsuccessful result.
> >> +		 */
> >> +		break;
> >> +	case 0xFE:
> >> +		dev_err_ratelimited(adev->dev,
> >> +				    "SMU: unknown command: index:%d
> >> param:0x%08X message:%s",
> >> +				    msg_index, param, message);
> >> +		break;
> >> +	case 0xFD:
> >> +		dev_err_ratelimited(adev->dev,
> >> +				    "SMU: valid command, bad prerequisites:
> >> index:%d param:0x%08X message:%s",
> >> +				    msg_index, param, message);
> >> +		break;
> >> +	case 0xFC:
> >> +		dev_err_ratelimited(adev->dev,
> >> +				    "SMU: I'm very busy for your command:
> >> index:%d param:0x%08X message:%s",
> >> +				    msg_index, param, message);
> >> +		break;
> >> +	case 0xFB:
> >> +		dev_err_ratelimited(adev->dev,
> >> +				    "SMU: I'm debugging!");
> >> +		break;
> >> +	default:
> >> +		dev_err_ratelimited(adev->dev,
> >> +				    "SMU: response:0x%08X for index:%d
> >> param:0x%08X message:%s?",
> >> +				    reg_c2pmsg_90, msg_index, param,
> >> message);
> >> +		break;
> >> +	}
> >> +}
> >> +
> >> +static int __smu_cmn_reg2errno(struct smu_context *smu, u32
> >> reg_c2pmsg_90)
> >> +{
> >> +	int res;
> >> +
> >> +	switch (reg_c2pmsg_90) {
> >> +	case 0:
> >> +		res = -ETIME;
> >> +		break;
> >> +	case 1:
> >> +		res = 0;
> >> +		break;
> >> +	case 0xFF:
> >> +		res = -EIO;
> >> +		break;
> >> +	case 0xFE:
> >> +		res = -EOPNOTSUPP;
> >> +		break;
> >> +	case 0xFD:
> >> +		res = -EIO;
> >> +		break;
> >> +	case 0xFC:
> >> +		res = -EBUSY;
> >> +		break;
> >> +	case 0xFB:
> >> +		res = -EIO;
> >> +		break;
> >> +	default:
> >> +		res = -EIO;
> >> +		break;
> >>  	}
> >>
> >> +	return res;
> >> +}
> >> +
> >> +static void __smu_cmn_send_msg(struct smu_context *smu,
> >> +			       u16 msg,
> >> +			       u32 param)
> >> +{
> >> +	struct amdgpu_device *adev = smu->adev;
> >> +
> >>  	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90, 0);
> >>  	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82, param);
> >>  	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
> >> +}
> >>
> >> -	return 0;
> >> +int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> >> +				     uint16_t msg_index,
> >> +				     uint32_t param)
> >> +{
> >> +	u32 reg;
> >> +	int res;
> >> +
> >> +	if (smu->adev->in_pci_err_recovery)
> >> +		return 0;
> >> +
> >> +	mutex_lock(&smu->message_lock);
> >> +	reg = __smu_cmn_poll_stat(smu);
> >> +	if (reg == 0 || reg == 0xFC) {
> >> +		res = __smu_cmn_reg2errno(smu, reg);
> >> +		goto Out;
> >> +	}
> >> +	__smu_cmn_send_msg(smu, msg_index, param);
> >> +	res = 0;
> >> +Out:
> >> +	mutex_unlock(&smu->message_lock);
> >> +	return res;
> >> +}
> >> +
> >> +int smu_cmn_wait_for_response(struct smu_context *smu) {
> >> +	u32 reg;
> >> +
> >> +	reg = __smu_cmn_poll_stat(smu);
> >> +	return __smu_cmn_reg2errno(smu, reg);
> >>  }
> >>
> >>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> @@
> >> -123,8 +238,8 @@ int smu_cmn_send_smc_msg_with_param(struct
> >> smu_context *smu,
> >>  				    uint32_t param,
> >>  				    uint32_t *read_arg)
> >>  {
> >> -	struct amdgpu_device *adev = smu->adev;
> >> -	int ret = 0, index = 0;
> >> +	int res, index;
> >> +	u32 reg;
> >>
> >>  	if (smu->adev->in_pci_err_recovery)
> >>  		return 0;
> >> @@ -136,31 +251,20 @@ int
> smu_cmn_send_smc_msg_with_param(struct
> >> smu_context *smu,
> >>  		return index == -EACCES ? 0 : index;
> >>
> >>  	mutex_lock(&smu->message_lock);
> >> -	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
> >> param);
> >> -	if (ret)
> >> -		goto out;
> >> -
> >> -	ret = smu_cmn_wait_for_response(smu);
> >> -	if (ret != 0x1) {
> >> -		if (ret == -ETIME) {
> >> -			dev_err(adev->dev, "message: %15s (%d) \tparam:
> >> 0x%08x is timeout (no response)\n",
> >> -				smu_get_message_name(smu, msg), index,
> >> param);
> >> -		} else {
> >> -			dev_err(adev->dev, "failed send message: %15s (%d)
> >> \tparam: 0x%08x response %#x\n",
> >> -				smu_get_message_name(smu, msg), index,
> >> param,
> >> -				ret);
> >> -			ret = -EIO;
> >> -		}
> >> -		goto out;
> >> +	reg = __smu_cmn_poll_stat(smu);
> >> +	if (reg == 0 || reg == 0xFC) {
> >> +		res = __smu_cmn_reg2errno(smu, reg);
> >> +		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
> >> +		goto Out;
> >>  	}
> >> -
> >> +	__smu_cmn_send_msg(smu, (uint16_t) index, param);
> >> +	reg = __smu_cmn_poll_stat(smu);
> >> +	res = __smu_cmn_reg2errno(smu, reg);
> >>  	if (read_arg)
> >>  		smu_cmn_read_arg(smu, read_arg);
> >> -
> >> -	ret = 0; /* 0 as driver return value */
> >> -out:
> >> +Out:
> >>  	mutex_unlock(&smu->message_lock);
> >> -	return ret;
> >> +	return res;
> >>  }
> >>
> >>  int smu_cmn_send_smc_msg(struct smu_context *smu, diff --git
> >> a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> >> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> >> index 9add5f16ff562a..16993daa2ae042 100644
> >> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> >> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> >> @@ -27,7 +27,8 @@
> >>
> >>  #if defined(SWSMU_CODE_LAYER_L2) ||
> defined(SWSMU_CODE_LAYER_L3)
> >> || defined(SWSMU_CODE_LAYER_L4)
> >>  int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> >> -				     uint16_t msg, uint32_t param);
> >> +				     uint16_t msg_index,
> >> +				     uint32_t param);
> >>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> >>  				    enum smu_message_type msg,
> >>  				    uint32_t param,
> >> --
> >> 2.32.0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
  2021-07-14  2:01     ` Quan, Evan
@ 2021-07-14  4:04       ` Luben Tuikov
  2021-07-14 15:22       ` Alex Deucher
  1 sibling, 0 replies; 11+ messages in thread
From: Luben Tuikov @ 2021-07-14  4:04 UTC (permalink / raw)
  To: Quan, Evan, amd-gfx; +Cc: Deucher, Alexander, Baluja, Aaron, Alimucaj, Andi

On 2021-07-13 10:01 p.m., Quan, Evan wrote:
> [AMD Official Use Only]
>
>
>
>> -----Original Message-----
>> From: Tuikov, Luben <Luben.Tuikov@amd.com>
>> Sent: Wednesday, July 14, 2021 1:36 AM
>> To: Quan, Evan <Evan.Quan@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Alimucaj, Andi
>> <Anduil.Alimucaj@amd.com>; Baluja, Aaron <Aaron.Baluja@amd.com>
>> Subject: Re: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
>>
>> On 2021-07-13 3:07 a.m., Quan, Evan wrote:
>>> [AMD Official Use Only]
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Tuikov, Luben <Luben.Tuikov@amd.com>
>>>> Sent: Monday, July 12, 2021 11:31 PM
>>>> To: amd-gfx@lists.freedesktop.org
>>>> Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Deucher, Alexander
>>>> <Alexander.Deucher@amd.com>; Quan, Evan <Evan.Quan@amd.com>
>>>> Subject: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
>>>>
>>>> This fixes a bug which if we probe a non-existing I2C device, and the
>>>> SMU returns 0xFF,
>>> [Quan, Evan] Do we have to probe I2C device via issuing commands to SMU
>> and check existence via SMU response?
>>
>> Yes, yes we do.
>>
>>> Is there other more friendly way?
>> No, there isn't.
>>
>>> >from then on
>>>> we can never communicate with the SMU, because the code before this
>>>> patch reads and interprets 0xFF as a terminal error, and thus we
>>>> never write 0 into register 90 to clear the status (and subsequently
>>>> send a new command to the SMU.)
>>>>
>>>> It is not an error that the SMU returns status 0xFF. This means that
>>>> the SMU executed the last command successfully (execution status),
>>>> but the command result is an error of some sort (execution result),
>>>> depending on what the command was.
>>>>
>>>> When doing a status check of the SMU, before we send a new command,
>>>> the only status which precludes us from sending a new command is
>>>> 0--the SMU hasn't finished executing a previous command, and
>>>> 0xFC--the SMU is busy.
>>>>
>>>> This bug was seen as the following line in the kernel log,
>>>>
>>>> amdgpu: Msg issuing pre-check failed(0xff) and SMU may be not in the
>>>> right state!
>>> [Quan, Evan] This was designed to prevent failure scenario from ruin.
>>> Via this, we can prevent those SMU command/response related registers
>> overwritten.
>>> Then PMFW team can known which command invoked the first error.
>> Sorry, this is not correct.
>>
>> The interface cannot block valid access to the SMU firmware, just because a
>> command executed successfully, but with a failed result, i.e. set a clock
>> frequency to such-and-such frequency was executed successfully by the
>> SMU, but the frequency wasn't able to be set as required--the SMU IS NOT
>> BLOCKED, but can execute more commands.
> [Quan, Evan] 
> 1. First of all, if the response from SMU is not 1, it means SMU must detected something wrong.

Not necessarily. The codes I've included in the patch. "something wrong" isn't well defined.

> We(driver) should properly handle that. I do not think it's a good idea to silently ignore that and proceed more commands.

And we do. We do handle properly. Perhaps the best way to handle it is to *send another command* to the SMU as a follow-up, and so on.

> 2. Please be noticed that many commands(SMU messages) have dependence with each other. It means even if the further command
> can be executed "successfully", it may be not executed in the expected way.

Sure, sure. And the client should know this dependency, and should check the result, and should know whether to continue to send more commands or do something else. Either way, you cannot block access to the SMU, when other clients can use it. We've seen plenty of logs where gfxoff was unsuccessful because of this bug.

Here's the gist of it: smu_cmn.c CANNOT BLOCK access to the SMU just because one client doesn't check results and doesn't know how to handle them correctly. smu_cmn.c just facilitates access to the SMU when possible (SMU is not busy).

If there's an error, the client takes that error and decides what to do next, but smu_cmn.c CANNOT block access to the SMU.

>> If the PMFW team wants to know which command invoked "the first error",
>> they can check this explicitly, they can call smu_cmn_wait_for_response(),
>> just like the reset code does--see the reset code.
> [Quan, Evan] Per my knowledge gained during co-work with PMFW team, they expect no further driver-smu interaction(driver stops issuing command)
>  when something went wrong. As they highly rely on SMU internal statistics for their debugging and further interaction may ruin them.

Then that is a different change, and a different patch.

Note however that STILL you cannot block access to the SMU, for instance gfxoff, or RAS may want to write some bad pages to the EEPROM. Nope! You cannot block access to the SMU.

>> The way commit fcb1fe9c9e0031 modified the code, it blocks access to the
>> SMU for various other users of the SMU, not least of which is the I2C.
> [Quan, Evan] I'm wondering can we just list the I2C case as an exception. I means for the SMU command related with I2C we always assume the response is 1.
> For other commands, we just leave them as they are.

Well, it seems you're writing a debug behaviour into the driver out in the field.

We cannot assume that for I2C response is always 1--what if there is no such I2C device on that bus? Upper layers need to know that.

Regards,
Luben



>
> BR
> Evan
>> Regards,
>> Luben
>>
>>> BR
>>> Evan
>>>> when subsequent SMU commands, not necessarily related to I2C, were
>>>> sent to the SMU.
>>>>
>>>> This patch fixes this bug.
>>>>
>>>> Cc: Alex Deucher <Alexander.Deucher@amd.com>
>>>> Cc: Evan Quan <evan.quan@amd.com>
>>>> Fixes: fcb1fe9c9e0031 ("drm/amd/powerplay: pre-check the SMU state
>>>> before issuing message")
>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>> ---
>>>>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 196
>>>> +++++++++++++++++++------
>>>>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h |   3 +-
>>>>  2 files changed, 152 insertions(+), 47 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>> index c902fdf322c1be..775eb50a2e49a6 100644
>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>> @@ -55,7 +55,7 @@
>>>>
>>>>  #undef __SMU_DUMMY_MAP
>>>>  #define __SMU_DUMMY_MAP(type)	#type
>>>> -static const char* __smu_message_names[] = {
>>>> +static const char * const __smu_message_names[] = {
>>>>  	SMU_MESSAGE_TYPES
>>>>  };
>>>>
>>>> @@ -76,46 +76,161 @@ static void smu_cmn_read_arg(struct
>> smu_context
>>>> *smu,
>>>>  	*arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);  }
>>>>
>>>> -int smu_cmn_wait_for_response(struct smu_context *smu)
>>>> +/**
>>>> + * __smu_cmn_poll_stat -- poll for a status from the SMU
>>>> + * smu: a pointer to SMU context
>>>> + *
>>>> + * Returns the status of the SMU, which could be,
>>>> + * 0, the SMU is busy with your previous command;
>>>> + * 1,    execution status: success, execution result: success;
>>>> + * 0xFF, execution status: success, execution result: failure;
>>>> + * 0xFE, unknown command;
>>>> + * 0xFD, valid command, but bad (command) prerequisites;
>>>> + * 0xFC, the command was rejected as the SMU is busy;
>>>> + * 0xFB, "SMC_Result_DebugDataDumpEnd".
>>>> + */
>>>> +static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>>>>  {
>>>>  	struct amdgpu_device *adev = smu->adev;
>>>> -	uint32_t cur_value, i, timeout = adev->usec_timeout * 20;
>>>> +	int timeout = adev->usec_timeout * 20;
>>>> +	u32 reg;
>>>>
>>>> -	for (i = 0; i < timeout; i++) {
>>>> -		cur_value = RREG32_SOC15(MP1, 0,
>>>> mmMP1_SMN_C2PMSG_90);
>>>> -		if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>>> -			return cur_value;
>>>> +	for ( ; timeout > 0; timeout--) {
>>>> +		reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>>> +		if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>>> +			break;
>>>>
>>>>  		udelay(1);
>>>>  	}
>>>>
>>>> -	/* timeout means wrong logic */
>>>> -	if (i == timeout)
>>>> -		return -ETIME;
>>>> -
>>>> -	return RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>>> +	return reg;
>>>>  }
>>>>
>>>> -int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>>>> -				     uint16_t msg, uint32_t param)
>>>> +static void __smu_cmn_reg_print_error(struct smu_context *smu,
>>>> +				      u32 reg_c2pmsg_90,
>>>> +				      int msg_index,
>>>> +				      u32 param,
>>>> +				      enum smu_message_type msg)
>>>>  {
>>>>  	struct amdgpu_device *adev = smu->adev;
>>>> -	int ret;
>>>> +	const char *message = smu_get_message_name(smu, msg);
>>>>
>>>> -	ret = smu_cmn_wait_for_response(smu);
>>>> -	if (ret != 0x1) {
>>>> -		dev_err(adev->dev, "Msg issuing pre-check failed(0x%x) and
>>>> "
>>>> -		       "SMU may be not in the right state!\n", ret);
>>>> -		if (ret != -ETIME)
>>>> -			ret = -EIO;
>>>> -		return ret;
>>>> +	switch (reg_c2pmsg_90) {
>>>> +	case 0:
>>>> +		dev_err_ratelimited(adev->dev,
>>>> +				    "SMU: I'm not done with your previous
>>>> command!");
>>>> +		break;
>>>> +	case 1:
>>>> +		/* The SMU executed the command. It completed with a
>>>> +		 * successful result.
>>>> +		 */
>>>> +		break;
>>>> +	case 0xFF:
>>>> +		/* The SMU executed the command. It completed with a
>>>> +		 * unsuccessful result.
>>>> +		 */
>>>> +		break;
>>>> +	case 0xFE:
>>>> +		dev_err_ratelimited(adev->dev,
>>>> +				    "SMU: unknown command: index:%d
>>>> param:0x%08X message:%s",
>>>> +				    msg_index, param, message);
>>>> +		break;
>>>> +	case 0xFD:
>>>> +		dev_err_ratelimited(adev->dev,
>>>> +				    "SMU: valid command, bad prerequisites:
>>>> index:%d param:0x%08X message:%s",
>>>> +				    msg_index, param, message);
>>>> +		break;
>>>> +	case 0xFC:
>>>> +		dev_err_ratelimited(adev->dev,
>>>> +				    "SMU: I'm very busy for your command:
>>>> index:%d param:0x%08X message:%s",
>>>> +				    msg_index, param, message);
>>>> +		break;
>>>> +	case 0xFB:
>>>> +		dev_err_ratelimited(adev->dev,
>>>> +				    "SMU: I'm debugging!");
>>>> +		break;
>>>> +	default:
>>>> +		dev_err_ratelimited(adev->dev,
>>>> +				    "SMU: response:0x%08X for index:%d
>>>> param:0x%08X message:%s?",
>>>> +				    reg_c2pmsg_90, msg_index, param,
>>>> message);
>>>> +		break;
>>>> +	}
>>>> +}
>>>> +
>>>> +static int __smu_cmn_reg2errno(struct smu_context *smu, u32
>>>> reg_c2pmsg_90)
>>>> +{
>>>> +	int res;
>>>> +
>>>> +	switch (reg_c2pmsg_90) {
>>>> +	case 0:
>>>> +		res = -ETIME;
>>>> +		break;
>>>> +	case 1:
>>>> +		res = 0;
>>>> +		break;
>>>> +	case 0xFF:
>>>> +		res = -EIO;
>>>> +		break;
>>>> +	case 0xFE:
>>>> +		res = -EOPNOTSUPP;
>>>> +		break;
>>>> +	case 0xFD:
>>>> +		res = -EIO;
>>>> +		break;
>>>> +	case 0xFC:
>>>> +		res = -EBUSY;
>>>> +		break;
>>>> +	case 0xFB:
>>>> +		res = -EIO;
>>>> +		break;
>>>> +	default:
>>>> +		res = -EIO;
>>>> +		break;
>>>>  	}
>>>>
>>>> +	return res;
>>>> +}
>>>> +
>>>> +static void __smu_cmn_send_msg(struct smu_context *smu,
>>>> +			       u16 msg,
>>>> +			       u32 param)
>>>> +{
>>>> +	struct amdgpu_device *adev = smu->adev;
>>>> +
>>>>  	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90, 0);
>>>>  	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82, param);
>>>>  	WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
>>>> +}
>>>>
>>>> -	return 0;
>>>> +int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>>>> +				     uint16_t msg_index,
>>>> +				     uint32_t param)
>>>> +{
>>>> +	u32 reg;
>>>> +	int res;
>>>> +
>>>> +	if (smu->adev->in_pci_err_recovery)
>>>> +		return 0;
>>>> +
>>>> +	mutex_lock(&smu->message_lock);
>>>> +	reg = __smu_cmn_poll_stat(smu);
>>>> +	if (reg == 0 || reg == 0xFC) {
>>>> +		res = __smu_cmn_reg2errno(smu, reg);
>>>> +		goto Out;
>>>> +	}
>>>> +	__smu_cmn_send_msg(smu, msg_index, param);
>>>> +	res = 0;
>>>> +Out:
>>>> +	mutex_unlock(&smu->message_lock);
>>>> +	return res;
>>>> +}
>>>> +
>>>> +int smu_cmn_wait_for_response(struct smu_context *smu) {
>>>> +	u32 reg;
>>>> +
>>>> +	reg = __smu_cmn_poll_stat(smu);
>>>> +	return __smu_cmn_reg2errno(smu, reg);
>>>>  }
>>>>
>>>>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>> @@
>>>> -123,8 +238,8 @@ int smu_cmn_send_smc_msg_with_param(struct
>>>> smu_context *smu,
>>>>  				    uint32_t param,
>>>>  				    uint32_t *read_arg)
>>>>  {
>>>> -	struct amdgpu_device *adev = smu->adev;
>>>> -	int ret = 0, index = 0;
>>>> +	int res, index;
>>>> +	u32 reg;
>>>>
>>>>  	if (smu->adev->in_pci_err_recovery)
>>>>  		return 0;
>>>> @@ -136,31 +251,20 @@ int
>> smu_cmn_send_smc_msg_with_param(struct
>>>> smu_context *smu,
>>>>  		return index == -EACCES ? 0 : index;
>>>>
>>>>  	mutex_lock(&smu->message_lock);
>>>> -	ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
>>>> param);
>>>> -	if (ret)
>>>> -		goto out;
>>>> -
>>>> -	ret = smu_cmn_wait_for_response(smu);
>>>> -	if (ret != 0x1) {
>>>> -		if (ret == -ETIME) {
>>>> -			dev_err(adev->dev, "message: %15s (%d) \tparam:
>>>> 0x%08x is timeout (no response)\n",
>>>> -				smu_get_message_name(smu, msg), index,
>>>> param);
>>>> -		} else {
>>>> -			dev_err(adev->dev, "failed send message: %15s (%d)
>>>> \tparam: 0x%08x response %#x\n",
>>>> -				smu_get_message_name(smu, msg), index,
>>>> param,
>>>> -				ret);
>>>> -			ret = -EIO;
>>>> -		}
>>>> -		goto out;
>>>> +	reg = __smu_cmn_poll_stat(smu);
>>>> +	if (reg == 0 || reg == 0xFC) {
>>>> +		res = __smu_cmn_reg2errno(smu, reg);
>>>> +		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>>>> +		goto Out;
>>>>  	}
>>>> -
>>>> +	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>>>> +	reg = __smu_cmn_poll_stat(smu);
>>>> +	res = __smu_cmn_reg2errno(smu, reg);
>>>>  	if (read_arg)
>>>>  		smu_cmn_read_arg(smu, read_arg);
>>>> -
>>>> -	ret = 0; /* 0 as driver return value */
>>>> -out:
>>>> +Out:
>>>>  	mutex_unlock(&smu->message_lock);
>>>> -	return ret;
>>>> +	return res;
>>>>  }
>>>>
>>>>  int smu_cmn_send_smc_msg(struct smu_context *smu, diff --git
>>>> a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>> index 9add5f16ff562a..16993daa2ae042 100644
>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>> @@ -27,7 +27,8 @@
>>>>
>>>>  #if defined(SWSMU_CODE_LAYER_L2) ||
>> defined(SWSMU_CODE_LAYER_L3)
>>>> || defined(SWSMU_CODE_LAYER_L4)
>>>>  int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>>>> -				     uint16_t msg, uint32_t param);
>>>> +				     uint16_t msg_index,
>>>> +				     uint32_t param);
>>>>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>>>>  				    enum smu_message_type msg,
>>>>  				    uint32_t param,
>>>> --
>>>> 2.32.0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
  2021-07-13  2:56 ` Lazar, Lijo
  2021-07-13 17:18   ` Luben Tuikov
@ 2021-07-14 15:19   ` Alex Deucher
  2021-07-14 16:57     ` Luben Tuikov
  1 sibling, 1 reply; 11+ messages in thread
From: Alex Deucher @ 2021-07-14 15:19 UTC (permalink / raw)
  To: Lazar, Lijo; +Cc: Alex Deucher, Luben Tuikov, Evan Quan, amd-gfx list

On Mon, Jul 12, 2021 at 10:56 PM Lazar, Lijo <lijo.lazar@amd.com> wrote:
>
>
>
> On 7/12/2021 9:00 PM, Luben Tuikov wrote:
> > This fixes a bug which if we probe a non-existing
> > I2C device, and the SMU returns 0xFF, from then on
> > we can never communicate with the SMU, because the
> > code before this patch reads and interprets 0xFF
> > as a terminal error, and thus we never write 0
> > into register 90 to clear the status (and
> > subsequently send a new command to the SMU.)
> >
> > It is not an error that the SMU returns status
> > 0xFF. This means that the SMU executed the last
> > command successfully (execution status), but the
> > command result is an error of some sort (execution
> > result), depending on what the command was.
> >
> > When doing a status check of the SMU, before we
> > send a new command, the only status which
> > precludes us from sending a new command is 0--the
> > SMU hasn't finished executing a previous command,
> > and 0xFC--the SMU is busy.
> >
> > This bug was seen as the following line in the
> > kernel log,
> >
> > amdgpu: Msg issuing pre-check failed(0xff) and SMU may be not in the right state!
> >
> > when subsequent SMU commands, not necessarily
> > related to I2C, were sent to the SMU.
> >
> > This patch fixes this bug.
> >
> > Cc: Alex Deucher <Alexander.Deucher@amd.com>
> > Cc: Evan Quan <evan.quan@amd.com>
> > Fixes: fcb1fe9c9e0031 ("drm/amd/powerplay: pre-check the SMU state before issuing message")
> > Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> > ---
> >   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 196 +++++++++++++++++++------
> >   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h |   3 +-
> >   2 files changed, 152 insertions(+), 47 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> > index c902fdf322c1be..775eb50a2e49a6 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> > @@ -55,7 +55,7 @@
> >
> >   #undef __SMU_DUMMY_MAP
> >   #define __SMU_DUMMY_MAP(type)       #type
> > -static const char* __smu_message_names[] = {
> > +static const char * const __smu_message_names[] = {
> >       SMU_MESSAGE_TYPES
> >   };
> >
> > @@ -76,46 +76,161 @@ static void smu_cmn_read_arg(struct smu_context *smu,
> >       *arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
> >   }
> >
> > -int smu_cmn_wait_for_response(struct smu_context *smu)
> > +/**
> > + * __smu_cmn_poll_stat -- poll for a status from the SMU
> > + * smu: a pointer to SMU context
> > + *
> > + * Returns the status of the SMU, which could be,
> > + * 0, the SMU is busy with your previous command;
> > + * 1,    execution status: success, execution result: success;
> > + * 0xFF, execution status: success, execution result: failure;
> > + * 0xFE, unknown command;
> > + * 0xFD, valid command, but bad (command) prerequisites;
> > + * 0xFC, the command was rejected as the SMU is busy;
> > + * 0xFB, "SMC_Result_DebugDataDumpEnd".
> > + */
>
> These are the response codes defined in header (0xFB is somehow missing)
> // SMU Response Codes:
> #define PPSMC_Result_OK                    0x1
> #define PPSMC_Result_Failed                0xFF
> #define PPSMC_Result_UnknownCmd            0xFE
> #define PPSMC_Result_CmdRejectedPrereq     0xFD
> #define PPSMC_Result_CmdRejectedBusy       0xFC
>
> It's better to use #defines for these, usually we follow a convention
> like SMU_

We could do a MAP_RESULT() macro like we do with the messages, etc. to
make them per asic, but that may be overkill as I think these result
codes have been the same across asics for a long time.

Alex

>
> Ex:
>         #define SMU_RESP_RESULT_OK 0x1
>
>
> > +static u32 __smu_cmn_poll_stat(struct smu_context *smu)
> >   {
> >       struct amdgpu_device *adev = smu->adev;
> > -     uint32_t cur_value, i, timeout = adev->usec_timeout * 20;
> > +     int timeout = adev->usec_timeout * 20;
> > +     u32 reg;
> >
> > -     for (i = 0; i < timeout; i++) {
> > -             cur_value = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> > -             if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
> > -                     return cur_value;
> > +     for ( ; timeout > 0; timeout--) {
> > +             reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> > +             if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
> > +                     break;
> >
> >               udelay(1);
> >       }
> >
> > -     /* timeout means wrong logic */
> > -     if (i == timeout)
> > -             return -ETIME;
> > -
> > -     return RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> > +     return reg;
> >   }
> >
> > -int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> > -                                  uint16_t msg, uint32_t param)
> > +static void __smu_cmn_reg_print_error(struct smu_context *smu,
> > +                                   u32 reg_c2pmsg_90,
>
> Instead of using reg/regname in function, it would be better to name it
> as smu_cmn_resp/smu_resp or similar to make it clear that we are
> decoding smu response.
>
> > +                                   int msg_index,
> > +                                   u32 param,
> > +                                   enum smu_message_type msg)
> >   {
> >       struct amdgpu_device *adev = smu->adev;
> > -     int ret;
> > +     const char *message = smu_get_message_name(smu, msg);
> >
> > -     ret = smu_cmn_wait_for_response(smu);
> > -     if (ret != 0x1) {
> > -             dev_err(adev->dev, "Msg issuing pre-check failed(0x%x) and "
> > -                    "SMU may be not in the right state!\n", ret);
> > -             if (ret != -ETIME)
> > -                     ret = -EIO;
> > -             return ret;
> > +     switch (reg_c2pmsg_90) {
> > +     case 0:
> > +             dev_err_ratelimited(adev->dev,
> > +                                 "SMU: I'm not done with your previous command!");
> > +             break;
> > +     case 1:
> > +             /* The SMU executed the command. It completed with a
> > +              * successful result.
> > +              */
> > +             break;
> > +     case 0xFF:
> > +             /* The SMU executed the command. It completed with a
> > +              * unsuccessful result.
> > +              */
> > +             break;
> > +     case 0xFE:
> > +             dev_err_ratelimited(adev->dev,
> > +                                 "SMU: unknown command: index:%d param:0x%08X message:%s",
> > +                                 msg_index, param, message);
> > +             break;
> > +     case 0xFD:
> > +             dev_err_ratelimited(adev->dev,
> > +                                 "SMU: valid command, bad prerequisites: index:%d param:0x%08X message:%s",
> > +                                 msg_index, param, message);
> > +             break;
> > +     case 0xFC:
> > +             dev_err_ratelimited(adev->dev,
> > +                                 "SMU: I'm very busy for your command: index:%d param:0x%08X message:%s",
> > +                                 msg_index, param, message);
> > +             break;
> > +     case 0xFB:
> > +             dev_err_ratelimited(adev->dev,
> > +                                 "SMU: I'm debugging!");
> > +             break;
> > +     default:
> > +             dev_err_ratelimited(adev->dev,
> > +                                 "SMU: response:0x%08X for index:%d param:0x%08X message:%s?",
> > +                                 reg_c2pmsg_90, msg_index, param, message);
> > +             break;
> > +     }
> > +}
> > +
> > +static int __smu_cmn_reg2errno(struct smu_context *smu, u32 reg_c2pmsg_90)
>
> Same comment on naming - resp2errno?
> > +{
> > +     int res;
> > +
> > +     switch (reg_c2pmsg_90) {
> > +     case 0:
> > +             res = -ETIME;
> > +             break;
> > +     case 1:
> > +             res = 0;
> > +             break;
> > +     case 0xFF:
> > +             res = -EIO;
> > +             break;
> > +     case 0xFE:
> > +             res = -EOPNOTSUPP;
> > +             break;
> > +     case 0xFD:
> > +             res = -EIO;
> > +             break;
> > +     case 0xFC:
> > +             res = -EBUSY;
> > +             break;
> > +     case 0xFB:
> > +             res = -EIO;
> > +             break;
> > +     default:
> > +             res = -EIO;
> > +             break;
> >       }
> >
> > +     return res;
> > +}
> > +
> > +static void __smu_cmn_send_msg(struct smu_context *smu,
> > +                            u16 msg,
> > +                            u32 param)
> > +{
> > +     struct amdgpu_device *adev = smu->adev;
> > +
> >       WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90, 0);
> >       WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82, param);
> >       WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
> > +}
> >
> > -     return 0;
> > +int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> > +                                  uint16_t msg_index,
> > +                                  uint32_t param)
> > +{
> > +     u32 reg;
> > +     int res;
> > +
> > +     if (smu->adev->in_pci_err_recovery)
> > +             return 0;
> > +
> > +     mutex_lock(&smu->message_lock);
> > +     reg = __smu_cmn_poll_stat(smu);
> > +     if (reg == 0 || reg == 0xFC) {
>
> The problem with 0xFC check is it could be the response of a previous
> message. It could mean that FW was busy when the prev message was sent,
> not now.
>
> There is a default case (value not in any of the predefined error
> codes), that should be considered here also. That happens sometimes and
> usually that means FW is in undefined state.
>
>
> > +             res = __smu_cmn_reg2errno(smu, reg);
> > +             goto Out;
>
> Label naming style, lower case?.
>
> > +     }
> > +     __smu_cmn_send_msg(smu, msg_index, param);
> > +     res = 0;
> > +Out:
> > +     mutex_unlock(&smu->message_lock);
> > +     return res;
> > +}
> > +
> > +int smu_cmn_wait_for_response(struct smu_context *smu)
> > +{
> > +     u32 reg;
> > +
> > +     reg = __smu_cmn_poll_stat(smu);
> > +     return __smu_cmn_reg2errno(smu, reg);
> >   }
> >
> >   int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> > @@ -123,8 +238,8 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> >                                   uint32_t param,
> >                                   uint32_t *read_arg)
> >   {
> > -     struct amdgpu_device *adev = smu->adev;
> > -     int ret = 0, index = 0;
> > +     int res, index;
> > +     u32 reg;
> >
> >       if (smu->adev->in_pci_err_recovery)
> >               return 0;
> > @@ -136,31 +251,20 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> >               return index == -EACCES ? 0 : index;
> >
> >       mutex_lock(&smu->message_lock);
> > -     ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, param);
> > -     if (ret)
> > -             goto out;
> > -
> > -     ret = smu_cmn_wait_for_response(smu);
> > -     if (ret != 0x1) {
> > -             if (ret == -ETIME) {
> > -                     dev_err(adev->dev, "message: %15s (%d) \tparam: 0x%08x is timeout (no response)\n",
> > -                             smu_get_message_name(smu, msg), index, param);
> > -             } else {
> > -                     dev_err(adev->dev, "failed send message: %15s (%d) \tparam: 0x%08x response %#x\n",
> > -                             smu_get_message_name(smu, msg), index, param,
> > -                             ret);
> > -                     ret = -EIO;
> > -             }
> > -             goto out;
> > +     reg = __smu_cmn_poll_stat(smu);
> > +     if (reg == 0 || reg == 0xFC) {
>
> Same comments as for without_waiting case.
>
> > +             res = __smu_cmn_reg2errno(smu, reg);
> > +             __smu_cmn_reg_print_error(smu, reg, index, param, msg);
>
> This precheck fail print is missing in without_waiting message.
>
> > +             goto Out; >     }
> > -
> > +     __smu_cmn_send_msg(smu, (uint16_t) index, param);
> > +     reg = __smu_cmn_poll_stat(smu);
> > +     res = __smu_cmn_reg2errno(smu, reg);
>
> Using smu_cmn_wait_for_response instead of these two makes the intent
> clearer - that we are waiting for the response.
>
> We need a print here as well if the message has failed.
>
> Thanks,
> Lijo
>
> >       if (read_arg)
> >               smu_cmn_read_arg(smu, read_arg);
> > -
> > -     ret = 0; /* 0 as driver return value */
> > -out:
> > +Out:
> >       mutex_unlock(&smu->message_lock);
> > -     return ret;
> > +     return res;
> >   }
> >
> >   int smu_cmn_send_smc_msg(struct smu_context *smu,
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> > index 9add5f16ff562a..16993daa2ae042 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> > @@ -27,7 +27,8 @@
> >
> >   #if defined(SWSMU_CODE_LAYER_L2) || defined(SWSMU_CODE_LAYER_L3) || defined(SWSMU_CODE_LAYER_L4)
> >   int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> > -                                  uint16_t msg, uint32_t param);
> > +                                  uint16_t msg_index,
> > +                                  uint32_t param);
> >   int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> >                                   enum smu_message_type msg,
> >                                   uint32_t param,
> >
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
  2021-07-14  2:01     ` Quan, Evan
  2021-07-14  4:04       ` Luben Tuikov
@ 2021-07-14 15:22       ` Alex Deucher
  2021-07-14 16:54         ` Luben Tuikov
  1 sibling, 1 reply; 11+ messages in thread
From: Alex Deucher @ 2021-07-14 15:22 UTC (permalink / raw)
  To: Quan, Evan
  Cc: Deucher, Alexander, Tuikov, Luben, Baluja, Aaron, amd-gfx,
	Alimucaj, Andi

On Tue, Jul 13, 2021 at 10:01 PM Quan, Evan <Evan.Quan@amd.com> wrote:
>
> [AMD Official Use Only]
>
>
>
> > -----Original Message-----
> > From: Tuikov, Luben <Luben.Tuikov@amd.com>
> > Sent: Wednesday, July 14, 2021 1:36 AM
> > To: Quan, Evan <Evan.Quan@amd.com>; amd-gfx@lists.freedesktop.org
> > Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Alimucaj, Andi
> > <Anduil.Alimucaj@amd.com>; Baluja, Aaron <Aaron.Baluja@amd.com>
> > Subject: Re: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
> >
> > On 2021-07-13 3:07 a.m., Quan, Evan wrote:
> > > [AMD Official Use Only]
> > >
> > >
> > >
> > >> -----Original Message-----
> > >> From: Tuikov, Luben <Luben.Tuikov@amd.com>
> > >> Sent: Monday, July 12, 2021 11:31 PM
> > >> To: amd-gfx@lists.freedesktop.org
> > >> Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Deucher, Alexander
> > >> <Alexander.Deucher@amd.com>; Quan, Evan <Evan.Quan@amd.com>
> > >> Subject: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
> > >>
> > >> This fixes a bug which if we probe a non-existing I2C device, and the
> > >> SMU returns 0xFF,
> > > [Quan, Evan] Do we have to probe I2C device via issuing commands to SMU
> > and check existence via SMU response?
> >
> > Yes, yes we do.
> >
> > > Is there other more friendly way?
> >
> > No, there isn't.
> >
> > > >from then on
> > >> we can never communicate with the SMU, because the code before this
> > >> patch reads and interprets 0xFF as a terminal error, and thus we
> > >> never write 0 into register 90 to clear the status (and subsequently
> > >> send a new command to the SMU.)
> > >>
> > >> It is not an error that the SMU returns status 0xFF. This means that
> > >> the SMU executed the last command successfully (execution status),
> > >> but the command result is an error of some sort (execution result),
> > >> depending on what the command was.
> > >>
> > >> When doing a status check of the SMU, before we send a new command,
> > >> the only status which precludes us from sending a new command is
> > >> 0--the SMU hasn't finished executing a previous command, and
> > >> 0xFC--the SMU is busy.
> > >>
> > >> This bug was seen as the following line in the kernel log,
> > >>
> > >> amdgpu: Msg issuing pre-check failed(0xff) and SMU may be not in the
> > >> right state!
> > > [Quan, Evan] This was designed to prevent failure scenario from ruin.
> > > Via this, we can prevent those SMU command/response related registers
> > overwritten.
> > > Then PMFW team can known which command invoked the first error.
> >
> > Sorry, this is not correct.
> >
> > The interface cannot block valid access to the SMU firmware, just because a
> > command executed successfully, but with a failed result, i.e. set a clock
> > frequency to such-and-such frequency was executed successfully by the
> > SMU, but the frequency wasn't able to be set as required--the SMU IS NOT
> > BLOCKED, but can execute more commands.
> [Quan, Evan]
> 1. First of all, if the response from SMU is not 1, it means SMU must detected something wrong.
> We(driver) should properly handle that. I do not think it's a good idea to silently ignore that and proceed more commands.
> 2. Please be noticed that many commands(SMU messages) have dependence with each other. It means even if the further command
> can be executed "successfully", it may be not executed in the expected way.
> >
> > If the PMFW team wants to know which command invoked "the first error",
> > they can check this explicitly, they can call smu_cmn_wait_for_response(),
> > just like the reset code does--see the reset code.
> [Quan, Evan] Per my knowledge gained during co-work with PMFW team, they expect no further driver-smu interaction(driver stops issuing command)
>  when something went wrong. As they highly rely on SMU internal statistics for their debugging and further interaction may ruin them.
> >
> > The way commit fcb1fe9c9e0031 modified the code, it blocks access to the
> > SMU for various other users of the SMU, not least of which is the I2C.
> [Quan, Evan] I'm wondering can we just list the I2C case as an exception. I means for the SMU command related with I2C we always assume the response is 1.
> For other commands, we just leave them as they are.

Maybe we need to just bubble this up to the callers and let each one
handle it as appropriate.  I don't think just throwing up our hands on
any error is the right thing to do.  The i2c case is a good example.
In other cases, we could probably just retry the transaction or just
ignore the error.

Alex


>
> BR
> Evan
> >
> > Regards,
> > Luben
> >
> > >
> > > BR
> > > Evan
> > >> when subsequent SMU commands, not necessarily related to I2C, were
> > >> sent to the SMU.
> > >>
> > >> This patch fixes this bug.
> > >>
> > >> Cc: Alex Deucher <Alexander.Deucher@amd.com>
> > >> Cc: Evan Quan <evan.quan@amd.com>
> > >> Fixes: fcb1fe9c9e0031 ("drm/amd/powerplay: pre-check the SMU state
> > >> before issuing message")
> > >> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> > >> ---
> > >>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 196
> > >> +++++++++++++++++++------
> > >>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h |   3 +-
> > >>  2 files changed, 152 insertions(+), 47 deletions(-)
> > >>
> > >> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> > >> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> > >> index c902fdf322c1be..775eb50a2e49a6 100644
> > >> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> > >> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> > >> @@ -55,7 +55,7 @@
> > >>
> > >>  #undef __SMU_DUMMY_MAP
> > >>  #define __SMU_DUMMY_MAP(type)     #type
> > >> -static const char* __smu_message_names[] = {
> > >> +static const char * const __smu_message_names[] = {
> > >>    SMU_MESSAGE_TYPES
> > >>  };
> > >>
> > >> @@ -76,46 +76,161 @@ static void smu_cmn_read_arg(struct
> > smu_context
> > >> *smu,
> > >>    *arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);  }
> > >>
> > >> -int smu_cmn_wait_for_response(struct smu_context *smu)
> > >> +/**
> > >> + * __smu_cmn_poll_stat -- poll for a status from the SMU
> > >> + * smu: a pointer to SMU context
> > >> + *
> > >> + * Returns the status of the SMU, which could be,
> > >> + * 0, the SMU is busy with your previous command;
> > >> + * 1,    execution status: success, execution result: success;
> > >> + * 0xFF, execution status: success, execution result: failure;
> > >> + * 0xFE, unknown command;
> > >> + * 0xFD, valid command, but bad (command) prerequisites;
> > >> + * 0xFC, the command was rejected as the SMU is busy;
> > >> + * 0xFB, "SMC_Result_DebugDataDumpEnd".
> > >> + */
> > >> +static u32 __smu_cmn_poll_stat(struct smu_context *smu)
> > >>  {
> > >>    struct amdgpu_device *adev = smu->adev;
> > >> -  uint32_t cur_value, i, timeout = adev->usec_timeout * 20;
> > >> +  int timeout = adev->usec_timeout * 20;
> > >> +  u32 reg;
> > >>
> > >> -  for (i = 0; i < timeout; i++) {
> > >> -          cur_value = RREG32_SOC15(MP1, 0,
> > >> mmMP1_SMN_C2PMSG_90);
> > >> -          if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
> > >> -                  return cur_value;
> > >> +  for ( ; timeout > 0; timeout--) {
> > >> +          reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> > >> +          if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
> > >> +                  break;
> > >>
> > >>            udelay(1);
> > >>    }
> > >>
> > >> -  /* timeout means wrong logic */
> > >> -  if (i == timeout)
> > >> -          return -ETIME;
> > >> -
> > >> -  return RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> > >> +  return reg;
> > >>  }
> > >>
> > >> -int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> > >> -                               uint16_t msg, uint32_t param)
> > >> +static void __smu_cmn_reg_print_error(struct smu_context *smu,
> > >> +                                u32 reg_c2pmsg_90,
> > >> +                                int msg_index,
> > >> +                                u32 param,
> > >> +                                enum smu_message_type msg)
> > >>  {
> > >>    struct amdgpu_device *adev = smu->adev;
> > >> -  int ret;
> > >> +  const char *message = smu_get_message_name(smu, msg);
> > >>
> > >> -  ret = smu_cmn_wait_for_response(smu);
> > >> -  if (ret != 0x1) {
> > >> -          dev_err(adev->dev, "Msg issuing pre-check failed(0x%x) and
> > >> "
> > >> -                 "SMU may be not in the right state!\n", ret);
> > >> -          if (ret != -ETIME)
> > >> -                  ret = -EIO;
> > >> -          return ret;
> > >> +  switch (reg_c2pmsg_90) {
> > >> +  case 0:
> > >> +          dev_err_ratelimited(adev->dev,
> > >> +                              "SMU: I'm not done with your previous
> > >> command!");
> > >> +          break;
> > >> +  case 1:
> > >> +          /* The SMU executed the command. It completed with a
> > >> +           * successful result.
> > >> +           */
> > >> +          break;
> > >> +  case 0xFF:
> > >> +          /* The SMU executed the command. It completed with a
> > >> +           * unsuccessful result.
> > >> +           */
> > >> +          break;
> > >> +  case 0xFE:
> > >> +          dev_err_ratelimited(adev->dev,
> > >> +                              "SMU: unknown command: index:%d
> > >> param:0x%08X message:%s",
> > >> +                              msg_index, param, message);
> > >> +          break;
> > >> +  case 0xFD:
> > >> +          dev_err_ratelimited(adev->dev,
> > >> +                              "SMU: valid command, bad prerequisites:
> > >> index:%d param:0x%08X message:%s",
> > >> +                              msg_index, param, message);
> > >> +          break;
> > >> +  case 0xFC:
> > >> +          dev_err_ratelimited(adev->dev,
> > >> +                              "SMU: I'm very busy for your command:
> > >> index:%d param:0x%08X message:%s",
> > >> +                              msg_index, param, message);
> > >> +          break;
> > >> +  case 0xFB:
> > >> +          dev_err_ratelimited(adev->dev,
> > >> +                              "SMU: I'm debugging!");
> > >> +          break;
> > >> +  default:
> > >> +          dev_err_ratelimited(adev->dev,
> > >> +                              "SMU: response:0x%08X for index:%d
> > >> param:0x%08X message:%s?",
> > >> +                              reg_c2pmsg_90, msg_index, param,
> > >> message);
> > >> +          break;
> > >> +  }
> > >> +}
> > >> +
> > >> +static int __smu_cmn_reg2errno(struct smu_context *smu, u32
> > >> reg_c2pmsg_90)
> > >> +{
> > >> +  int res;
> > >> +
> > >> +  switch (reg_c2pmsg_90) {
> > >> +  case 0:
> > >> +          res = -ETIME;
> > >> +          break;
> > >> +  case 1:
> > >> +          res = 0;
> > >> +          break;
> > >> +  case 0xFF:
> > >> +          res = -EIO;
> > >> +          break;
> > >> +  case 0xFE:
> > >> +          res = -EOPNOTSUPP;
> > >> +          break;
> > >> +  case 0xFD:
> > >> +          res = -EIO;
> > >> +          break;
> > >> +  case 0xFC:
> > >> +          res = -EBUSY;
> > >> +          break;
> > >> +  case 0xFB:
> > >> +          res = -EIO;
> > >> +          break;
> > >> +  default:
> > >> +          res = -EIO;
> > >> +          break;
> > >>    }
> > >>
> > >> +  return res;
> > >> +}
> > >> +
> > >> +static void __smu_cmn_send_msg(struct smu_context *smu,
> > >> +                         u16 msg,
> > >> +                         u32 param)
> > >> +{
> > >> +  struct amdgpu_device *adev = smu->adev;
> > >> +
> > >>    WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90, 0);
> > >>    WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82, param);
> > >>    WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
> > >> +}
> > >>
> > >> -  return 0;
> > >> +int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> > >> +                               uint16_t msg_index,
> > >> +                               uint32_t param)
> > >> +{
> > >> +  u32 reg;
> > >> +  int res;
> > >> +
> > >> +  if (smu->adev->in_pci_err_recovery)
> > >> +          return 0;
> > >> +
> > >> +  mutex_lock(&smu->message_lock);
> > >> +  reg = __smu_cmn_poll_stat(smu);
> > >> +  if (reg == 0 || reg == 0xFC) {
> > >> +          res = __smu_cmn_reg2errno(smu, reg);
> > >> +          goto Out;
> > >> +  }
> > >> +  __smu_cmn_send_msg(smu, msg_index, param);
> > >> +  res = 0;
> > >> +Out:
> > >> +  mutex_unlock(&smu->message_lock);
> > >> +  return res;
> > >> +}
> > >> +
> > >> +int smu_cmn_wait_for_response(struct smu_context *smu) {
> > >> +  u32 reg;
> > >> +
> > >> +  reg = __smu_cmn_poll_stat(smu);
> > >> +  return __smu_cmn_reg2errno(smu, reg);
> > >>  }
> > >>
> > >>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> > @@
> > >> -123,8 +238,8 @@ int smu_cmn_send_smc_msg_with_param(struct
> > >> smu_context *smu,
> > >>                                uint32_t param,
> > >>                                uint32_t *read_arg)
> > >>  {
> > >> -  struct amdgpu_device *adev = smu->adev;
> > >> -  int ret = 0, index = 0;
> > >> +  int res, index;
> > >> +  u32 reg;
> > >>
> > >>    if (smu->adev->in_pci_err_recovery)
> > >>            return 0;
> > >> @@ -136,31 +251,20 @@ int
> > smu_cmn_send_smc_msg_with_param(struct
> > >> smu_context *smu,
> > >>            return index == -EACCES ? 0 : index;
> > >>
> > >>    mutex_lock(&smu->message_lock);
> > >> -  ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
> > >> param);
> > >> -  if (ret)
> > >> -          goto out;
> > >> -
> > >> -  ret = smu_cmn_wait_for_response(smu);
> > >> -  if (ret != 0x1) {
> > >> -          if (ret == -ETIME) {
> > >> -                  dev_err(adev->dev, "message: %15s (%d) \tparam:
> > >> 0x%08x is timeout (no response)\n",
> > >> -                          smu_get_message_name(smu, msg), index,
> > >> param);
> > >> -          } else {
> > >> -                  dev_err(adev->dev, "failed send message: %15s (%d)
> > >> \tparam: 0x%08x response %#x\n",
> > >> -                          smu_get_message_name(smu, msg), index,
> > >> param,
> > >> -                          ret);
> > >> -                  ret = -EIO;
> > >> -          }
> > >> -          goto out;
> > >> +  reg = __smu_cmn_poll_stat(smu);
> > >> +  if (reg == 0 || reg == 0xFC) {
> > >> +          res = __smu_cmn_reg2errno(smu, reg);
> > >> +          __smu_cmn_reg_print_error(smu, reg, index, param, msg);
> > >> +          goto Out;
> > >>    }
> > >> -
> > >> +  __smu_cmn_send_msg(smu, (uint16_t) index, param);
> > >> +  reg = __smu_cmn_poll_stat(smu);
> > >> +  res = __smu_cmn_reg2errno(smu, reg);
> > >>    if (read_arg)
> > >>            smu_cmn_read_arg(smu, read_arg);
> > >> -
> > >> -  ret = 0; /* 0 as driver return value */
> > >> -out:
> > >> +Out:
> > >>    mutex_unlock(&smu->message_lock);
> > >> -  return ret;
> > >> +  return res;
> > >>  }
> > >>
> > >>  int smu_cmn_send_smc_msg(struct smu_context *smu, diff --git
> > >> a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> > >> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> > >> index 9add5f16ff562a..16993daa2ae042 100644
> > >> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> > >> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> > >> @@ -27,7 +27,8 @@
> > >>
> > >>  #if defined(SWSMU_CODE_LAYER_L2) ||
> > defined(SWSMU_CODE_LAYER_L3)
> > >> || defined(SWSMU_CODE_LAYER_L4)
> > >>  int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> > >> -                               uint16_t msg, uint32_t param);
> > >> +                               uint16_t msg_index,
> > >> +                               uint32_t param);
> > >>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> > >>                                enum smu_message_type msg,
> > >>                                uint32_t param,
> > >> --
> > >> 2.32.0
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
  2021-07-14 15:22       ` Alex Deucher
@ 2021-07-14 16:54         ` Luben Tuikov
  0 siblings, 0 replies; 11+ messages in thread
From: Luben Tuikov @ 2021-07-14 16:54 UTC (permalink / raw)
  To: Alex Deucher, Quan, Evan
  Cc: Deucher, Alexander, Baluja, Aaron, amd-gfx, Alimucaj, Andi

On 2021-07-14 11:22 a.m., Alex Deucher wrote:
> On Tue, Jul 13, 2021 at 10:01 PM Quan, Evan <Evan.Quan@amd.com> wrote:
>> [AMD Official Use Only]
>>
>>
>>
>>> -----Original Message-----
>>> From: Tuikov, Luben <Luben.Tuikov@amd.com>
>>> Sent: Wednesday, July 14, 2021 1:36 AM
>>> To: Quan, Evan <Evan.Quan@amd.com>; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Alimucaj, Andi
>>> <Anduil.Alimucaj@amd.com>; Baluja, Aaron <Aaron.Baluja@amd.com>
>>> Subject: Re: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
>>>
>>> On 2021-07-13 3:07 a.m., Quan, Evan wrote:
>>>> [AMD Official Use Only]
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Tuikov, Luben <Luben.Tuikov@amd.com>
>>>>> Sent: Monday, July 12, 2021 11:31 PM
>>>>> To: amd-gfx@lists.freedesktop.org
>>>>> Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Deucher, Alexander
>>>>> <Alexander.Deucher@amd.com>; Quan, Evan <Evan.Quan@amd.com>
>>>>> Subject: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
>>>>>
>>>>> This fixes a bug which if we probe a non-existing I2C device, and the
>>>>> SMU returns 0xFF,
>>>> [Quan, Evan] Do we have to probe I2C device via issuing commands to SMU
>>> and check existence via SMU response?
>>>
>>> Yes, yes we do.
>>>
>>>> Is there other more friendly way?
>>> No, there isn't.
>>>
>>>> >from then on
>>>>> we can never communicate with the SMU, because the code before this
>>>>> patch reads and interprets 0xFF as a terminal error, and thus we
>>>>> never write 0 into register 90 to clear the status (and subsequently
>>>>> send a new command to the SMU.)
>>>>>
>>>>> It is not an error that the SMU returns status 0xFF. This means that
>>>>> the SMU executed the last command successfully (execution status),
>>>>> but the command result is an error of some sort (execution result),
>>>>> depending on what the command was.
>>>>>
>>>>> When doing a status check of the SMU, before we send a new command,
>>>>> the only status which precludes us from sending a new command is
>>>>> 0--the SMU hasn't finished executing a previous command, and
>>>>> 0xFC--the SMU is busy.
>>>>>
>>>>> This bug was seen as the following line in the kernel log,
>>>>>
>>>>> amdgpu: Msg issuing pre-check failed(0xff) and SMU may be not in the
>>>>> right state!
>>>> [Quan, Evan] This was designed to prevent failure scenario from ruin.
>>>> Via this, we can prevent those SMU command/response related registers
>>> overwritten.
>>>> Then PMFW team can known which command invoked the first error.
>>> Sorry, this is not correct.
>>>
>>> The interface cannot block valid access to the SMU firmware, just because a
>>> command executed successfully, but with a failed result, i.e. set a clock
>>> frequency to such-and-such frequency was executed successfully by the
>>> SMU, but the frequency wasn't able to be set as required--the SMU IS NOT
>>> BLOCKED, but can execute more commands.
>> [Quan, Evan]
>> 1. First of all, if the response from SMU is not 1, it means SMU must detected something wrong.
>> We(driver) should properly handle that. I do not think it's a good idea to silently ignore that and proceed more commands.
>> 2. Please be noticed that many commands(SMU messages) have dependence with each other. It means even if the further command
>> can be executed "successfully", it may be not executed in the expected way.
>>> If the PMFW team wants to know which command invoked "the first error",
>>> they can check this explicitly, they can call smu_cmn_wait_for_response(),
>>> just like the reset code does--see the reset code.
>> [Quan, Evan] Per my knowledge gained during co-work with PMFW team, they expect no further driver-smu interaction(driver stops issuing command)
>>  when something went wrong. As they highly rely on SMU internal statistics for their debugging and further interaction may ruin them.
>>> The way commit fcb1fe9c9e0031 modified the code, it blocks access to the
>>> SMU for various other users of the SMU, not least of which is the I2C.
>> [Quan, Evan] I'm wondering can we just list the I2C case as an exception. I means for the SMU command related with I2C we always assume the response is 1.
>> For other commands, we just leave them as they are.
> Maybe we need to just bubble this up to the callers and let each one
> handle it as appropriate.  I don't think just throwing up our hands on
> any error is the right thing to do.  The i2c case is a good example.
> In other cases, we could probably just retry the transaction or just
> ignore the error.

My thoughts exactly. I'll generate a v3 of the patch to address the issues brought here and elsewhere.

Regards,
Luben

>
> Alex
>
>
>> BR
>> Evan
>>> Regards,
>>> Luben
>>>
>>>> BR
>>>> Evan
>>>>> when subsequent SMU commands, not necessarily related to I2C, were
>>>>> sent to the SMU.
>>>>>
>>>>> This patch fixes this bug.
>>>>>
>>>>> Cc: Alex Deucher <Alexander.Deucher@amd.com>
>>>>> Cc: Evan Quan <evan.quan@amd.com>
>>>>> Fixes: fcb1fe9c9e0031 ("drm/amd/powerplay: pre-check the SMU state
>>>>> before issuing message")
>>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>>> ---
>>>>>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 196
>>>>> +++++++++++++++++++------
>>>>>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h |   3 +-
>>>>>  2 files changed, 152 insertions(+), 47 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> index c902fdf322c1be..775eb50a2e49a6 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> @@ -55,7 +55,7 @@
>>>>>
>>>>>  #undef __SMU_DUMMY_MAP
>>>>>  #define __SMU_DUMMY_MAP(type)     #type
>>>>> -static const char* __smu_message_names[] = {
>>>>> +static const char * const __smu_message_names[] = {
>>>>>    SMU_MESSAGE_TYPES
>>>>>  };
>>>>>
>>>>> @@ -76,46 +76,161 @@ static void smu_cmn_read_arg(struct
>>> smu_context
>>>>> *smu,
>>>>>    *arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);  }
>>>>>
>>>>> -int smu_cmn_wait_for_response(struct smu_context *smu)
>>>>> +/**
>>>>> + * __smu_cmn_poll_stat -- poll for a status from the SMU
>>>>> + * smu: a pointer to SMU context
>>>>> + *
>>>>> + * Returns the status of the SMU, which could be,
>>>>> + * 0, the SMU is busy with your previous command;
>>>>> + * 1,    execution status: success, execution result: success;
>>>>> + * 0xFF, execution status: success, execution result: failure;
>>>>> + * 0xFE, unknown command;
>>>>> + * 0xFD, valid command, but bad (command) prerequisites;
>>>>> + * 0xFC, the command was rejected as the SMU is busy;
>>>>> + * 0xFB, "SMC_Result_DebugDataDumpEnd".
>>>>> + */
>>>>> +static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>>>>>  {
>>>>>    struct amdgpu_device *adev = smu->adev;
>>>>> -  uint32_t cur_value, i, timeout = adev->usec_timeout * 20;
>>>>> +  int timeout = adev->usec_timeout * 20;
>>>>> +  u32 reg;
>>>>>
>>>>> -  for (i = 0; i < timeout; i++) {
>>>>> -          cur_value = RREG32_SOC15(MP1, 0,
>>>>> mmMP1_SMN_C2PMSG_90);
>>>>> -          if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>>>> -                  return cur_value;
>>>>> +  for ( ; timeout > 0; timeout--) {
>>>>> +          reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>>>> +          if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>>>> +                  break;
>>>>>
>>>>>            udelay(1);
>>>>>    }
>>>>>
>>>>> -  /* timeout means wrong logic */
>>>>> -  if (i == timeout)
>>>>> -          return -ETIME;
>>>>> -
>>>>> -  return RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>>>> +  return reg;
>>>>>  }
>>>>>
>>>>> -int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>>>>> -                               uint16_t msg, uint32_t param)
>>>>> +static void __smu_cmn_reg_print_error(struct smu_context *smu,
>>>>> +                                u32 reg_c2pmsg_90,
>>>>> +                                int msg_index,
>>>>> +                                u32 param,
>>>>> +                                enum smu_message_type msg)
>>>>>  {
>>>>>    struct amdgpu_device *adev = smu->adev;
>>>>> -  int ret;
>>>>> +  const char *message = smu_get_message_name(smu, msg);
>>>>>
>>>>> -  ret = smu_cmn_wait_for_response(smu);
>>>>> -  if (ret != 0x1) {
>>>>> -          dev_err(adev->dev, "Msg issuing pre-check failed(0x%x) and
>>>>> "
>>>>> -                 "SMU may be not in the right state!\n", ret);
>>>>> -          if (ret != -ETIME)
>>>>> -                  ret = -EIO;
>>>>> -          return ret;
>>>>> +  switch (reg_c2pmsg_90) {
>>>>> +  case 0:
>>>>> +          dev_err_ratelimited(adev->dev,
>>>>> +                              "SMU: I'm not done with your previous
>>>>> command!");
>>>>> +          break;
>>>>> +  case 1:
>>>>> +          /* The SMU executed the command. It completed with a
>>>>> +           * successful result.
>>>>> +           */
>>>>> +          break;
>>>>> +  case 0xFF:
>>>>> +          /* The SMU executed the command. It completed with a
>>>>> +           * unsuccessful result.
>>>>> +           */
>>>>> +          break;
>>>>> +  case 0xFE:
>>>>> +          dev_err_ratelimited(adev->dev,
>>>>> +                              "SMU: unknown command: index:%d
>>>>> param:0x%08X message:%s",
>>>>> +                              msg_index, param, message);
>>>>> +          break;
>>>>> +  case 0xFD:
>>>>> +          dev_err_ratelimited(adev->dev,
>>>>> +                              "SMU: valid command, bad prerequisites:
>>>>> index:%d param:0x%08X message:%s",
>>>>> +                              msg_index, param, message);
>>>>> +          break;
>>>>> +  case 0xFC:
>>>>> +          dev_err_ratelimited(adev->dev,
>>>>> +                              "SMU: I'm very busy for your command:
>>>>> index:%d param:0x%08X message:%s",
>>>>> +                              msg_index, param, message);
>>>>> +          break;
>>>>> +  case 0xFB:
>>>>> +          dev_err_ratelimited(adev->dev,
>>>>> +                              "SMU: I'm debugging!");
>>>>> +          break;
>>>>> +  default:
>>>>> +          dev_err_ratelimited(adev->dev,
>>>>> +                              "SMU: response:0x%08X for index:%d
>>>>> param:0x%08X message:%s?",
>>>>> +                              reg_c2pmsg_90, msg_index, param,
>>>>> message);
>>>>> +          break;
>>>>> +  }
>>>>> +}
>>>>> +
>>>>> +static int __smu_cmn_reg2errno(struct smu_context *smu, u32
>>>>> reg_c2pmsg_90)
>>>>> +{
>>>>> +  int res;
>>>>> +
>>>>> +  switch (reg_c2pmsg_90) {
>>>>> +  case 0:
>>>>> +          res = -ETIME;
>>>>> +          break;
>>>>> +  case 1:
>>>>> +          res = 0;
>>>>> +          break;
>>>>> +  case 0xFF:
>>>>> +          res = -EIO;
>>>>> +          break;
>>>>> +  case 0xFE:
>>>>> +          res = -EOPNOTSUPP;
>>>>> +          break;
>>>>> +  case 0xFD:
>>>>> +          res = -EIO;
>>>>> +          break;
>>>>> +  case 0xFC:
>>>>> +          res = -EBUSY;
>>>>> +          break;
>>>>> +  case 0xFB:
>>>>> +          res = -EIO;
>>>>> +          break;
>>>>> +  default:
>>>>> +          res = -EIO;
>>>>> +          break;
>>>>>    }
>>>>>
>>>>> +  return res;
>>>>> +}
>>>>> +
>>>>> +static void __smu_cmn_send_msg(struct smu_context *smu,
>>>>> +                         u16 msg,
>>>>> +                         u32 param)
>>>>> +{
>>>>> +  struct amdgpu_device *adev = smu->adev;
>>>>> +
>>>>>    WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90, 0);
>>>>>    WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82, param);
>>>>>    WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
>>>>> +}
>>>>>
>>>>> -  return 0;
>>>>> +int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>>>>> +                               uint16_t msg_index,
>>>>> +                               uint32_t param)
>>>>> +{
>>>>> +  u32 reg;
>>>>> +  int res;
>>>>> +
>>>>> +  if (smu->adev->in_pci_err_recovery)
>>>>> +          return 0;
>>>>> +
>>>>> +  mutex_lock(&smu->message_lock);
>>>>> +  reg = __smu_cmn_poll_stat(smu);
>>>>> +  if (reg == 0 || reg == 0xFC) {
>>>>> +          res = __smu_cmn_reg2errno(smu, reg);
>>>>> +          goto Out;
>>>>> +  }
>>>>> +  __smu_cmn_send_msg(smu, msg_index, param);
>>>>> +  res = 0;
>>>>> +Out:
>>>>> +  mutex_unlock(&smu->message_lock);
>>>>> +  return res;
>>>>> +}
>>>>> +
>>>>> +int smu_cmn_wait_for_response(struct smu_context *smu) {
>>>>> +  u32 reg;
>>>>> +
>>>>> +  reg = __smu_cmn_poll_stat(smu);
>>>>> +  return __smu_cmn_reg2errno(smu, reg);
>>>>>  }
>>>>>
>>>>>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>>> @@
>>>>> -123,8 +238,8 @@ int smu_cmn_send_smc_msg_with_param(struct
>>>>> smu_context *smu,
>>>>>                                uint32_t param,
>>>>>                                uint32_t *read_arg)
>>>>>  {
>>>>> -  struct amdgpu_device *adev = smu->adev;
>>>>> -  int ret = 0, index = 0;
>>>>> +  int res, index;
>>>>> +  u32 reg;
>>>>>
>>>>>    if (smu->adev->in_pci_err_recovery)
>>>>>            return 0;
>>>>> @@ -136,31 +251,20 @@ int
>>> smu_cmn_send_smc_msg_with_param(struct
>>>>> smu_context *smu,
>>>>>            return index == -EACCES ? 0 : index;
>>>>>
>>>>>    mutex_lock(&smu->message_lock);
>>>>> -  ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
>>>>> param);
>>>>> -  if (ret)
>>>>> -          goto out;
>>>>> -
>>>>> -  ret = smu_cmn_wait_for_response(smu);
>>>>> -  if (ret != 0x1) {
>>>>> -          if (ret == -ETIME) {
>>>>> -                  dev_err(adev->dev, "message: %15s (%d) \tparam:
>>>>> 0x%08x is timeout (no response)\n",
>>>>> -                          smu_get_message_name(smu, msg), index,
>>>>> param);
>>>>> -          } else {
>>>>> -                  dev_err(adev->dev, "failed send message: %15s (%d)
>>>>> \tparam: 0x%08x response %#x\n",
>>>>> -                          smu_get_message_name(smu, msg), index,
>>>>> param,
>>>>> -                          ret);
>>>>> -                  ret = -EIO;
>>>>> -          }
>>>>> -          goto out;
>>>>> +  reg = __smu_cmn_poll_stat(smu);
>>>>> +  if (reg == 0 || reg == 0xFC) {
>>>>> +          res = __smu_cmn_reg2errno(smu, reg);
>>>>> +          __smu_cmn_reg_print_error(smu, reg, index, param, msg);
>>>>> +          goto Out;
>>>>>    }
>>>>> -
>>>>> +  __smu_cmn_send_msg(smu, (uint16_t) index, param);
>>>>> +  reg = __smu_cmn_poll_stat(smu);
>>>>> +  res = __smu_cmn_reg2errno(smu, reg);
>>>>>    if (read_arg)
>>>>>            smu_cmn_read_arg(smu, read_arg);
>>>>> -
>>>>> -  ret = 0; /* 0 as driver return value */
>>>>> -out:
>>>>> +Out:
>>>>>    mutex_unlock(&smu->message_lock);
>>>>> -  return ret;
>>>>> +  return res;
>>>>>  }
>>>>>
>>>>>  int smu_cmn_send_smc_msg(struct smu_context *smu, diff --git
>>>>> a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>>> index 9add5f16ff562a..16993daa2ae042 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>>> @@ -27,7 +27,8 @@
>>>>>
>>>>>  #if defined(SWSMU_CODE_LAYER_L2) ||
>>> defined(SWSMU_CODE_LAYER_L3)
>>>>> || defined(SWSMU_CODE_LAYER_L4)
>>>>>  int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>>>>> -                               uint16_t msg, uint32_t param);
>>>>> +                               uint16_t msg_index,
>>>>> +                               uint32_t param);
>>>>>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>>>>>                                enum smu_message_type msg,
>>>>>                                uint32_t param,
>>>>> --
>>>>> 2.32.0
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CLuben.Tuikov%40amd.com%7C0034eccb90234c6ba0d908d946db48a1%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637618729900432066%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=DYigx7bg6kAoZOeQKP2olVlwmrombaAnVpVNDXhqwzg%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
  2021-07-14 15:19   ` Alex Deucher
@ 2021-07-14 16:57     ` Luben Tuikov
  0 siblings, 0 replies; 11+ messages in thread
From: Luben Tuikov @ 2021-07-14 16:57 UTC (permalink / raw)
  To: Alex Deucher, Lazar, Lijo; +Cc: Alex Deucher, Evan Quan, amd-gfx list

On 2021-07-14 11:19 a.m., Alex Deucher wrote:
> On Mon, Jul 12, 2021 at 10:56 PM Lazar, Lijo <lijo.lazar@amd.com> wrote:
>>
>>
>> On 7/12/2021 9:00 PM, Luben Tuikov wrote:
>>> This fixes a bug which if we probe a non-existing
>>> I2C device, and the SMU returns 0xFF, from then on
>>> we can never communicate with the SMU, because the
>>> code before this patch reads and interprets 0xFF
>>> as a terminal error, and thus we never write 0
>>> into register 90 to clear the status (and
>>> subsequently send a new command to the SMU.)
>>>
>>> It is not an error that the SMU returns status
>>> 0xFF. This means that the SMU executed the last
>>> command successfully (execution status), but the
>>> command result is an error of some sort (execution
>>> result), depending on what the command was.
>>>
>>> When doing a status check of the SMU, before we
>>> send a new command, the only status which
>>> precludes us from sending a new command is 0--the
>>> SMU hasn't finished executing a previous command,
>>> and 0xFC--the SMU is busy.
>>>
>>> This bug was seen as the following line in the
>>> kernel log,
>>>
>>> amdgpu: Msg issuing pre-check failed(0xff) and SMU may be not in the right state!
>>>
>>> when subsequent SMU commands, not necessarily
>>> related to I2C, were sent to the SMU.
>>>
>>> This patch fixes this bug.
>>>
>>> Cc: Alex Deucher <Alexander.Deucher@amd.com>
>>> Cc: Evan Quan <evan.quan@amd.com>
>>> Fixes: fcb1fe9c9e0031 ("drm/amd/powerplay: pre-check the SMU state before issuing message")
>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 196 +++++++++++++++++++------
>>>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h |   3 +-
>>>   2 files changed, 152 insertions(+), 47 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>> index c902fdf322c1be..775eb50a2e49a6 100644
>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>> @@ -55,7 +55,7 @@
>>>
>>>   #undef __SMU_DUMMY_MAP
>>>   #define __SMU_DUMMY_MAP(type)       #type
>>> -static const char* __smu_message_names[] = {
>>> +static const char * const __smu_message_names[] = {
>>>       SMU_MESSAGE_TYPES
>>>   };
>>>
>>> @@ -76,46 +76,161 @@ static void smu_cmn_read_arg(struct smu_context *smu,
>>>       *arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
>>>   }
>>>
>>> -int smu_cmn_wait_for_response(struct smu_context *smu)
>>> +/**
>>> + * __smu_cmn_poll_stat -- poll for a status from the SMU
>>> + * smu: a pointer to SMU context
>>> + *
>>> + * Returns the status of the SMU, which could be,
>>> + * 0, the SMU is busy with your previous command;
>>> + * 1,    execution status: success, execution result: success;
>>> + * 0xFF, execution status: success, execution result: failure;
>>> + * 0xFE, unknown command;
>>> + * 0xFD, valid command, but bad (command) prerequisites;
>>> + * 0xFC, the command was rejected as the SMU is busy;
>>> + * 0xFB, "SMC_Result_DebugDataDumpEnd".
>>> + */
>> These are the response codes defined in header (0xFB is somehow missing)
>> // SMU Response Codes:
>> #define PPSMC_Result_OK                    0x1
>> #define PPSMC_Result_Failed                0xFF
>> #define PPSMC_Result_UnknownCmd            0xFE
>> #define PPSMC_Result_CmdRejectedPrereq     0xFD
>> #define PPSMC_Result_CmdRejectedBusy       0xFC
>>
>> It's better to use #defines for these, usually we follow a convention
>> like SMU_
> We could do a MAP_RESULT() macro like we do with the messages, etc. to
> make them per asic, but that may be overkill as I think these result
> codes have been the same across asics for a long time.

I think this would be best done in a subsequent/follow-up patch, since
it doesn't affect the behaviour of the system. I feel that such a change, while cosmetic,
would be somewhat involved and deserves its own patch and review, but for now,
we should get the system going.

I also think that ideally we'd want this to arrive from the SMU team perhaps,
so that we're impervious to such changes.

Regards,
Luben


>
> Alex
>
>> Ex:
>>         #define SMU_RESP_RESULT_OK 0x1
>>
>>
>>> +static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>>>   {
>>>       struct amdgpu_device *adev = smu->adev;
>>> -     uint32_t cur_value, i, timeout = adev->usec_timeout * 20;
>>> +     int timeout = adev->usec_timeout * 20;
>>> +     u32 reg;
>>>
>>> -     for (i = 0; i < timeout; i++) {
>>> -             cur_value = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>> -             if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>> -                     return cur_value;
>>> +     for ( ; timeout > 0; timeout--) {
>>> +             reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>> +             if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
>>> +                     break;
>>>
>>>               udelay(1);
>>>       }
>>>
>>> -     /* timeout means wrong logic */
>>> -     if (i == timeout)
>>> -             return -ETIME;
>>> -
>>> -     return RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
>>> +     return reg;
>>>   }
>>>
>>> -int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>>> -                                  uint16_t msg, uint32_t param)
>>> +static void __smu_cmn_reg_print_error(struct smu_context *smu,
>>> +                                   u32 reg_c2pmsg_90,
>> Instead of using reg/regname in function, it would be better to name it
>> as smu_cmn_resp/smu_resp or similar to make it clear that we are
>> decoding smu response.
>>
>>> +                                   int msg_index,
>>> +                                   u32 param,
>>> +                                   enum smu_message_type msg)
>>>   {
>>>       struct amdgpu_device *adev = smu->adev;
>>> -     int ret;
>>> +     const char *message = smu_get_message_name(smu, msg);
>>>
>>> -     ret = smu_cmn_wait_for_response(smu);
>>> -     if (ret != 0x1) {
>>> -             dev_err(adev->dev, "Msg issuing pre-check failed(0x%x) and "
>>> -                    "SMU may be not in the right state!\n", ret);
>>> -             if (ret != -ETIME)
>>> -                     ret = -EIO;
>>> -             return ret;
>>> +     switch (reg_c2pmsg_90) {
>>> +     case 0:
>>> +             dev_err_ratelimited(adev->dev,
>>> +                                 "SMU: I'm not done with your previous command!");
>>> +             break;
>>> +     case 1:
>>> +             /* The SMU executed the command. It completed with a
>>> +              * successful result.
>>> +              */
>>> +             break;
>>> +     case 0xFF:
>>> +             /* The SMU executed the command. It completed with a
>>> +              * unsuccessful result.
>>> +              */
>>> +             break;
>>> +     case 0xFE:
>>> +             dev_err_ratelimited(adev->dev,
>>> +                                 "SMU: unknown command: index:%d param:0x%08X message:%s",
>>> +                                 msg_index, param, message);
>>> +             break;
>>> +     case 0xFD:
>>> +             dev_err_ratelimited(adev->dev,
>>> +                                 "SMU: valid command, bad prerequisites: index:%d param:0x%08X message:%s",
>>> +                                 msg_index, param, message);
>>> +             break;
>>> +     case 0xFC:
>>> +             dev_err_ratelimited(adev->dev,
>>> +                                 "SMU: I'm very busy for your command: index:%d param:0x%08X message:%s",
>>> +                                 msg_index, param, message);
>>> +             break;
>>> +     case 0xFB:
>>> +             dev_err_ratelimited(adev->dev,
>>> +                                 "SMU: I'm debugging!");
>>> +             break;
>>> +     default:
>>> +             dev_err_ratelimited(adev->dev,
>>> +                                 "SMU: response:0x%08X for index:%d param:0x%08X message:%s?",
>>> +                                 reg_c2pmsg_90, msg_index, param, message);
>>> +             break;
>>> +     }
>>> +}
>>> +
>>> +static int __smu_cmn_reg2errno(struct smu_context *smu, u32 reg_c2pmsg_90)
>> Same comment on naming - resp2errno?
>>> +{
>>> +     int res;
>>> +
>>> +     switch (reg_c2pmsg_90) {
>>> +     case 0:
>>> +             res = -ETIME;
>>> +             break;
>>> +     case 1:
>>> +             res = 0;
>>> +             break;
>>> +     case 0xFF:
>>> +             res = -EIO;
>>> +             break;
>>> +     case 0xFE:
>>> +             res = -EOPNOTSUPP;
>>> +             break;
>>> +     case 0xFD:
>>> +             res = -EIO;
>>> +             break;
>>> +     case 0xFC:
>>> +             res = -EBUSY;
>>> +             break;
>>> +     case 0xFB:
>>> +             res = -EIO;
>>> +             break;
>>> +     default:
>>> +             res = -EIO;
>>> +             break;
>>>       }
>>>
>>> +     return res;
>>> +}
>>> +
>>> +static void __smu_cmn_send_msg(struct smu_context *smu,
>>> +                            u16 msg,
>>> +                            u32 param)
>>> +{
>>> +     struct amdgpu_device *adev = smu->adev;
>>> +
>>>       WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90, 0);
>>>       WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82, param);
>>>       WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
>>> +}
>>>
>>> -     return 0;
>>> +int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>>> +                                  uint16_t msg_index,
>>> +                                  uint32_t param)
>>> +{
>>> +     u32 reg;
>>> +     int res;
>>> +
>>> +     if (smu->adev->in_pci_err_recovery)
>>> +             return 0;
>>> +
>>> +     mutex_lock(&smu->message_lock);
>>> +     reg = __smu_cmn_poll_stat(smu);
>>> +     if (reg == 0 || reg == 0xFC) {
>> The problem with 0xFC check is it could be the response of a previous
>> message. It could mean that FW was busy when the prev message was sent,
>> not now.
>>
>> There is a default case (value not in any of the predefined error
>> codes), that should be considered here also. That happens sometimes and
>> usually that means FW is in undefined state.
>>
>>
>>> +             res = __smu_cmn_reg2errno(smu, reg);
>>> +             goto Out;
>> Label naming style, lower case?.
>>
>>> +     }
>>> +     __smu_cmn_send_msg(smu, msg_index, param);
>>> +     res = 0;
>>> +Out:
>>> +     mutex_unlock(&smu->message_lock);
>>> +     return res;
>>> +}
>>> +
>>> +int smu_cmn_wait_for_response(struct smu_context *smu)
>>> +{
>>> +     u32 reg;
>>> +
>>> +     reg = __smu_cmn_poll_stat(smu);
>>> +     return __smu_cmn_reg2errno(smu, reg);
>>>   }
>>>
>>>   int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>>> @@ -123,8 +238,8 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>>>                                   uint32_t param,
>>>                                   uint32_t *read_arg)
>>>   {
>>> -     struct amdgpu_device *adev = smu->adev;
>>> -     int ret = 0, index = 0;
>>> +     int res, index;
>>> +     u32 reg;
>>>
>>>       if (smu->adev->in_pci_err_recovery)
>>>               return 0;
>>> @@ -136,31 +251,20 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>>>               return index == -EACCES ? 0 : index;
>>>
>>>       mutex_lock(&smu->message_lock);
>>> -     ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, param);
>>> -     if (ret)
>>> -             goto out;
>>> -
>>> -     ret = smu_cmn_wait_for_response(smu);
>>> -     if (ret != 0x1) {
>>> -             if (ret == -ETIME) {
>>> -                     dev_err(adev->dev, "message: %15s (%d) \tparam: 0x%08x is timeout (no response)\n",
>>> -                             smu_get_message_name(smu, msg), index, param);
>>> -             } else {
>>> -                     dev_err(adev->dev, "failed send message: %15s (%d) \tparam: 0x%08x response %#x\n",
>>> -                             smu_get_message_name(smu, msg), index, param,
>>> -                             ret);
>>> -                     ret = -EIO;
>>> -             }
>>> -             goto out;
>>> +     reg = __smu_cmn_poll_stat(smu);
>>> +     if (reg == 0 || reg == 0xFC) {
>> Same comments as for without_waiting case.
>>
>>> +             res = __smu_cmn_reg2errno(smu, reg);
>>> +             __smu_cmn_reg_print_error(smu, reg, index, param, msg);
>> This precheck fail print is missing in without_waiting message.
>>
>>> +             goto Out; >     }
>>> -
>>> +     __smu_cmn_send_msg(smu, (uint16_t) index, param);
>>> +     reg = __smu_cmn_poll_stat(smu);
>>> +     res = __smu_cmn_reg2errno(smu, reg);
>> Using smu_cmn_wait_for_response instead of these two makes the intent
>> clearer - that we are waiting for the response.
>>
>> We need a print here as well if the message has failed.
>>
>> Thanks,
>> Lijo
>>
>>>       if (read_arg)
>>>               smu_cmn_read_arg(smu, read_arg);
>>> -
>>> -     ret = 0; /* 0 as driver return value */
>>> -out:
>>> +Out:
>>>       mutex_unlock(&smu->message_lock);
>>> -     return ret;
>>> +     return res;
>>>   }
>>>
>>>   int smu_cmn_send_smc_msg(struct smu_context *smu,
>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>> index 9add5f16ff562a..16993daa2ae042 100644
>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>> @@ -27,7 +27,8 @@
>>>
>>>   #if defined(SWSMU_CODE_LAYER_L2) || defined(SWSMU_CODE_LAYER_L3) || defined(SWSMU_CODE_LAYER_L4)
>>>   int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>>> -                                  uint16_t msg, uint32_t param);
>>> +                                  uint16_t msg_index,
>>> +                                  uint32_t param);
>>>   int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>>>                                   enum smu_message_type msg,
>>>                                   uint32_t param,
>>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7Cluben.tuikov%40amd.com%7C63c12227eaee4417d06c08d946dad4be%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637618727955855924%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=Au4k6zam4oVtRNl2JA%2BNEQTGjiOLxZULDKQnYDQg9ho%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2021-07-14 16:57 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-12 15:30 [PATCH] drm/amd/pm: Fix a bug communicating with the SMU Luben Tuikov
2021-07-13  2:56 ` Lazar, Lijo
2021-07-13 17:18   ` Luben Tuikov
2021-07-14 15:19   ` Alex Deucher
2021-07-14 16:57     ` Luben Tuikov
2021-07-13  7:07 ` Quan, Evan
2021-07-13 17:35   ` Luben Tuikov
2021-07-14  2:01     ` Quan, Evan
2021-07-14  4:04       ` Luben Tuikov
2021-07-14 15:22       ` Alex Deucher
2021-07-14 16:54         ` Luben Tuikov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.