All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v4 0/6] VCN and JPEG RAS poison detection
@ 2022-03-28  6:24 Mohammad Zafar Ziya
  2022-03-28  6:24 ` [PATCH v4 1/6] drm/amdgpu: Add vcn and jpeg ras support flag Mohammad Zafar Ziya
                   ` (6 more replies)
  0 siblings, 7 replies; 20+ messages in thread
From: Mohammad Zafar Ziya @ 2022-03-28  6:24 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, lijo.lazar, Mohammad Zafar Ziya, Hawking.Zhang

VCN and JPEG RAS poison detection

Mohammad Zafar Ziya (6):
  drm/amdgpu: Add vcn and jpeg ras support flag
  drm/amdgpu/vcn: Add vcn ras support
  drm/amdgpu/jpeg: Add jpeg block ras support
  drm/amdgpu/vcn: vcn and jpeg ver 2.6 ras register definition
  drm/amdgpu/vcn: VCN ras error query support
  drm/amdgpu/jpeg: jpeg ras error query support

 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h      |  8 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |  9 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h       | 10 +++
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c        | 74 +++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.h        |  7 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c         | 71 ++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h         |  6 ++
 .../amd/include/asic_reg/vcn/vcn_2_5_offset.h | 13 ++++
 .../include/asic_reg/vcn/vcn_2_5_sh_mask.h    | 24 ++++++
 10 files changed, 225 insertions(+)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v4 1/6] drm/amdgpu: Add vcn and jpeg ras support flag
  2022-03-28  6:24 [PATCH v4 0/6] VCN and JPEG RAS poison detection Mohammad Zafar Ziya
@ 2022-03-28  6:24 ` Mohammad Zafar Ziya
  2022-03-28  6:24 ` [PATCH v4 2/6] drm/amdgpu/vcn: Add vcn ras support Mohammad Zafar Ziya
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 20+ messages in thread
From: Mohammad Zafar Ziya @ 2022-03-28  6:24 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, lijo.lazar, Mohammad Zafar Ziya, Hawking.Zhang

Add vcn and jpeg ras support options

V2: vcn and jpeg ras flag enabled for aldebaran asic only

V3: vcn and jpeg ras flag disabled for error counter query
Generic poison query interface added
VCN and JPEG ras enabled based on IP version check

V4: vcn and jpeg ras flag moved under ecc flag for dGPU

Signed-off-by: Mohammad Zafar Ziya <Mohammadzafar.ziya@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f9104f99eb9c..4bbed76b79c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -66,6 +66,8 @@ const char *ras_block_string[] = {
 	"mp1",
 	"fuse",
 	"mca",
+	"vcn",
+	"jpeg",
 };
 
 const char *ras_mca_block_string[] = {
@@ -2205,6 +2207,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 			dev_info(adev->dev, "SRAM ECC is active.\n");
 			adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
 						    1 << AMDGPU_RAS_BLOCK__DF);
+
+			if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0))
+				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+						1 << AMDGPU_RAS_BLOCK__JPEG);
+			else
+				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+						1 << AMDGPU_RAS_BLOCK__JPEG);
 		} else {
 			dev_info(adev->dev, "SRAM ECC is not presented.\n");
 		}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 9314fde81e68..1e1a3b736859 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -49,6 +49,8 @@ enum amdgpu_ras_block {
 	AMDGPU_RAS_BLOCK__MP1,
 	AMDGPU_RAS_BLOCK__FUSE,
 	AMDGPU_RAS_BLOCK__MCA,
+	AMDGPU_RAS_BLOCK__VCN,
+	AMDGPU_RAS_BLOCK__JPEG,
 
 	AMDGPU_RAS_BLOCK__LAST
 };
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v4 2/6] drm/amdgpu/vcn: Add vcn ras support
  2022-03-28  6:24 [PATCH v4 0/6] VCN and JPEG RAS poison detection Mohammad Zafar Ziya
  2022-03-28  6:24 ` [PATCH v4 1/6] drm/amdgpu: Add vcn and jpeg ras support flag Mohammad Zafar Ziya
@ 2022-03-28  6:24 ` Mohammad Zafar Ziya
  2022-03-28  6:40   ` Paul Menzel
  2022-03-28  6:24 ` [PATCH v4 3/6] drm/amdgpu/jpeg: Add jpeg block " Mohammad Zafar Ziya
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 20+ messages in thread
From: Mohammad Zafar Ziya @ 2022-03-28  6:24 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, lijo.lazar, Mohammad Zafar Ziya, Hawking.Zhang

VCN block ras feature support addition

V2: default ras callback removed

Signed-off-by: Mohammad Zafar Ziya <Mohammadzafar.ziya@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index e2fde88aaf5e..ea07974ef6f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -24,6 +24,8 @@
 #ifndef __AMDGPU_VCN_H__
 #define __AMDGPU_VCN_H__
 
+#include "amdgpu_ras.h"
+
 #define AMDGPU_VCN_STACK_SIZE		(128*1024)
 #define AMDGPU_VCN_CONTEXT_SIZE 	(512*1024)
 
@@ -232,6 +234,10 @@ struct amdgpu_vcn_inst {
 	struct amdgpu_vcn_fw_shared fw_shared;
 };
 
+struct amdgpu_vcn_ras {
+	struct amdgpu_ras_block_object ras_block;
+};
+
 struct amdgpu_vcn {
 	unsigned		fw_version;
 	struct delayed_work	idle_work;
@@ -251,6 +257,9 @@ struct amdgpu_vcn {
 	unsigned	harvest_config;
 	int (*pause_dpg_mode)(struct amdgpu_device *adev,
 		int inst_idx, struct dpg_pause_state *new_state);
+
+	struct ras_common_if    *ras_if;
+	struct amdgpu_vcn_ras   *ras;
 };
 
 struct amdgpu_fw_shared_rb_ptrs_struct {
@@ -339,4 +348,5 @@ void amdgpu_vcn_setup_ucode(struct amdgpu_device *adev);
 void amdgpu_vcn_fwlog_init(struct amdgpu_vcn_inst *vcn);
 void amdgpu_debugfs_vcn_fwlog_init(struct amdgpu_device *adev,
                                    uint8_t i, struct amdgpu_vcn_inst *vcn);
+
 #endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v4 3/6] drm/amdgpu/jpeg: Add jpeg block ras support
  2022-03-28  6:24 [PATCH v4 0/6] VCN and JPEG RAS poison detection Mohammad Zafar Ziya
  2022-03-28  6:24 ` [PATCH v4 1/6] drm/amdgpu: Add vcn and jpeg ras support flag Mohammad Zafar Ziya
  2022-03-28  6:24 ` [PATCH v4 2/6] drm/amdgpu/vcn: Add vcn ras support Mohammad Zafar Ziya
@ 2022-03-28  6:24 ` Mohammad Zafar Ziya
  2022-03-28  6:24 ` [PATCH v4 4/6] drm/amdgpu/vcn: vcn and jpeg ver 2.6 ras register definition Mohammad Zafar Ziya
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 20+ messages in thread
From: Mohammad Zafar Ziya @ 2022-03-28  6:24 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, lijo.lazar, Mohammad Zafar Ziya, Hawking.Zhang

Ras support addition for JPEG block

V2: removed default callback

Signed-off-by: Mohammad Zafar Ziya <Mohammadzafar.ziya@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
index 55fbff2be761..b6c7fb00e05a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
@@ -24,6 +24,8 @@
 #ifndef __AMDGPU_JPEG_H__
 #define __AMDGPU_JPEG_H__
 
+#include "amdgpu_ras.h"
+
 #define AMDGPU_MAX_JPEG_INSTANCES	2
 
 #define AMDGPU_JPEG_HARVEST_JPEG0 (1 << 0)
@@ -39,6 +41,10 @@ struct amdgpu_jpeg_inst {
 	struct amdgpu_jpeg_reg external;
 };
 
+struct amdgpu_jpeg_ras {
+	struct amdgpu_ras_block_object ras_block;
+};
+
 struct amdgpu_jpeg {
 	uint8_t	num_jpeg_inst;
 	struct amdgpu_jpeg_inst inst[AMDGPU_MAX_JPEG_INSTANCES];
@@ -48,6 +54,8 @@ struct amdgpu_jpeg {
 	enum amd_powergating_state cur_state;
 	struct mutex jpeg_pg_lock;
 	atomic_t total_submission_cnt;
+	struct ras_common_if	*ras_if;
+	struct amdgpu_jpeg_ras	*ras;
 };
 
 int amdgpu_jpeg_sw_init(struct amdgpu_device *adev);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v4 4/6] drm/amdgpu/vcn: vcn and jpeg ver 2.6 ras register definition
  2022-03-28  6:24 [PATCH v4 0/6] VCN and JPEG RAS poison detection Mohammad Zafar Ziya
                   ` (2 preceding siblings ...)
  2022-03-28  6:24 ` [PATCH v4 3/6] drm/amdgpu/jpeg: Add jpeg block " Mohammad Zafar Ziya
@ 2022-03-28  6:24 ` Mohammad Zafar Ziya
  2022-03-28  6:24 ` [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support Mohammad Zafar Ziya
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 20+ messages in thread
From: Mohammad Zafar Ziya @ 2022-03-28  6:24 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, lijo.lazar, Mohammad Zafar Ziya, Hawking.Zhang

Adding vcn and jpeg ver 2.6 ras register definition

Signed-off-by: Mohammad Zafar Ziya <Mohammadzafar.ziya@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 .../amd/include/asic_reg/vcn/vcn_2_5_offset.h | 13 ++++++++++
 .../include/asic_reg/vcn/vcn_2_5_sh_mask.h    | 24 +++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/asic_reg/vcn/vcn_2_5_offset.h b/drivers/gpu/drm/amd/include/asic_reg/vcn/vcn_2_5_offset.h
index 90350f46a0c4..363d2139cea2 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/vcn/vcn_2_5_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/vcn/vcn_2_5_offset.h
@@ -988,4 +988,17 @@
 #define mmMDM_WIG_PIPE_BUSY_BASE_IDX                                                                   1
 
 
+/* VCN 2_6_0 regs */
+#define mmUVD_RAS_VCPU_VCODEC_STATUS                           0x0057
+#define mmUVD_RAS_VCPU_VCODEC_STATUS_BASE_IDX                  1
+#define mmUVD_RAS_MMSCH_FATAL_ERROR                            0x0058
+#define mmUVD_RAS_MMSCH_FATAL_ERROR_BASE_IDX                   1
+
+
+/* JPEG 2_6_0 regs */
+#define mmUVD_RAS_JPEG0_STATUS                                 0x0059
+#define mmUVD_RAS_JPEG0_STATUS_BASE_IDX                        1
+#define mmUVD_RAS_JPEG1_STATUS                                 0x005a
+#define mmUVD_RAS_JPEG1_STATUS_BASE_IDX                        1
+
 #endif
diff --git a/drivers/gpu/drm/amd/include/asic_reg/vcn/vcn_2_5_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/vcn/vcn_2_5_sh_mask.h
index c41c59c30006..8de883b76d90 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/vcn/vcn_2_5_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/vcn/vcn_2_5_sh_mask.h
@@ -3606,4 +3606,28 @@
 #define UVD_LMI_CRC3__CRC32_MASK                                                                              0xFFFFFFFFL
 
 
+/* VCN 2_6_0 UVD_RAS_VCPU_VCODEC_STATUS */
+#define UVD_RAS_VCPU_VCODEC_STATUS__POISONED_VF__SHIFT          0x0
+#define UVD_RAS_VCPU_VCODEC_STATUS__POISONED_PF__SHIFT          0x1f
+#define UVD_RAS_VCPU_VCODEC_STATUS__POISONED_VF_MASK            0x7FFFFFFFL
+#define UVD_RAS_VCPU_VCODEC_STATUS__POISONED_PF_MASK            0x80000000L
+
+/* VCN 2_6_0 UVD_RAS_MMSCH_FATAL_ERROR */
+#define UVD_RAS_MMSCH_FATAL_ERROR__POISONED_VF__SHIFT           0x0
+#define UVD_RAS_MMSCH_FATAL_ERROR__POISONED_PF__SHIFT           0x1f
+#define UVD_RAS_MMSCH_FATAL_ERROR__POISONED_VF_MASK             0x7FFFFFFFL
+#define UVD_RAS_MMSCH_FATAL_ERROR__POISONED_PF_MASK             0x80000000L
+
+/* JPEG 2_6_0 UVD_RAS_JPEG0_STATUS */
+#define UVD_RAS_JPEG0_STATUS__POISONED_VF__SHIFT                0x0
+#define UVD_RAS_JPEG0_STATUS__POISONED_PF__SHIFT                0x1f
+#define UVD_RAS_JPEG0_STATUS__POISONED_VF_MASK                  0x7FFFFFFFL
+#define UVD_RAS_JPEG0_STATUS__POISONED_PF_MASK                  0x80000000L
+
+/* JPEG 2_6_0 UVD_RAS_JPEG1_STATUS */
+#define UVD_RAS_JPEG1_STATUS__POISONED_VF__SHIFT                0x0
+#define UVD_RAS_JPEG1_STATUS__POISONED_PF__SHIFT                0x1f
+#define UVD_RAS_JPEG1_STATUS__POISONED_VF_MASK                  0x7FFFFFFFL
+#define UVD_RAS_JPEG1_STATUS__POISONED_PF_MASK                  0x80000000L
+
 #endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
  2022-03-28  6:24 [PATCH v4 0/6] VCN and JPEG RAS poison detection Mohammad Zafar Ziya
                   ` (3 preceding siblings ...)
  2022-03-28  6:24 ` [PATCH v4 4/6] drm/amdgpu/vcn: vcn and jpeg ver 2.6 ras register definition Mohammad Zafar Ziya
@ 2022-03-28  6:24 ` Mohammad Zafar Ziya
  2022-03-28  7:43   ` Zhou1, Tao
  2022-03-28  6:24 ` [PATCH v4 6/6] drm/amdgpu/jpeg: jpeg " Mohammad Zafar Ziya
  2022-03-28  6:39 ` [PATCH v4 0/6] VCN and JPEG RAS poison detection Paul Menzel
  6 siblings, 1 reply; 20+ messages in thread
From: Mohammad Zafar Ziya @ 2022-03-28  6:24 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, lijo.lazar, Mohammad Zafar Ziya, Hawking.Zhang

RAS error query support addition for VCN 2.6

V2: removed unused option and corrected comment format
Moved the register definition under header file

V3: poison query status check added.
Removed error query interface

V4: MMSCH poison check option removed, return true/false refactored.

Signed-off-by: Mohammad Zafar Ziya <Mohammadzafar.ziya@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   | 71 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h   |  6 +++
 3 files changed, 78 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 1e1a3b736859..606df8869b89 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -508,6 +508,7 @@ struct amdgpu_ras_block_hw_ops {
 	void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status);
 	void (*reset_ras_error_count)(struct amdgpu_device *adev);
 	void (*reset_ras_error_status)(struct amdgpu_device *adev);
+	bool (*query_poison_status)(struct amdgpu_device *adev);
 };
 
 /* work flow
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
index 1869bae4104b..3988fc647741 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -31,6 +31,7 @@
 #include "soc15d.h"
 #include "vcn_v2_0.h"
 #include "mmsch_v1_0.h"
+#include "vcn_v2_5.h"
 
 #include "vcn/vcn_2_5_offset.h"
 #include "vcn/vcn_2_5_sh_mask.h"
@@ -59,6 +60,7 @@ static int vcn_v2_5_set_powergating_state(void *handle,
 static int vcn_v2_5_pause_dpg_mode(struct amdgpu_device *adev,
 				int inst_idx, struct dpg_pause_state *new_state);
 static int vcn_v2_5_sriov_start(struct amdgpu_device *adev);
+static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev);
 
 static int amdgpu_ih_clientid_vcns[] = {
 	SOC15_IH_CLIENTID_VCN,
@@ -100,6 +102,7 @@ static int vcn_v2_5_early_init(void *handle)
 	vcn_v2_5_set_dec_ring_funcs(adev);
 	vcn_v2_5_set_enc_ring_funcs(adev);
 	vcn_v2_5_set_irq_funcs(adev);
+	vcn_v2_5_set_ras_funcs(adev);
 
 	return 0;
 }
@@ -1930,3 +1933,71 @@ const struct amdgpu_ip_block_version vcn_v2_6_ip_block =
 		.rev = 0,
 		.funcs = &vcn_v2_6_ip_funcs,
 };
+
+static uint32_t vcn_v2_6_query_poison_by_instance(struct amdgpu_device *adev,
+			uint32_t instance, uint32_t sub_block)
+{
+	uint32_t poison_stat = 0, reg_value = 0;
+
+	switch (sub_block) {
+	case AMDGPU_VCN_V2_6_VCPU_VCODEC:
+		reg_value = RREG32_SOC15(VCN, instance, mmUVD_RAS_VCPU_VCODEC_STATUS);
+		poison_stat = REG_GET_FIELD(reg_value, UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
+		break;
+	default:
+		break;
+	};
+
+	if (poison_stat)
+		dev_info(adev->dev, "Poison detected in VCN%d, sub_block%d\n",
+			instance, sub_block);
+
+	return poison_stat;
+}
+
+static bool vcn_v2_6_query_poison_status(struct amdgpu_device *adev)
+{
+	uint32_t inst, sub;
+	uint32_t poison_stat = 0;
+
+	for (inst = 0; inst < adev->vcn.num_vcn_inst; inst++)
+		for (sub = 0; sub < AMDGPU_VCN_V2_6_MAX_SUB_BLOCK; sub++)
+			poison_stat +=
+			vcn_v2_6_query_poison_by_instance(adev, inst, sub);
+
+	return poison_stat ? true : false;
+}
+
+const struct amdgpu_ras_block_hw_ops vcn_v2_6_ras_hw_ops = {
+	.query_poison_status = vcn_v2_6_query_poison_status,
+};
+
+static struct amdgpu_vcn_ras vcn_v2_6_ras = {
+	.ras_block = {
+		.hw_ops = &vcn_v2_6_ras_hw_ops,
+	},
+};
+
+static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev)
+{
+	switch (adev->ip_versions[VCN_HWIP][0]) {
+	case IP_VERSION(2, 6, 0):
+		adev->vcn.ras = &vcn_v2_6_ras;
+		break;
+	default:
+		break;
+	}
+
+	if (adev->vcn.ras) {
+		amdgpu_ras_register_ras_block(adev, &adev->vcn.ras->ras_block);
+
+		strcpy(adev->vcn.ras->ras_block.ras_comm.name, "vcn");
+		adev->vcn.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__VCN;
+		adev->vcn.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
+		adev->vcn.ras_if = &adev->vcn.ras->ras_block.ras_comm;
+
+		/* If don't define special ras_late_init function, use default ras_late_init */
+		if (!adev->vcn.ras->ras_block.ras_late_init)
+			adev->vcn.ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+	}
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
index e72f799ed0fd..1c19af74e4fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
@@ -24,6 +24,12 @@
 #ifndef __VCN_V2_5_H__
 #define __VCN_V2_5_H__
 
+enum amdgpu_vcn_v2_6_sub_block {
+	AMDGPU_VCN_V2_6_VCPU_VCODEC = 0,
+
+	AMDGPU_VCN_V2_6_MAX_SUB_BLOCK,
+};
+
 extern const struct amdgpu_ip_block_version vcn_v2_5_ip_block;
 extern const struct amdgpu_ip_block_version vcn_v2_6_ip_block;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v4 6/6] drm/amdgpu/jpeg: jpeg ras error query support
  2022-03-28  6:24 [PATCH v4 0/6] VCN and JPEG RAS poison detection Mohammad Zafar Ziya
                   ` (4 preceding siblings ...)
  2022-03-28  6:24 ` [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support Mohammad Zafar Ziya
@ 2022-03-28  6:24 ` Mohammad Zafar Ziya
  2022-03-28  6:39 ` [PATCH v4 0/6] VCN and JPEG RAS poison detection Paul Menzel
  6 siblings, 0 replies; 20+ messages in thread
From: Mohammad Zafar Ziya @ 2022-03-28  6:24 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, lijo.lazar, Mohammad Zafar Ziya, Hawking.Zhang

RAS error query support addition for JPEG 2.6

V2: removed unused options and corrected comment format.
Moved register definition to header file.

V3: poison query status check added.
Removed the error query support

V4: Return statement refactored.

Signed-off-by: Mohammad Zafar Ziya <Mohammadzafar.ziya@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c | 74 ++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.h |  7 +++
 2 files changed, 81 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
index a29c86617fb5..4f2de9c31d6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
@@ -26,6 +26,7 @@
 #include "soc15.h"
 #include "soc15d.h"
 #include "jpeg_v2_0.h"
+#include "jpeg_v2_5.h"
 
 #include "vcn/vcn_2_5_offset.h"
 #include "vcn/vcn_2_5_sh_mask.h"
@@ -39,6 +40,7 @@ static void jpeg_v2_5_set_dec_ring_funcs(struct amdgpu_device *adev);
 static void jpeg_v2_5_set_irq_funcs(struct amdgpu_device *adev);
 static int jpeg_v2_5_set_powergating_state(void *handle,
 				enum amd_powergating_state state);
+static void jpeg_v2_5_set_ras_funcs(struct amdgpu_device *adev);
 
 static int amdgpu_ih_clientid_jpeg[] = {
 	SOC15_IH_CLIENTID_VCN,
@@ -70,6 +72,7 @@ static int jpeg_v2_5_early_init(void *handle)
 
 	jpeg_v2_5_set_dec_ring_funcs(adev);
 	jpeg_v2_5_set_irq_funcs(adev);
+	jpeg_v2_5_set_ras_funcs(adev);
 
 	return 0;
 }
@@ -730,3 +733,74 @@ const struct amdgpu_ip_block_version jpeg_v2_6_ip_block =
 		.rev = 0,
 		.funcs = &jpeg_v2_6_ip_funcs,
 };
+
+static uint32_t jpeg_v2_6_query_poison_by_instance(struct amdgpu_device *adev,
+		uint32_t instance, uint32_t sub_block)
+{
+	uint32_t poison_stat = 0, reg_value = 0;
+
+	switch (sub_block) {
+	case AMDGPU_JPEG_V2_6_JPEG0:
+		reg_value = RREG32_SOC15(JPEG, instance, mmUVD_RAS_JPEG0_STATUS);
+		poison_stat = REG_GET_FIELD(reg_value, UVD_RAS_JPEG0_STATUS, POISONED_PF);
+		break;
+	case AMDGPU_JPEG_V2_6_JPEG1:
+		reg_value = RREG32_SOC15(JPEG, instance, mmUVD_RAS_JPEG1_STATUS);
+		poison_stat = REG_GET_FIELD(reg_value, UVD_RAS_JPEG1_STATUS, POISONED_PF);
+		break;
+	default:
+		break;
+	}
+
+	if (poison_stat)
+		dev_info(adev->dev, "Poison detected in JPEG%d sub_block%d\n",
+			instance, sub_block);
+
+	return poison_stat;
+}
+
+static bool jpeg_v2_6_query_ras_poison_status(struct amdgpu_device *adev)
+{
+	uint32_t inst = 0, sub = 0, poison_stat = 0;
+
+	for (inst = 0; inst < adev->jpeg.num_jpeg_inst; inst++)
+		for (sub = 0; sub < AMDGPU_JPEG_V2_6_MAX_SUB_BLOCK; sub++)
+			poison_stat +=
+			jpeg_v2_6_query_poison_by_instance(adev, inst, sub);
+
+	return poison_stat ? true : false;
+}
+
+const struct amdgpu_ras_block_hw_ops jpeg_v2_6_ras_hw_ops = {
+	.query_poison_status = jpeg_v2_6_query_ras_poison_status,
+};
+
+static struct amdgpu_jpeg_ras jpeg_v2_6_ras = {
+	.ras_block = {
+		.hw_ops = &jpeg_v2_6_ras_hw_ops,
+	},
+};
+
+static void jpeg_v2_5_set_ras_funcs(struct amdgpu_device *adev)
+{
+	switch (adev->ip_versions[JPEG_HWIP][0]) {
+	case IP_VERSION(2, 6, 0):
+		adev->jpeg.ras = &jpeg_v2_6_ras;
+		break;
+	default:
+		break;
+	}
+
+	if (adev->jpeg.ras) {
+		amdgpu_ras_register_ras_block(adev, &adev->jpeg.ras->ras_block);
+
+		strcpy(adev->jpeg.ras->ras_block.ras_comm.name, "jpeg");
+		adev->jpeg.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__JPEG;
+		adev->jpeg.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
+		adev->jpeg.ras_if = &adev->jpeg.ras->ras_block.ras_comm;
+
+		/* If don't define special ras_late_init function, use default ras_late_init */
+		if (!adev->jpeg.ras->ras_block.ras_late_init)
+			adev->jpeg.ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+	}
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.h b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.h
index 3b0aa29b9879..1e858c6cdf13 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.h
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.h
@@ -24,6 +24,13 @@
 #ifndef __JPEG_V2_5_H__
 #define __JPEG_V2_5_H__
 
+enum amdgpu_jpeg_v2_6_sub_block {
+	AMDGPU_JPEG_V2_6_JPEG0 = 0,
+	AMDGPU_JPEG_V2_6_JPEG1,
+
+	AMDGPU_JPEG_V2_6_MAX_SUB_BLOCK,
+};
+
 extern const struct amdgpu_ip_block_version jpeg_v2_5_ip_block;
 extern const struct amdgpu_ip_block_version jpeg_v2_6_ip_block;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v4 0/6] VCN and JPEG RAS poison detection
  2022-03-28  6:24 [PATCH v4 0/6] VCN and JPEG RAS poison detection Mohammad Zafar Ziya
                   ` (5 preceding siblings ...)
  2022-03-28  6:24 ` [PATCH v4 6/6] drm/amdgpu/jpeg: jpeg " Mohammad Zafar Ziya
@ 2022-03-28  6:39 ` Paul Menzel
  2022-03-28  6:58   ` Ziya, Mohammad zafar
  6 siblings, 1 reply; 20+ messages in thread
From: Paul Menzel @ 2022-03-28  6:39 UTC (permalink / raw)
  To: Mohammad Zafar Ziya; +Cc: lijo.lazar, Tao.Zhou1, amd-gfx, Hawking.Zhang

Dear Mohammad,


Am 28.03.22 um 08:24 schrieb Mohammad Zafar Ziya:
> VCN and JPEG RAS poison detection

It’d be great if you extended this a little bit. Especially, how it can 
be tested.

> Mohammad Zafar Ziya (6):
>    drm/amdgpu: Add vcn and jpeg ras support flag
>    drm/amdgpu/vcn: Add vcn ras support
>    drm/amdgpu/jpeg: Add jpeg block ras support
>    drm/amdgpu/vcn: vcn and jpeg ver 2.6 ras register definition
>    drm/amdgpu/vcn: VCN ras error query support
>    drm/amdgpu/jpeg: jpeg ras error query support

It’d be great if you made the last three commit message summaries also 
statements (by adding a verb in imperative mood).


Kind regards,

Paul

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4 2/6] drm/amdgpu/vcn: Add vcn ras support
  2022-03-28  6:24 ` [PATCH v4 2/6] drm/amdgpu/vcn: Add vcn ras support Mohammad Zafar Ziya
@ 2022-03-28  6:40   ` Paul Menzel
  0 siblings, 0 replies; 20+ messages in thread
From: Paul Menzel @ 2022-03-28  6:40 UTC (permalink / raw)
  To: Mohammad Zafar Ziya; +Cc: lijo.lazar, Tao.Zhou1, amd-gfx, Hawking.Zhang

Dear Mohammad,


Thank you for your patch.

Am 28.03.22 um 08:24 schrieb Mohammad Zafar Ziya:
> VCN block ras feature support addition
> 
> V2: default ras callback removed
> 
> Signed-off-by: Mohammad Zafar Ziya <Mohammadzafar.ziya@amd.com>
> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 10 ++++++++++
>   1 file changed, 10 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> index e2fde88aaf5e..ea07974ef6f4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> @@ -24,6 +24,8 @@
>   #ifndef __AMDGPU_VCN_H__
>   #define __AMDGPU_VCN_H__
>   
> +#include "amdgpu_ras.h"
> +
>   #define AMDGPU_VCN_STACK_SIZE		(128*1024)
>   #define AMDGPU_VCN_CONTEXT_SIZE 	(512*1024)
>   
> @@ -232,6 +234,10 @@ struct amdgpu_vcn_inst {
>   	struct amdgpu_vcn_fw_shared fw_shared;
>   };
>   
> +struct amdgpu_vcn_ras {
> +	struct amdgpu_ras_block_object ras_block;
> +};
> +
>   struct amdgpu_vcn {
>   	unsigned		fw_version;
>   	struct delayed_work	idle_work;
> @@ -251,6 +257,9 @@ struct amdgpu_vcn {
>   	unsigned	harvest_config;
>   	int (*pause_dpg_mode)(struct amdgpu_device *adev,
>   		int inst_idx, struct dpg_pause_state *new_state);
> +
> +	struct ras_common_if    *ras_if;
> +	struct amdgpu_vcn_ras   *ras;
>   };
>   
>   struct amdgpu_fw_shared_rb_ptrs_struct {
> @@ -339,4 +348,5 @@ void amdgpu_vcn_setup_ucode(struct amdgpu_device *adev);
>   void amdgpu_vcn_fwlog_init(struct amdgpu_vcn_inst *vcn);
>   void amdgpu_debugfs_vcn_fwlog_init(struct amdgpu_device *adev,
>                                      uint8_t i, struct amdgpu_vcn_inst *vcn);
> +
>   #endif

This hunk looks unrelated. Maybe remove it?


Kind regards,

Paul

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v4 0/6] VCN and JPEG RAS poison detection
  2022-03-28  6:39 ` [PATCH v4 0/6] VCN and JPEG RAS poison detection Paul Menzel
@ 2022-03-28  6:58   ` Ziya, Mohammad zafar
  2022-03-28  8:03     ` Zhou1, Tao
  0 siblings, 1 reply; 20+ messages in thread
From: Ziya, Mohammad zafar @ 2022-03-28  6:58 UTC (permalink / raw)
  To: Paul Menzel; +Cc: Lazar, Lijo, Zhou1, Tao, amd-gfx, Zhang,  Hawking

[AMD Official Use Only]

Dear Paul,

Thank you for the review.
I have added comments inline.

Regards,
Mohammad

>-----Original Message-----
>From: Paul Menzel <pmenzel@molgen.mpg.de>
>Sent: Monday, March 28, 2022 12:09 PM
>To: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>
>Cc: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>;
>Lazar, Lijo <Lijo.Lazar@amd.com>; Zhang, Hawking
><Hawking.Zhang@amd.com>
>Subject: Re: [PATCH v4 0/6] VCN and JPEG RAS poison detection
>
>Dear Mohammad,
>
>
>Am 28.03.22 um 08:24 schrieb Mohammad Zafar Ziya:
>> VCN and JPEG RAS poison detection
>
>It’d be great if you extended this a little bit. Especially, how it can be tested.

[Mohammad]: This is the first series for RAS poison consumption detection support; this series only detects the poison consumption. The interrupt from the VCN/JPEG block will be intercepted, and the poison status of VCN/JPEG will be checked to cross-check that it is indeed a VCN/JPEG poison consumption interrupt.

>
>> Mohammad Zafar Ziya (6):
>>    drm/amdgpu: Add vcn and jpeg ras support flag
>>    drm/amdgpu/vcn: Add vcn ras support
>>    drm/amdgpu/jpeg: Add jpeg block ras support
>>    drm/amdgpu/vcn: vcn and jpeg ver 2.6 ras register definition
>>    drm/amdgpu/vcn: VCN ras error query support
>>    drm/amdgpu/jpeg: jpeg ras error query support
>
>It’d be great if you made the last three commit message summaries also
>statements (by adding a verb in imperative mood).

[Mohammad]: The last two commits only provide the poison consumption detection functionality for the VCN/JPEG blocks. It will be called under IH/BH to check that the interrupt is indeed from VCN/JPEG poison consumption.
The third-to-last commit only adds the register definitions needed to facilitate this functionality.

>
>
>Kind regards,
>
>Paul

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
  2022-03-28  6:24 ` [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support Mohammad Zafar Ziya
@ 2022-03-28  7:43   ` Zhou1, Tao
  2022-03-28  7:52     ` Paul Menzel
  0 siblings, 1 reply; 20+ messages in thread
From: Zhou1, Tao @ 2022-03-28  7:43 UTC (permalink / raw)
  To: Ziya, Mohammad zafar, amd-gfx; +Cc: Lazar, Lijo, Zhang, Hawking

[AMD Official Use Only]



-----Original Message-----
From: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com> 
Sent: Monday, March 28, 2022 2:25 PM
To: amd-gfx@lists.freedesktop.org
Cc: Lazar, Lijo <Lijo.Lazar@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>
Subject: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support

RAS error query support addition for VCN 2.6

V2: removed unused option and corrected comment format Moved the register definition under header file

V3: poison query status check added.
Removed error query interface

V4: MMSCH poison check option removed, return true/false refactored.

Signed-off-by: Mohammad Zafar Ziya <Mohammadzafar.ziya@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   | 71 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h   |  6 +++
 3 files changed, 78 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 1e1a3b736859..606df8869b89 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -508,6 +508,7 @@ struct amdgpu_ras_block_hw_ops {
 	void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status);
 	void (*reset_ras_error_count)(struct amdgpu_device *adev);
 	void (*reset_ras_error_status)(struct amdgpu_device *adev);
+	bool (*query_poison_status)(struct amdgpu_device *adev);
 };
 
 /* work flow
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
index 1869bae4104b..3988fc647741 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -31,6 +31,7 @@
 #include "soc15d.h"
 #include "vcn_v2_0.h"
 #include "mmsch_v1_0.h"
+#include "vcn_v2_5.h"
 
 #include "vcn/vcn_2_5_offset.h"
 #include "vcn/vcn_2_5_sh_mask.h"
@@ -59,6 +60,7 @@ static int vcn_v2_5_set_powergating_state(void *handle,  static int vcn_v2_5_pause_dpg_mode(struct amdgpu_device *adev,
 				int inst_idx, struct dpg_pause_state *new_state);  static int vcn_v2_5_sriov_start(struct amdgpu_device *adev);
+static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev);
 
 static int amdgpu_ih_clientid_vcns[] = {
 	SOC15_IH_CLIENTID_VCN,
@@ -100,6 +102,7 @@ static int vcn_v2_5_early_init(void *handle)
 	vcn_v2_5_set_dec_ring_funcs(adev);
 	vcn_v2_5_set_enc_ring_funcs(adev);
 	vcn_v2_5_set_irq_funcs(adev);
+	vcn_v2_5_set_ras_funcs(adev);
 
 	return 0;
 }
@@ -1930,3 +1933,71 @@ const struct amdgpu_ip_block_version vcn_v2_6_ip_block =
 		.rev = 0,
 		.funcs = &vcn_v2_6_ip_funcs,
 };
+
+static uint32_t vcn_v2_6_query_poison_by_instance(struct amdgpu_device *adev,
+			uint32_t instance, uint32_t sub_block) {
+	uint32_t poison_stat = 0, reg_value = 0;
+
+	switch (sub_block) {
+	case AMDGPU_VCN_V2_6_VCPU_VCODEC:
+		reg_value = RREG32_SOC15(VCN, instance, mmUVD_RAS_VCPU_VCODEC_STATUS);
+		poison_stat = REG_GET_FIELD(reg_value, UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
+		break;
+	default:
+		break;
+	};
+
+	if (poison_stat)
+		dev_info(adev->dev, "Poison detected in VCN%d, sub_block%d\n",
+			instance, sub_block);
+
+	return poison_stat;
+}
+
+static bool vcn_v2_6_query_poison_status(struct amdgpu_device *adev) {
+	uint32_t inst, sub;
+	uint32_t poison_stat = 0;
+
+	for (inst = 0; inst < adev->vcn.num_vcn_inst; inst++)
+		for (sub = 0; sub < AMDGPU_VCN_V2_6_MAX_SUB_BLOCK; sub++)
+			poison_stat +=
+			vcn_v2_6_query_poison_by_instance(adev, inst, sub);
+
+	return poison_stat ? true : false;

[Tao] just want to confirm the logic, if the POISONED_PF of one instance is 1 and another is 0, the poison_stat is true, correct?
Can we add a print message for this case? Same question to JPEG.

+}
+
+const struct amdgpu_ras_block_hw_ops vcn_v2_6_ras_hw_ops = {
+	.query_poison_status = vcn_v2_6_query_poison_status, };
+
+static struct amdgpu_vcn_ras vcn_v2_6_ras = {
+	.ras_block = {
+		.hw_ops = &vcn_v2_6_ras_hw_ops,
+	},
+};
+
+static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev) {
+	switch (adev->ip_versions[VCN_HWIP][0]) {
+	case IP_VERSION(2, 6, 0):
+		adev->vcn.ras = &vcn_v2_6_ras;
+		break;
+	default:
+		break;
+	}
+
+	if (adev->vcn.ras) {
+		amdgpu_ras_register_ras_block(adev, &adev->vcn.ras->ras_block);
+
+		strcpy(adev->vcn.ras->ras_block.ras_comm.name, "vcn");
+		adev->vcn.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__VCN;
+		adev->vcn.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
+		adev->vcn.ras_if = &adev->vcn.ras->ras_block.ras_comm;
+
+		/* If don't define special ras_late_init function, use default ras_late_init */
+		if (!adev->vcn.ras->ras_block.ras_late_init)
+			adev->vcn.ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+	}
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
index e72f799ed0fd..1c19af74e4fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
@@ -24,6 +24,12 @@
 #ifndef __VCN_V2_5_H__
 #define __VCN_V2_5_H__
 
+enum amdgpu_vcn_v2_6_sub_block {
+	AMDGPU_VCN_V2_6_VCPU_VCODEC = 0,
+
+	AMDGPU_VCN_V2_6_MAX_SUB_BLOCK,
+};
+
 extern const struct amdgpu_ip_block_version vcn_v2_5_ip_block;  extern const struct amdgpu_ip_block_version vcn_v2_6_ip_block;
 
--
2.25.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
  2022-03-28  7:43   ` Zhou1, Tao
@ 2022-03-28  7:52     ` Paul Menzel
  2022-03-28  8:00       ` Ziya, Mohammad zafar
  0 siblings, 1 reply; 20+ messages in thread
From: Paul Menzel @ 2022-03-28  7:52 UTC (permalink / raw)
  To: Tao Zhou, Mohammad Zafar Ziya; +Cc: Lijo Lazar, amd-gfx, Hawking Zhang

Dear Mohammad, dear Tao,


Tao, it’s hard to find your reply in the quote, as the message is not 
quoted correctly (> prepended). Is it possible to use a different email 
client?


Am 28.03.22 um 09:43 schrieb Zhou1, Tao:
> -----Original Message-----
> From: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>
> Sent: Monday, March 28, 2022 2:25 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Lazar, Lijo <Lijo.Lazar@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>
> Subject: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
> 
> RAS error query support addition for VCN 2.6
> 
> V2: removed unused option and corrected comment format Moved the register definition under header file

Please wrap lines after 75 characters. (`scripts/checkpatch.pl` should 
have warned you about that).

> V3: poison query status check added.
> Removed error query interface
> 
> V4: MMSCH poison check option removed, return true/false refactored.
> 
> Signed-off-by: Mohammad Zafar Ziya <Mohammadzafar.ziya@amd.com>
> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   | 71 +++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h   |  6 +++
>   3 files changed, 78 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 1e1a3b736859..606df8869b89 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -508,6 +508,7 @@ struct amdgpu_ras_block_hw_ops {
>   	void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status);
>   	void (*reset_ras_error_count)(struct amdgpu_device *adev);
>   	void (*reset_ras_error_status)(struct amdgpu_device *adev);
> +	bool (*query_poison_status)(struct amdgpu_device *adev);
>   };
>   
>   /* work flow
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> index 1869bae4104b..3988fc647741 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> @@ -31,6 +31,7 @@
>   #include "soc15d.h"
>   #include "vcn_v2_0.h"
>   #include "mmsch_v1_0.h"
> +#include "vcn_v2_5.h"
>   
>   #include "vcn/vcn_2_5_offset.h"
>   #include "vcn/vcn_2_5_sh_mask.h"
> @@ -59,6 +60,7 @@ static int vcn_v2_5_set_powergating_state(void *handle,  static int vcn_v2_5_pause_dpg_mode(struct amdgpu_device *adev,
>   				int inst_idx, struct dpg_pause_state *new_state);  static int vcn_v2_5_sriov_start(struct amdgpu_device *adev);
> +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev);
>   
>   static int amdgpu_ih_clientid_vcns[] = {
>   	SOC15_IH_CLIENTID_VCN,
> @@ -100,6 +102,7 @@ static int vcn_v2_5_early_init(void *handle)
>   	vcn_v2_5_set_dec_ring_funcs(adev);
>   	vcn_v2_5_set_enc_ring_funcs(adev);
>   	vcn_v2_5_set_irq_funcs(adev);
> +	vcn_v2_5_set_ras_funcs(adev);
>   
>   	return 0;
>   }
> @@ -1930,3 +1933,71 @@ const struct amdgpu_ip_block_version vcn_v2_6_ip_block =
>   		.rev = 0,
>   		.funcs = &vcn_v2_6_ip_funcs,
>   };
> +
> +static uint32_t vcn_v2_6_query_poison_by_instance(struct amdgpu_device *adev,
> +			uint32_t instance, uint32_t sub_block) {
> +	uint32_t poison_stat = 0, reg_value = 0;
> +
> +	switch (sub_block) {
> +	case AMDGPU_VCN_V2_6_VCPU_VCODEC:
> +		reg_value = RREG32_SOC15(VCN, instance, mmUVD_RAS_VCPU_VCODEC_STATUS);
> +		poison_stat = REG_GET_FIELD(reg_value, UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
> +		break;
> +	default:
> +		break;
> +	};
> +
> +	if (poison_stat)
> +		dev_info(adev->dev, "Poison detected in VCN%d, sub_block%d\n",
> +			instance, sub_block);

What should a user do with that information? Faulty hardware, …?

> +
> +	return poison_stat;
> +}
> +
> +static bool vcn_v2_6_query_poison_status(struct amdgpu_device *adev) {
> +	uint32_t inst, sub;
> +	uint32_t poison_stat = 0;
> +
> +	for (inst = 0; inst < adev->vcn.num_vcn_inst; inst++)
> +		for (sub = 0; sub < AMDGPU_VCN_V2_6_MAX_SUB_BLOCK; sub++)
> +			poison_stat +=
> +			vcn_v2_6_query_poison_by_instance(adev, inst, sub);
> +
> +	return poison_stat ? true : false;
> 
> [Tao] just want to confirm the logic, if the POISONED_PF of one instance is 1 and another is 0, the poison_stat is true, correct?
> Can we add a print message for this case? Same question to JPEG.

Is the `dev_info` message in `vcn_v2_6_query_poison_by_instance()` doing 
what you want?

Also, instead of `poison_stat ? true : false;` a common pattern is 
`!!poison_stat` I believe.


Kind regards,

Paul


> +}
> +
> +const struct amdgpu_ras_block_hw_ops vcn_v2_6_ras_hw_ops = {
> +	.query_poison_status = vcn_v2_6_query_poison_status, };
> +
> +static struct amdgpu_vcn_ras vcn_v2_6_ras = {
> +	.ras_block = {
> +		.hw_ops = &vcn_v2_6_ras_hw_ops,
> +	},
> +};
> +
> +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev) {
> +	switch (adev->ip_versions[VCN_HWIP][0]) {
> +	case IP_VERSION(2, 6, 0):
> +		adev->vcn.ras = &vcn_v2_6_ras;
> +		break;
> +	default:
> +		break;
> +	}
> +
> +	if (adev->vcn.ras) {
> +		amdgpu_ras_register_ras_block(adev, &adev->vcn.ras->ras_block);
> +
> +		strcpy(adev->vcn.ras->ras_block.ras_comm.name, "vcn");
> +		adev->vcn.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__VCN;
> +		adev->vcn.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
> +		adev->vcn.ras_if = &adev->vcn.ras->ras_block.ras_comm;
> +
> +		/* If don't define special ras_late_init function, use default ras_late_init */
> +		if (!adev->vcn.ras->ras_block.ras_late_init)
> +			adev->vcn.ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
> +	}
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
> index e72f799ed0fd..1c19af74e4fd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
> @@ -24,6 +24,12 @@
>   #ifndef __VCN_V2_5_H__
>   #define __VCN_V2_5_H__
>   
> +enum amdgpu_vcn_v2_6_sub_block {
> +	AMDGPU_VCN_V2_6_VCPU_VCODEC = 0,
> +
> +	AMDGPU_VCN_V2_6_MAX_SUB_BLOCK,
> +};
> +
>   extern const struct amdgpu_ip_block_version vcn_v2_5_ip_block;  extern const struct amdgpu_ip_block_version vcn_v2_6_ip_block;
>   
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
  2022-03-28  7:52     ` Paul Menzel
@ 2022-03-28  8:00       ` Ziya, Mohammad zafar
  2022-03-28  8:08         ` Paul Menzel
  0 siblings, 1 reply; 20+ messages in thread
From: Ziya, Mohammad zafar @ 2022-03-28  8:00 UTC (permalink / raw)
  To: Paul Menzel, Zhou1, Tao; +Cc: Lazar, Lijo, amd-gfx, Zhang, Hawking

[AMD Official Use Only]

Dear Paul and Tao,

Comments are added inline.

Regards,
Zafar

>-----Original Message-----
>From: Paul Menzel <pmenzel@molgen.mpg.de>
>Sent: Monday, March 28, 2022 1:22 PM
>To: Zhou1, Tao <Tao.Zhou1@amd.com>; Ziya, Mohammad zafar
><Mohammadzafar.Ziya@amd.com>
>Cc: amd-gfx@lists.freedesktop.org; Lazar, Lijo <Lijo.Lazar@amd.com>; Zhang,
>Hawking <Hawking.Zhang@amd.com>
>Subject: Re: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
>
>Dear Mohammad, dear Tao,
>
>
>Tao, it’s hard to find your reply in the quote, as the message is not quoted
>correctly (> prepended). Is it possible to use a different email client?
>
>
>Am 28.03.22 um 09:43 schrieb Zhou1, Tao:
>> -----Original Message-----
>> From: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>
>> Sent: Monday, March 28, 2022 2:25 PM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Lazar, Lijo <Lijo.Lazar@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>;
>> Zhang, Hawking <Hawking.Zhang@amd.com>; Ziya, Mohammad zafar
>> <Mohammadzafar.Ziya@amd.com>
>> Subject: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
>>
>> RAS error query support addition for VCN 2.6
>>
>> V2: removed unused option and corrected comment format Moved the
>> register definition under header file
>
>Please wrap lines after 75 characters. (`scripts/checkpatch.pl` should have
>warned you about that).
>
>> V3: poison query status check added.
>> Removed error query interface
>>
>> V4: MMSCH poison check option removed, return true/false refactored.
>>
>> Signed-off-by: Mohammad Zafar Ziya <Mohammadzafar.ziya@amd.com>
>> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>>   drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   | 71
>+++++++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h   |  6 +++
>>   3 files changed, 78 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> index 1e1a3b736859..606df8869b89 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> @@ -508,6 +508,7 @@ struct amdgpu_ras_block_hw_ops {
>>   	void (*query_ras_error_address)(struct amdgpu_device *adev, void
>*ras_error_status);
>>   	void (*reset_ras_error_count)(struct amdgpu_device *adev);
>>   	void (*reset_ras_error_status)(struct amdgpu_device *adev);
>> +	bool (*query_poison_status)(struct amdgpu_device *adev);
>>   };
>>
>>   /* work flow
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
>> b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
>> index 1869bae4104b..3988fc647741 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
>> @@ -31,6 +31,7 @@
>>   #include "soc15d.h"
>>   #include "vcn_v2_0.h"
>>   #include "mmsch_v1_0.h"
>> +#include "vcn_v2_5.h"
>>
>>   #include "vcn/vcn_2_5_offset.h"
>>   #include "vcn/vcn_2_5_sh_mask.h"
>> @@ -59,6 +60,7 @@ static int vcn_v2_5_set_powergating_state(void
>*handle,  static int vcn_v2_5_pause_dpg_mode(struct amdgpu_device
>*adev,
>>   				int inst_idx, struct dpg_pause_state
>*new_state);  static int
>> vcn_v2_5_sriov_start(struct amdgpu_device *adev);
>> +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev);
>>
>>   static int amdgpu_ih_clientid_vcns[] = {
>>   	SOC15_IH_CLIENTID_VCN,
>> @@ -100,6 +102,7 @@ static int vcn_v2_5_early_init(void *handle)
>>   	vcn_v2_5_set_dec_ring_funcs(adev);
>>   	vcn_v2_5_set_enc_ring_funcs(adev);
>>   	vcn_v2_5_set_irq_funcs(adev);
>> +	vcn_v2_5_set_ras_funcs(adev);
>>
>>   	return 0;
>>   }
>> @@ -1930,3 +1933,71 @@ const struct amdgpu_ip_block_version
>vcn_v2_6_ip_block =
>>   		.rev = 0,
>>   		.funcs = &vcn_v2_6_ip_funcs,
>>   };
>> +
>> +static uint32_t vcn_v2_6_query_poison_by_instance(struct
>amdgpu_device *adev,
>> +			uint32_t instance, uint32_t sub_block) {
>> +	uint32_t poison_stat = 0, reg_value = 0;
>> +
>> +	switch (sub_block) {
>> +	case AMDGPU_VCN_V2_6_VCPU_VCODEC:
>> +		reg_value = RREG32_SOC15(VCN, instance,
>mmUVD_RAS_VCPU_VCODEC_STATUS);
>> +		poison_stat = REG_GET_FIELD(reg_value,
>UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
>> +		break;
>> +	default:
>> +		break;
>> +	};
>> +
>> +	if (poison_stat)
>> +		dev_info(adev->dev, "Poison detected in VCN%d,
>sub_block%d\n",
>> +			instance, sub_block);
>
>What should a user do with that information? Faulty hardware, …?

[Mohammad]: This message will help to identify the faulty hardware. The hardware ID is also logged along with the poison message, which helps to identify the device when multiple devices are installed on the system.

>
>> +
>> +	return poison_stat;
>> +}
>> +
>> +static bool vcn_v2_6_query_poison_status(struct amdgpu_device *adev) {
>> +	uint32_t inst, sub;
>> +	uint32_t poison_stat = 0;
>> +
>> +	for (inst = 0; inst < adev->vcn.num_vcn_inst; inst++)
>> +		for (sub = 0; sub < AMDGPU_VCN_V2_6_MAX_SUB_BLOCK;
>sub++)
>> +			poison_stat +=
>> +			vcn_v2_6_query_poison_by_instance(adev, inst,
>sub);
>> +
>> +	return poison_stat ? true : false;
>>
>> [Tao] just want to confirm the logic, if the POISONED_PF of one instance is 1
>and another is 0, the poison_stat is true, correct?
>> Can we add a print message for this case? Same question to JPEG.

[Mohammad]: The message is printed by the function defined just ahead of this one (vcn_v2_6_query_poison_by_instance()).

>
>Is the `dev_info` message in `vcn_v2_6_query_poison_by_instance()` doing
>what you want?
>
>Also, instead of `poison_stat ? true : false;` a common pattern is
>`!!poison_stat` I believe.
>
>
>Kind regards,
>
>Paul

[Mohammad]: Noted the change. Will make to return !!poison_stat ? true : false;

>
>
>> +}
>> +
>> +const struct amdgpu_ras_block_hw_ops vcn_v2_6_ras_hw_ops = {
>> +	.query_poison_status = vcn_v2_6_query_poison_status, };
>> +
>> +static struct amdgpu_vcn_ras vcn_v2_6_ras = {
>> +	.ras_block = {
>> +		.hw_ops = &vcn_v2_6_ras_hw_ops,
>> +	},
>> +};
>> +
>> +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev) {
>> +	switch (adev->ip_versions[VCN_HWIP][0]) {
>> +	case IP_VERSION(2, 6, 0):
>> +		adev->vcn.ras = &vcn_v2_6_ras;
>> +		break;
>> +	default:
>> +		break;
>> +	}
>> +
>> +	if (adev->vcn.ras) {
>> +		amdgpu_ras_register_ras_block(adev, &adev->vcn.ras-
>>ras_block);
>> +
>> +		strcpy(adev->vcn.ras->ras_block.ras_comm.name, "vcn");
>> +		adev->vcn.ras->ras_block.ras_comm.block =
>AMDGPU_RAS_BLOCK__VCN;
>> +		adev->vcn.ras->ras_block.ras_comm.type =
>AMDGPU_RAS_ERROR__POISON;
>> +		adev->vcn.ras_if = &adev->vcn.ras->ras_block.ras_comm;
>> +
>> +		/* If don't define special ras_late_init function, use default
>ras_late_init */
>> +		if (!adev->vcn.ras->ras_block.ras_late_init)
>> +			adev->vcn.ras->ras_block.ras_late_init =
>amdgpu_ras_block_late_init;
>> +	}
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
>b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
>> index e72f799ed0fd..1c19af74e4fd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
>> @@ -24,6 +24,12 @@
>>   #ifndef __VCN_V2_5_H__
>>   #define __VCN_V2_5_H__
>>
>> +enum amdgpu_vcn_v2_6_sub_block {
>> +	AMDGPU_VCN_V2_6_VCPU_VCODEC = 0,
>> +
>> +	AMDGPU_VCN_V2_6_MAX_SUB_BLOCK,
>> +};
>> +
>>   extern const struct amdgpu_ip_block_version vcn_v2_5_ip_block;  extern
>const struct amdgpu_ip_block_version vcn_v2_6_ip_block;
>>
>> --
>> 2.25.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v4 0/6] VCN and JPEG RAS poison detection
  2022-03-28  6:58   ` Ziya, Mohammad zafar
@ 2022-03-28  8:03     ` Zhou1, Tao
  0 siblings, 0 replies; 20+ messages in thread
From: Zhou1, Tao @ 2022-03-28  8:03 UTC (permalink / raw)
  To: Ziya, Mohammad zafar, Paul Menzel; +Cc: Lazar, Lijo, amd-gfx, Zhang, Hawking

[AMD Official Use Only]

With Paul's comment fixed, the series is:

Reviewed-by: Tao Zhou <tao.zhou1@amd.com>

-----Original Message-----
From: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com> 
Sent: Monday, March 28, 2022 2:58 PM
To: Paul Menzel <pmenzel@molgen.mpg.de>
Cc: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: RE: [PATCH v4 0/6] VCN and JPEG RAS poison detection

[AMD Official Use Only]

Dear Paul,

Thank you for the review.
Added comment inline.

Regards,
Mohammad

>-----Original Message-----
>From: Paul Menzel <pmenzel@molgen.mpg.de>
>Sent: Monday, March 28, 2022 12:09 PM
>To: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>
>Cc: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>; 
>Lazar, Lijo <Lijo.Lazar@amd.com>; Zhang, Hawking 
><Hawking.Zhang@amd.com>
>Subject: Re: [PATCH v4 0/6] VCN and JPEG RAS poison detection
>
>Dear Mohammad,
>
>
>Am 28.03.22 um 08:24 schrieb Mohammad Zafar Ziya:
>> VCN and JPEG RAS poison detection
>
>It’d be great if you extended this a little bit. Especially, how it can be tested.

[Mohammad]: This is the first series of the RAS poison consumption detection support; this series only detects poison consumption. The interrupt from the VCN/JPEG block will be intercepted, and the poison status of VCN/JPEG will be checked to confirm that it is indeed a VCN/JPEG poison consumption interrupt.

>
>> Mohammad Zafar Ziya (6):
>>    drm/amdgpu: Add vcn and jpeg ras support flag
>>    drm/amdgpu/vcn: Add vcn ras support
>>    drm/amdgpu/jpeg: Add jpeg block ras support
>>    drm/amdgpu/vcn: vcn and jpeg ver 2.6 ras register definition
>>    drm/amdgpu/vcn: VCN ras error query support
>>    drm/amdgpu/jpeg: jpeg ras error query support
>
>It’d be great if you made the last three commit message summaries also 
>statements (by adding a verb in imperative mood).

[Mohammad]: The second-to-last and last commits only provide the poison consumption detection functionality for the VCN/JPEG blocks. It will be called under IH/BH to check whether the interrupt is indeed from VCN/JPEG poison consumption.
The third-to-last commit only adds the register definitions needed to facilitate the functionality.

>
>
>Kind regards,
>
>Paul

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
  2022-03-28  8:00       ` Ziya, Mohammad zafar
@ 2022-03-28  8:08         ` Paul Menzel
  2022-03-28  8:47           ` Ziya, Mohammad zafar
  0 siblings, 1 reply; 20+ messages in thread
From: Paul Menzel @ 2022-03-28  8:08 UTC (permalink / raw)
  To: Mohammad Zafar Ziya, Tao Zhou; +Cc: Lijo Lazar, amd-gfx, Hawking Zhang

Dear Mohammad,


Am 28.03.22 um 10:00 schrieb Ziya, Mohammad zafar:

[…]

>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>> Sent: Monday, March 28, 2022 1:22 PM

[…]

>> Tao, it’s hard to find your reply in the quote, as the message is not quoted
>> correctly (> prepended). Is it possible to use a different email client?
>>
>>
>> Am 28.03.22 um 09:43 schrieb Zhou1, Tao:
>>> -----Original Message-----
>>> From: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>
>>> Sent: Monday, March 28, 2022 2:25 PM

[…]

>>> +static uint32_t vcn_v2_6_query_poison_by_instance(struct amdgpu_device *adev,
>>> +			uint32_t instance, uint32_t sub_block) {
>>> +	uint32_t poison_stat = 0, reg_value = 0;
>>> +
>>> +	switch (sub_block) {
>>> +	case AMDGPU_VCN_V2_6_VCPU_VCODEC:
>>> +		reg_value = RREG32_SOC15(VCN, instance, mmUVD_RAS_VCPU_VCODEC_STATUS);
>>> +		poison_stat = REG_GET_FIELD(reg_value, UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
>>> +		break;
>>> +	default:
>>> +		break;
>>> +	};
>>> +
>>> +	if (poison_stat)
>>> +		dev_info(adev->dev, "Poison detected in VCN%d, sub_block%d\n",
>>> +			instance, sub_block);
>>
>> What should a user do with that information? Faulty hardware, …?
> 
> [Mohammad]: This message will help to identify the faulty hardware,
> the hardware ID will also log along with poison, help to identify
> among multiple hardware installed on the system.

Thank you for clarifying. If it’s indeed faulty hardware, should the log 
level be increased to be an error? Keep in mind, that normal ignorant 
users (like me) are reading the message, and it’d be great to guide them 
a little. They do not know what “Poison“ means I guess. Maybe:

A hardware corruption was found indicating the device might be faulty. 
(Poison detected in VCN%d, sub_block%d)\n

(Keep in mind, I do not know anything about RAS.)

>>> +
>>> +	return poison_stat;
>>> +}
>>> +
>>> +static bool vcn_v2_6_query_poison_status(struct amdgpu_device *adev) {
>>> +	uint32_t inst, sub;
>>> +	uint32_t poison_stat = 0;
>>> +
>>> +	for (inst = 0; inst < adev->vcn.num_vcn_inst; inst++)
>>> +		for (sub = 0; sub < AMDGPU_VCN_V2_6_MAX_SUB_BLOCK; sub++)
>>> +			poison_stat +=
>>> +			vcn_v2_6_query_poison_by_instance(adev, inst, sub);
>>> +
>>> +	return poison_stat ? true : false;
>>>
>>> [Tao] just want to confirm the logic, if the POISONED_PF of one instance is 1
>> and another is 0, the poison_stat is true, correct?
>>> Can we add a print message for this case? Same question to JPEG.
> 
> [Mohammad]: Message will be printed on function block ahead of the function.
> 
>> Is the `dev_info` message in `vcn_v2_6_query_poison_by_instance()` doing
>> what you want?
>>
>> Also, instead of `poison_stat ? true : false;` a common pattern is
>> `!!poison_stat` I believe.

[…]

> [Mohammad]: Noted the change. Will make to return !!poison_stat ? true : false;

No, just

     return !!poison_stat;

[…]


Kind regards,

Paul

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
  2022-03-28  8:08         ` Paul Menzel
@ 2022-03-28  8:47           ` Ziya, Mohammad zafar
  2022-03-28  9:37             ` Paul Menzel
  0 siblings, 1 reply; 20+ messages in thread
From: Ziya, Mohammad zafar @ 2022-03-28  8:47 UTC (permalink / raw)
  To: Paul Menzel, Zhou1, Tao; +Cc: Lazar, Lijo, amd-gfx, Zhang, Hawking

[AMD Official Use Only]

Dear Paul,

Thank you for the review.
Comment added inline.

Regards,
Zafar

>-----Original Message-----
>From: Paul Menzel <pmenzel@molgen.mpg.de>
>Sent: Monday, March 28, 2022 1:39 PM
>To: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>; Zhou1, Tao
><Tao.Zhou1@amd.com>
>Cc: Lazar, Lijo <Lijo.Lazar@amd.com>; amd-gfx@lists.freedesktop.org; Zhang,
>Hawking <Hawking.Zhang@amd.com>
>Subject: Re: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
>
>Dear Mohammad,
>
>
>Am 28.03.22 um 10:00 schrieb Ziya, Mohammad zafar:
>
>[…]
>
>>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>>> Sent: Monday, March 28, 2022 1:22 PM
>
>[…]
>
>>> Tao, it’s hard to find your reply in the quote, as the message is not
>>> quoted correctly (> prepended). Is it possible to use a different email
>client?
>>>
>>>
>>> Am 28.03.22 um 09:43 schrieb Zhou1, Tao:
>>>> -----Original Message-----
>>>> From: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>
>>>> Sent: Monday, March 28, 2022 2:25 PM
>
>[…]
>
>>>> +static uint32_t vcn_v2_6_query_poison_by_instance(struct
>amdgpu_device *adev,
>>>> +			uint32_t instance, uint32_t sub_block) {
>>>> +	uint32_t poison_stat = 0, reg_value = 0;
>>>> +
>>>> +	switch (sub_block) {
>>>> +	case AMDGPU_VCN_V2_6_VCPU_VCODEC:
>>>> +		reg_value = RREG32_SOC15(VCN, instance,
>mmUVD_RAS_VCPU_VCODEC_STATUS);
>>>> +		poison_stat = REG_GET_FIELD(reg_value,
>UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
>>>> +		break;
>>>> +	default:
>>>> +		break;
>>>> +	};
>>>> +
>>>> +	if (poison_stat)
>>>> +		dev_info(adev->dev, "Poison detected in VCN%d,
>sub_block%d\n",
>>>> +			instance, sub_block);
>>>
>>> What should a user do with that information? Faulty hardware, …?
>>
>> [Mohammad]: This message will help to identify the faulty hardware,
>> the hardware ID will also log along with poison, help to identify
>> among multiple hardware installed on the system.
>
>Thank you for clarifying. If it’s indeed faulty hardware, should the log level be
>increased to be an error? Keep in mind, that normal ignorant users (like me)
>are reading the message, and it’d be great to guide them a little. They do not
>know what “Poison“ means I guess. Maybe:
>
>A hardware corruption was found indicating the device might be faulty.
>(Poison detected in VCN%d, sub_block%d)\n
>
>(Keep in mind, I do not know anything about RAS.)

[Mohammad]: It is an error condition, but this is just an informational message, which could be ignored as well, because VCN only consumed the poison; it did not create it.

>
>>>> +
>>>> +	return poison_stat;
>>>> +}
>>>> +
>>>> +static bool vcn_v2_6_query_poison_status(struct amdgpu_device
>*adev) {
>>>> +	uint32_t inst, sub;
>>>> +	uint32_t poison_stat = 0;
>>>> +
>>>> +	for (inst = 0; inst < adev->vcn.num_vcn_inst; inst++)
>>>> +		for (sub = 0; sub < AMDGPU_VCN_V2_6_MAX_SUB_BLOCK;
>sub++)
>>>> +			poison_stat +=
>>>> +			vcn_v2_6_query_poison_by_instance(adev, inst,
>sub);
>>>> +
>>>> +	return poison_stat ? true : false;
>>>>
>>>> [Tao] just want to confirm the logic, if the POISONED_PF of one
>>>> instance is 1
>>> and another is 0, the poison_stat is true, correct?
>>>> Can we add a print message for this case? Same question to JPEG.
>>
>> [Mohammad]: Message will be printed on function block ahead of the
>function.
>>
>>> Is the `dev_info` message in `vcn_v2_6_query_poison_by_instance()`
>>> doing what you want?
>>>
>>> Also, instead of `poison_stat ? true : false;` a common pattern is
>>> `!!poison_stat` I believe.
>
>[…]
>
>> [Mohammad]: Noted the change. Will make to return !!poison_stat ? true
>> : false;
>
>No, just
>
>     return !!poison_stat;

[Mohammad]: Noted. I realized !!poison_stat  is enough after sending the reply.
>
>[…]
>
>
>Kind regards,
>
>Paul

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
  2022-03-28  8:47           ` Ziya, Mohammad zafar
@ 2022-03-28  9:37             ` Paul Menzel
  2022-03-28  9:49               ` Ziya, Mohammad zafar
  0 siblings, 1 reply; 20+ messages in thread
From: Paul Menzel @ 2022-03-28  9:37 UTC (permalink / raw)
  To: Mohammad Zafar Ziya, Tao Zhou; +Cc: Lijo Lazar, amd-gfx, Hawking Zhang


Dear Mohammad,


Am 28.03.22 um 10:47 schrieb Ziya, Mohammad zafar:

[…]

>> -----Original Message-----
>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>> Sent: Monday, March 28, 2022 1:39 PM

>> Am 28.03.22 um 10:00 schrieb Ziya, Mohammad zafar:
>>
>> […]
>>
>>>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>>>> Sent: Monday, March 28, 2022 1:22 PM

>>>> Am 28.03.22 um 09:43 schrieb Zhou1, Tao:
>>>>> -----Original Message-----
>>>>> From: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>
>>>>> Sent: Monday, March 28, 2022 2:25 PM
>>
>> […]
>>
>>>>> +static uint32_t vcn_v2_6_query_poison_by_instance(struct amdgpu_device *adev,
>>>>> +			uint32_t instance, uint32_t sub_block) {
>>>>> +	uint32_t poison_stat = 0, reg_value = 0;
>>>>> +
>>>>> +	switch (sub_block) {
>>>>> +	case AMDGPU_VCN_V2_6_VCPU_VCODEC:
>>>>> +		reg_value = RREG32_SOC15(VCN, instance, mmUVD_RAS_VCPU_VCODEC_STATUS);
>>>>> +		poison_stat = REG_GET_FIELD(reg_value, UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
>>>>> +		break;
>>>>> +	default:
>>>>> +		break;
>>>>> +	};
>>>>> +
>>>>> +	if (poison_stat)
>>>>> +		dev_info(adev->dev, "Poison detected in VCN%d, sub_block%d\n",
>>>>> +			instance, sub_block);
>>>>
>>>> What should a user do with that information? Faulty hardware, …?
>>>
>>> [Mohammad]: This message will help to identify the faulty hardware,
>>> the hardware ID will also log along with poison, help to identify
>>> among multiple hardware installed on the system.
>>
>> Thank you for clarifying. If it’s indeed faulty hardware, should the log level be
>> increased to be an error? Keep in mind, that normal ignorant users (like me)
>> are reading the message, and it’d be great to guide them a little. They do not
>> know what “Poison“ means I guess. Maybe:
>>
>> A hardware corruption was found indicating the device might be faulty.
>> (Poison detected in VCN%d, sub_block%d)\n
>>
>> (Keep in mind, I do not know anything about RAS.)
>
> [Mohammad]: It is an error condition, but this is just an information
> message which could have been ignored as well because VCN just
> consumed the poison, not created.

Sorry, I have never seen these messages in `dmesg`, so could you give an
example log, please, showing what the user would see?


Kind regards,

Paul

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
  2022-03-28  9:37             ` Paul Menzel
@ 2022-03-28  9:49               ` Ziya, Mohammad zafar
  2022-03-28  9:55                 ` Paul Menzel
  0 siblings, 1 reply; 20+ messages in thread
From: Ziya, Mohammad zafar @ 2022-03-28  9:49 UTC (permalink / raw)
  To: Paul Menzel, Zhou1, Tao; +Cc: Lazar, Lijo, amd-gfx, Zhang, Hawking

[AMD Official Use Only]

Dear Paul,

Comment inline.

Regards,
Zafar

>-----Original Message-----
>From: Paul Menzel <pmenzel@molgen.mpg.de>
>Sent: Monday, March 28, 2022 3:08 PM
>To: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>; Zhou1, Tao
><Tao.Zhou1@amd.com>
>Cc: Lazar, Lijo <Lijo.Lazar@amd.com>; amd-gfx@lists.freedesktop.org; Zhang,
>Hawking <Hawking.Zhang@amd.com>
>Subject: Re: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
>
>
>Dear Mohammad,
>
>
>Am 28.03.22 um 10:47 schrieb Ziya, Mohammad zafar:
>
>[…]
>
>>> -----Original Message-----
>>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>>> Sent: Monday, March 28, 2022 1:39 PM
>
>>> Am 28.03.22 um 10:00 schrieb Ziya, Mohammad zafar:
>>>
>>> […]
>>>
>>>>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>>>>> Sent: Monday, March 28, 2022 1:22 PM
>
>>>>> Am 28.03.22 um 09:43 schrieb Zhou1, Tao:
>>>>>> -----Original Message-----
>>>>>> From: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>
>>>>>> Sent: Monday, March 28, 2022 2:25 PM
>>>
>>> […]
>>>
>>>>>> +static uint32_t vcn_v2_6_query_poison_by_instance(struct
>amdgpu_device *adev,
>>>>>> +			uint32_t instance, uint32_t sub_block) {
>>>>>> +	uint32_t poison_stat = 0, reg_value = 0;
>>>>>> +
>>>>>> +	switch (sub_block) {
>>>>>> +	case AMDGPU_VCN_V2_6_VCPU_VCODEC:
>>>>>> +		reg_value = RREG32_SOC15(VCN, instance,
>mmUVD_RAS_VCPU_VCODEC_STATUS);
>>>>>> +		poison_stat = REG_GET_FIELD(reg_value,
>UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
>>>>>> +		break;
>>>>>> +	default:
>>>>>> +		break;
>>>>>> +	};
>>>>>> +
>>>>>> +	if (poison_stat)
>>>>>> +		dev_info(adev->dev, "Poison detected in VCN%d,
>sub_block%d\n",
>>>>>> +			instance, sub_block);
>>>>>
>>>>> What should a user do with that information? Faulty hardware, …?
>>>>
>>>> [Mohammad]: This message will help to identify the faulty hardware,
>>>> the hardware ID will also log along with poison, help to identify
>>>> among multiple hardware installed on the system.
>>>
>>> Thank you for clarifying. If it’s indeed faulty hardware, should the
>>> log level be increased to be an error? Keep in mind, that normal
>>> ignorant users (like me) are reading the message, and it’d be great
>>> to guide them a little. They do not know what “Poison“ means I guess.
>Maybe:
>>>
>>> A hardware corruption was found indicating the device might be faulty.
>>> (Poison detected in VCN%d, sub_block%d)\n
>>>
>>> (Keep in mind, I do not know anything about RAS.)
>>
>> [Mohammad]: It is an error condition, but this is just an information
>> message which could have been ignored as well because VCN just
>> consumed the poison, not created.
>
>Sorry, I have never seen these message in `dmesg`, so could you give an
>example log please, what the user would see?
>

[Mohammad]: [  231.181316] amdgpu 0000:8a:00.0: amdgpu: Poison detected in VCN0, sub_block0

Sample message from amdgpu " [  237.013029] amdgpu 0000:8a:00.0: amdgpu: HDCP: optional hdcp ta ucode is not available "
>
>Kind regards,
>
>Paul

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
  2022-03-28  9:49               ` Ziya, Mohammad zafar
@ 2022-03-28  9:55                 ` Paul Menzel
  2022-03-28 10:11                   ` Lazar, Lijo
  0 siblings, 1 reply; 20+ messages in thread
From: Paul Menzel @ 2022-03-28  9:55 UTC (permalink / raw)
  To: Mohammad Zafar Ziya, Tao Zhou; +Cc: Lijo Lazar, amd-gfx, Hawking Zhang

Dear Mohammad,


Am 28.03.22 um 11:49 schrieb Ziya, Mohammad zafar:

>> -----Original Message-----
>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>> Sent: Monday, March 28, 2022 3:08 PM

>> Am 28.03.22 um 10:47 schrieb Ziya, Mohammad zafar:
>>
>> […]
>>
>>>> -----Original Message-----
>>>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>>>> Sent: Monday, March 28, 2022 1:39 PM
>>
>>>> Am 28.03.22 um 10:00 schrieb Ziya, Mohammad zafar:
>>>>
>>>> […]
>>>>
>>>>>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>>>>>> Sent: Monday, March 28, 2022 1:22 PM
>>
>>>>>> Am 28.03.22 um 09:43 schrieb Zhou1, Tao:
>>>>>>> -----Original Message-----
>>>>>>> From: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>
>>>>>>> Sent: Monday, March 28, 2022 2:25 PM
>>>>
>>>> […]
>>>>
>>>>>>> +static uint32_t vcn_v2_6_query_poison_by_instance(struct amdgpu_device *adev,
>>>>>>> +			uint32_t instance, uint32_t sub_block) {
>>>>>>> +	uint32_t poison_stat = 0, reg_value = 0;
>>>>>>> +
>>>>>>> +	switch (sub_block) {
>>>>>>> +	case AMDGPU_VCN_V2_6_VCPU_VCODEC:
>>>>>>> +		reg_value = RREG32_SOC15(VCN, instance, mmUVD_RAS_VCPU_VCODEC_STATUS);
>>>>>>> +		poison_stat = REG_GET_FIELD(reg_value, UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
>>>>>>> +		break;
>>>>>>> +	default:
>>>>>>> +		break;
>>>>>>> +	};
>>>>>>> +
>>>>>>> +	if (poison_stat)
>>>>>>> +		dev_info(adev->dev, "Poison detected in VCN%d, sub_block%d\n",
>>>>>>> +			instance, sub_block);
>>>>>>
>>>>>> What should a user do with that information? Faulty hardware, …?
>>>>>
>>>>> [Mohammad]: This message will help to identify the faulty hardware,
>>>>> the hardware ID will also log along with poison, help to identify
>>>>> among multiple hardware installed on the system.
>>>>
>>>> Thank you for clarifying. If it’s indeed faulty hardware, should the
>>>> log level be increased to be an error? Keep in mind, that normal
>>>> ignorant users (like me) are reading the message, and it’d be great
>>>> to guide them a little. They do not know what “Poison“ means I guess. Maybe:
>>>>
>>>> A hardware corruption was found indicating the device might be faulty.
>>>> (Poison detected in VCN%d, sub_block%d)\n
>>>>
>>>> (Keep in mind, I do not know anything about RAS.)
>>>
>>> [Mohammad]: It is an error condition, but this is just an information
>>> message which could have been ignored as well because VCN just
>>> consumed the poison, not created.
>>
>> Sorry, I have never seen these message in `dmesg`, so could you give an
>> example log please, what the user would see?
>>
> 
> [Mohammad]: [  231.181316] amdgpu 0000:8a:00.0: amdgpu: Poison detected in VCN0, sub_block0
> 
> Sample message from amdgpu " [  237.013029] amdgpu 0000:8a:00.0: amdgpu: HDCP: optional hdcp ta ucode is not available "

Hmm, that is six seconds later, so, if Linux logs other stuff in 
between, no idea if the connection will be made.

Both messages read like debug messages, with normal users not having a 
clue what to do. Can that be improved by rewording them?


Kind regards,

Paul

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
  2022-03-28  9:55                 ` Paul Menzel
@ 2022-03-28 10:11                   ` Lazar, Lijo
  0 siblings, 0 replies; 20+ messages in thread
From: Lazar, Lijo @ 2022-03-28 10:11 UTC (permalink / raw)
  To: Paul Menzel, Mohammad Zafar Ziya, Tao Zhou; +Cc: amd-gfx, Hawking Zhang



On 3/28/2022 3:25 PM, Paul Menzel wrote:
> Dear Mohammad,
> 
> 
> Am 28.03.22 um 11:49 schrieb Ziya, Mohammad zafar:
> 
>>> -----Original Message-----
>>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>>> Sent: Monday, March 28, 2022 3:08 PM
> 
>>> Am 28.03.22 um 10:47 schrieb Ziya, Mohammad zafar:
>>>
>>> […]
>>>
>>>>> -----Original Message-----
>>>>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>>>>> Sent: Monday, March 28, 2022 1:39 PM
>>>
>>>>> Am 28.03.22 um 10:00 schrieb Ziya, Mohammad zafar:
>>>>>
>>>>> […]
>>>>>
>>>>>>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>>>>>>> Sent: Monday, March 28, 2022 1:22 PM
>>>
>>>>>>> Am 28.03.22 um 09:43 schrieb Zhou1, Tao:
>>>>>>>> -----Original Message-----
>>>>>>>> From: Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>
>>>>>>>> Sent: Monday, March 28, 2022 2:25 PM
>>>>>
>>>>> […]
>>>>>
>>>>>>>> +static uint32_t vcn_v2_6_query_poison_by_instance(struct 
>>>>>>>> amdgpu_device *adev,
>>>>>>>> +            uint32_t instance, uint32_t sub_block) {
>>>>>>>> +    uint32_t poison_stat = 0, reg_value = 0;
>>>>>>>> +
>>>>>>>> +    switch (sub_block) {
>>>>>>>> +    case AMDGPU_VCN_V2_6_VCPU_VCODEC:
>>>>>>>> +        reg_value = RREG32_SOC15(VCN, instance, 
>>>>>>>> mmUVD_RAS_VCPU_VCODEC_STATUS);
>>>>>>>> +        poison_stat = REG_GET_FIELD(reg_value, 
>>>>>>>> UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
>>>>>>>> +        break;
>>>>>>>> +    default:
>>>>>>>> +        break;
>>>>>>>> +    };
>>>>>>>> +
>>>>>>>> +    if (poison_stat)
>>>>>>>> +        dev_info(adev->dev, "Poison detected in VCN%d, 
>>>>>>>> sub_block%d\n",
>>>>>>>> +            instance, sub_block);
>>>>>>>
>>>>>>> What should a user do with that information? Faulty hardware, …?
>>>>>>
>>>>>> [Mohammad]: This message will help to identify the faulty hardware,
>>>>>> the hardware ID will also log along with poison, help to identify
>>>>>> among multiple hardware installed on the system.
>>>>>
>>>>> Thank you for clarifying. If it’s indeed faulty hardware, should the
>>>>> log level be increased to be an error? Keep in mind, that normal
>>>>> ignorant users (like me) are reading the message, and it’d be great
>>>>> to guide them a little. They do not know what “Poison“ means I 
>>>>> guess. Maybe:
>>>>>
>>>>> A hardware corruption was found indicating the device might be faulty.
>>>>> (Poison detected in VCN%d, sub_block%d)\n
>>>>>
>>>>> (Keep in mind, I do not know anything about RAS.)
>>>>
>>>> [Mohammad]: It is an error condition, but this is just an information
>>>> message which could have been ignored as well because VCN just
>>>> consumed the poison, not created.
>>>
>>> Sorry, I have never seen these message in `dmesg`, so could you give an
>>> example log please, what the user would see?
>>>
>>
>> [Mohammad]: [  231.181316] amdgpu 0000:8a:00.0: amdgpu: Poison 
>> detected in VCN0, sub_block0
>>
>> Sample message from amdgpu " [  237.013029] amdgpu 0000:8a:00.0: 
>> amdgpu: HDCP: optional hdcp ta ucode is not available "
> 
> Hmm, that is six seconds later, so, if Linux logs other stuff in 
> between, no idea if the connection will be made.
> 
> Both messages read like debug message, with normal users not having a 
> clue what to do. Can that be improved by rewording them?
> 

Hi Paul,

In general, when this is detected there will be a subsequent recovery 
action done by amdgpu. The above message is mainly to identify why the 
recovery action happened. The steps to be taken for recovery are a 
"work-in-progress".

The occurrence of such a thing is expected to be rare, and a general user 
doesn't need to do anything on seeing this. If something really 
unexpected does happen in such a rare case, this message in dmesg helps 
to identify what happened and whether proper action was taken by the driver.

Thanks,
Lijo

> 
> Kind regards,
> 
> Paul

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2022-03-28 10:11 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-28  6:24 [PATCH v4 0/6] VCN and JPEG RAS poison detection Mohammad Zafar Ziya
2022-03-28  6:24 ` [PATCH v4 1/6] drm/amdgpu: Add vcn and jpeg ras support flag Mohammad Zafar Ziya
2022-03-28  6:24 ` [PATCH v4 2/6] drm/amdgpu/vcn: Add vcn ras support Mohammad Zafar Ziya
2022-03-28  6:40   ` Paul Menzel
2022-03-28  6:24 ` [PATCH v4 3/6] drm/amdgpu/jpeg: Add jpeg block " Mohammad Zafar Ziya
2022-03-28  6:24 ` [PATCH v4 4/6] drm/amdgpu/vcn: vcn and jpeg ver 2.6 ras register definition Mohammad Zafar Ziya
2022-03-28  6:24 ` [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support Mohammad Zafar Ziya
2022-03-28  7:43   ` Zhou1, Tao
2022-03-28  7:52     ` Paul Menzel
2022-03-28  8:00       ` Ziya, Mohammad zafar
2022-03-28  8:08         ` Paul Menzel
2022-03-28  8:47           ` Ziya, Mohammad zafar
2022-03-28  9:37             ` Paul Menzel
2022-03-28  9:49               ` Ziya, Mohammad zafar
2022-03-28  9:55                 ` Paul Menzel
2022-03-28 10:11                   ` Lazar, Lijo
2022-03-28  6:24 ` [PATCH v4 6/6] drm/amdgpu/jpeg: jpeg " Mohammad Zafar Ziya
2022-03-28  6:39 ` [PATCH v4 0/6] VCN and JPEG RAS poison detection Paul Menzel
2022-03-28  6:58   ` Ziya, Mohammad zafar
2022-03-28  8:03     ` Zhou1, Tao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.