[PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)
@ 2022-04-20 11:30 Tao Zhou
  2022-04-20 11:30 ` [PATCH 2/3] drm/amdgpu: add RAS poison consumption " Tao Zhou
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Tao Zhou @ 2022-04-20 11:30 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, Lijo.Lazar,
	Mohammadzafar.Ziya, YiPeng.Chai
  Cc: Tao Zhou

Prepare for the implementation of poison consumption handler.

v2: separate umc handler from poison creation.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 70 ++++++++++++++++---------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4bbed76b79c8..22477f76913a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1515,12 +1515,45 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 /* ras fs end */
 
 /* ih begin */
+static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
+				struct amdgpu_iv_entry *entry)
+{
+	dev_info(obj->adev->dev,
+		"Poison is created, no user action is needed.\n");
+}
+
+static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
+				struct amdgpu_iv_entry *entry)
+{
+	struct ras_ih_data *data = &obj->ih_data;
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	int ret;
+
+	if (!data->cb)
+		return;
+
+	/* Let IP handle its data, maybe we need get the output
+	 * from the callback to update the error type/count, etc
+	 */
+	ret = data->cb(obj->adev, &err_data, entry);
+	/* ue will trigger an interrupt, and in that case
+	 * we need do a reset to recovery the whole system.
+	 * But leave IP do that recovery, here we just dispatch
+	 * the error.
+	 */
+	if (ret == AMDGPU_RAS_SUCCESS) {
+		/* these counts could be left as 0 if
+		 * some blocks do not count error number
+		 */
+		obj->err_data.ue_count += err_data.ue_count;
+		obj->err_data.ce_count += err_data.ce_count;
+	}
+}
+
 static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
 {
 	struct ras_ih_data *data = &obj->ih_data;
 	struct amdgpu_iv_entry entry;
-	int ret;
-	struct ras_err_data err_data = {0, 0, 0, NULL};
 
 	while (data->rptr != data->wptr) {
 		rmb();
@@ -1531,30 +1564,15 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
 		data->rptr = (data->aligned_element_size +
 				data->rptr) % data->ring_size;
 
-		if (data->cb) {
-			if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
-			    obj->head.block == AMDGPU_RAS_BLOCK__UMC)
-				dev_info(obj->adev->dev,
-						"Poison is created, no user action is needed.\n");
-			else {
-				/* Let IP handle its data, maybe we need get the output
-				 * from the callback to udpate the error type/count, etc
-				 */
-				memset(&err_data, 0, sizeof(err_data));
-				ret = data->cb(obj->adev, &err_data, &entry);
-				/* ue will trigger an interrupt, and in that case
-				 * we need do a reset to recovery the whole system.
-				 * But leave IP do that recovery, here we just dispatch
-				 * the error.
-				 */
-				if (ret == AMDGPU_RAS_SUCCESS) {
-					/* these counts could be left as 0 if
-					 * some blocks do not count error number
-					 */
-					obj->err_data.ue_count += err_data.ue_count;
-					obj->err_data.ce_count += err_data.ce_count;
-				}
-			}
+		if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
+			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+				amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
+		} else {
+			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+				amdgpu_ras_interrupt_umc_handler(obj, &entry);
+			else
+				dev_warn(obj->adev->dev,
+					"No RAS interrupt handler for non-UMC block with poison disabled.\n");
 		}
 	}
 }
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/3] drm/amdgpu: add RAS poison consumption handler (v2)
  2022-04-20 11:30 [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Tao Zhou
@ 2022-04-20 11:30 ` Tao Zhou
  2022-04-20 11:30 ` [PATCH 3/3] drm/amdgpu: add RAS fatal error interrupt handler Tao Zhou
  2022-04-21 14:54 ` [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Zhang, Hawking
  2 siblings, 0 replies; 7+ messages in thread
From: Tao Zhou @ 2022-04-20 11:30 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, Lijo.Lazar,
	Mohammadzafar.Ziya, YiPeng.Chai
  Cc: Tao Zhou

Add support for general RAS poison consumption handler.

v2: remove callback function for poison consumption.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 34 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
 2 files changed, 35 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 22477f76913a..ad3b8560b2ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1515,6 +1515,38 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 /* ras fs end */
 
 /* ih begin */
+static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
+				struct amdgpu_iv_entry *entry)
+{
+	bool poison_stat = true, need_reset = true;
+	struct amdgpu_device *adev = obj->adev;
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct amdgpu_ras_block_object *block_obj =
+		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
+
+	if (!adev->gmc.xgmi.connected_to_cpu)
+		amdgpu_umc_poison_handler(adev, &err_data, false);
+
+	/* both query_poison_status and handle_poison_consumption are optional */
+	if (block_obj && block_obj->hw_ops) {
+		if (block_obj->hw_ops->query_poison_status) {
+			poison_stat = block_obj->hw_ops->query_poison_status(adev);
+			if (!poison_stat)
+				dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
+						block_obj->ras_comm.name);
+		}
+
+		if (poison_stat && block_obj->hw_ops->handle_poison_consumption) {
+			poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
+			need_reset = poison_stat;
+		}
+	}
+
+	/* gpu reset is fallback for all failed cases */
+	if (need_reset)
+		amdgpu_ras_reset_gpu(adev);
+}
+
 static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
 				struct amdgpu_iv_entry *entry)
 {
@@ -1567,6 +1599,8 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
 		if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
 			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
 				amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
+			else
+				amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry);
 		} else {
 			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
 				amdgpu_ras_interrupt_umc_handler(obj, &entry);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 606df8869b89..c4b61785ab5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -509,6 +509,7 @@ struct amdgpu_ras_block_hw_ops {
 	void (*reset_ras_error_count)(struct amdgpu_device *adev);
 	void (*reset_ras_error_status)(struct amdgpu_device *adev);
 	bool (*query_poison_status)(struct amdgpu_device *adev);
+	bool (*handle_poison_consumption)(struct amdgpu_device *adev);
 };
 
 /* work flow
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 3/3] drm/amdgpu: add RAS fatal error interrupt handler
  2022-04-20 11:30 [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Tao Zhou
  2022-04-20 11:30 ` [PATCH 2/3] drm/amdgpu: add RAS poison consumption " Tao Zhou
@ 2022-04-20 11:30 ` Tao Zhou
  2022-04-21 14:54 ` [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Zhang, Hawking
  2 siblings, 0 replies; 7+ messages in thread
From: Tao Zhou @ 2022-04-20 11:30 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, Lijo.Lazar,
	Mohammadzafar.Ziya, YiPeng.Chai
  Cc: Tao Zhou

The fatal error handler is independent from general ras interrupt
handler since there is no related IH ring.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 15 +--------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
 3 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index ea3e8c66211f..b4cf8717f554 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -193,20 +193,7 @@ static irqreturn_t amdgpu_irq_handler(int irq, void *arg)
 	if (ret == IRQ_HANDLED)
 		pm_runtime_mark_last_busy(dev->dev);
 
-	/* For the hardware that cannot enable bif ring for both ras_controller_irq
-         * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status
-	 * register to check whether the interrupt is triggered or not, and properly
-	 * ack the interrupt if it is there
-	 */
-	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) {
-		if (adev->nbio.ras &&
-		    adev->nbio.ras->handle_ras_controller_intr_no_bifring)
-			adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
-
-		if (adev->nbio.ras &&
-		    adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
-			adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
-	}
+	amdgpu_ras_interrupt_fatal_error_handler(adev);
 
 	return ret;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ad3b8560b2ca..eaf7fd0bd5d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1515,6 +1515,26 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 /* ras fs end */
 
 /* ih begin */
+
+/* For the hardware that cannot enable bif ring for both ras_controller_irq
+ * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status
+ * register to check whether the interrupt is triggered or not, and properly
+ * ack the interrupt if it is there
+ */
+void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
+{
+	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
+		return;
+
+	if (adev->nbio.ras &&
+	    adev->nbio.ras->handle_ras_controller_intr_no_bifring)
+		adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
+
+	if (adev->nbio.ras &&
+	    adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
+		adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
+}
+
 static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
 				struct amdgpu_iv_entry *entry)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index c4b61785ab5c..b9a6fac2b8b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -683,4 +683,5 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co
 
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
 				struct amdgpu_ras_block_object *ras_block_obj);
+void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev);
 #endif
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* RE: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)
  2022-04-20 11:30 [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Tao Zhou
  2022-04-20 11:30 ` [PATCH 2/3] drm/amdgpu: add RAS poison consumption " Tao Zhou
  2022-04-20 11:30 ` [PATCH 3/3] drm/amdgpu: add RAS fatal error interrupt handler Tao Zhou
@ 2022-04-21 14:54 ` Zhang, Hawking
  2022-04-22  4:03   ` Zhou1, Tao
  2 siblings, 1 reply; 7+ messages in thread
From: Zhang, Hawking @ 2022-04-21 14:54 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Yang, Stanley, Lazar, Lijo, Ziya,
	Mohammad zafar, Chai, Thomas
  Cc: Zhou1, Tao

[AMD Official Use Only]

Hi Tao,

I was thinking more aggressive change - current amdgpu_ras_interrupt_handler only serves as umc poison (poison mode) or uncorrectable error handler (fue mode).

We can still keep it as a unified entry point, but how about check ip block first, then if it is umc, then check poison mode to decide poison creation handling or legacy uncorrectable error handling.

As discussed before, we'll optimize the poison creation handling later. so keeping poison_creation_handler and legacy umc ue handler in separated functions seems right direction to me.

Thoughts?

Regards,
Hawking

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao Zhou
Sent: Wednesday, April 20, 2022 19:30
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>; Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)

Prepare for the implementation of poison consumption handler.

v2: separate umc handler from poison creation.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 70 ++++++++++++++++---------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4bbed76b79c8..22477f76913a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1515,12 +1515,45 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 /* ras fs end */

 /* ih begin */
+static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
+                               struct amdgpu_iv_entry *entry)
+{
+       dev_info(obj->adev->dev,
+               "Poison is created, no user action is needed.\n"); }
+
+static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
+                               struct amdgpu_iv_entry *entry)
+{
+       struct ras_ih_data *data = &obj->ih_data;
+       struct ras_err_data err_data = {0, 0, 0, NULL};
+       int ret;
+
+       if (!data->cb)
+               return;
+
+       /* Let IP handle its data, maybe we need get the output
+        * from the callback to update the error type/count, etc
+        */
+       ret = data->cb(obj->adev, &err_data, entry);
+       /* ue will trigger an interrupt, and in that case
+        * we need do a reset to recovery the whole system.
+        * But leave IP do that recovery, here we just dispatch
+        * the error.
+        */
+       if (ret == AMDGPU_RAS_SUCCESS) {
+               /* these counts could be left as 0 if
+                * some blocks do not count error number
+                */
+               obj->err_data.ue_count += err_data.ue_count;
+               obj->err_data.ce_count += err_data.ce_count;
+       }
+}
+
 static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)  {
        struct ras_ih_data *data = &obj->ih_data;
        struct amdgpu_iv_entry entry;
-       int ret;
-       struct ras_err_data err_data = {0, 0, 0, NULL};

        while (data->rptr != data->wptr) {
                rmb();
@@ -1531,30 +1564,15 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
                data->rptr = (data->aligned_element_size +
                                data->rptr) % data->ring_size;

-               if (data->cb) {
-                       if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
-                           obj->head.block == AMDGPU_RAS_BLOCK__UMC)
-                               dev_info(obj->adev->dev,
-                                               "Poison is created, no user action is needed.\n");
-                       else {
-                               /* Let IP handle its data, maybe we need get the output
-                                * from the callback to udpate the error type/count, etc
-                                */
-                               memset(&err_data, 0, sizeof(err_data));
-                               ret = data->cb(obj->adev, &err_data, &entry);
-                               /* ue will trigger an interrupt, and in that case
-                                * we need do a reset to recovery the whole system.
-                                * But leave IP do that recovery, here we just dispatch
-                                * the error.
-                                */
-                               if (ret == AMDGPU_RAS_SUCCESS) {
-                                       /* these counts could be left as 0 if
-                                        * some blocks do not count error number
-                                        */
-                                       obj->err_data.ue_count += err_data.ue_count;
-                                       obj->err_data.ce_count += err_data.ce_count;
-                               }
-                       }
+               if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
+                       if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+                               amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
+               } else {
+                       if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+                               amdgpu_ras_interrupt_umc_handler(obj, &entry);
+                       else
+                               dev_warn(obj->adev->dev,
+                                       "No RAS interrupt handler for non-UMC block with poison
+disabled.\n");
                }
        }
 }
--
2.35.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* RE: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)
  2022-04-21 14:54 ` [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Zhang, Hawking
@ 2022-04-22  4:03   ` Zhou1, Tao
  2022-04-22  4:11     ` Zhang, Hawking
  0 siblings, 1 reply; 7+ messages in thread
From: Zhou1, Tao @ 2022-04-22  4:03 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx, Yang, Stanley, Lazar, Lijo, Ziya,
	Mohammad zafar, Chai, Thomas

[AMD Official Use Only]

Hi Hawking,

The logic in my patch is:

if (poison)
    if (umc)
        poison_creation_handler
    else
        poison_consumption_handler
else
    if (umc)
        umc_handler
    else
        not supported

I think your suggestion is:

if (umc)
    if (poison)
        poison_creation_handler
    else
        umc_handler
else
    if (poiosn)
        poison_consumption_handler
    else
        not supported

Either way is OK for me, I don't think one approach is better than another.

Regards,
Tao

> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang@amd.com>
> Sent: Thursday, April 21, 2022 10:54 PM
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org; Yang,
> Stanley <Stanley.Yang@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>; Ziya,
> Mohammad zafar <Mohammadzafar.Ziya@amd.com>; Chai, Thomas
> <YiPeng.Chai@amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
> Subject: RE: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)
> 
> [AMD Official Use Only]
> 
> Hi Tao,
> 
> I was thinking more aggressive change - current amdgpu_ras_interrupt_handler
> only serves as umc poison (poison mode) or uncorrectable error handler (fue
> mode).
> 
> We can still keep it as a unified entry point, but how about check ip block first,
> then if it is umc, then check poison mode to decide poison creation handling or
> legacy uncorrectable error handling.
> 
> As discussed before, we'll optimize the poison creation handling later. so
> keeping poison_creation_handler and legacy umc ue handler in separated
> functions seems right direction to me.
> 
> Thoughts?
> 
> Regards,
> Hawking
> 
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao
> Zhou
> Sent: Wednesday, April 20, 2022 19:30
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Lazar,
> Lijo <Lijo.Lazar@amd.com>; Ziya, Mohammad zafar
> <Mohammadzafar.Ziya@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
> Subject: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)
> 
> Prepare for the implementation of poison consumption handler.
> 
> v2: separate umc handler from poison creation.
> 
> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 70 ++++++++++++++++---------
>  1 file changed, 44 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 4bbed76b79c8..22477f76913a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1515,12 +1515,45 @@ static int amdgpu_ras_fs_fini(struct
> amdgpu_device *adev)
>  /* ras fs end */
> 
>  /* ih begin */
> +static void amdgpu_ras_interrupt_poison_creation_handler(struct
> ras_manager *obj,
> +                               struct amdgpu_iv_entry *entry) {
> +       dev_info(obj->adev->dev,
> +               "Poison is created, no user action is needed.\n"); }
> +
> +static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
> +                               struct amdgpu_iv_entry *entry) {
> +       struct ras_ih_data *data = &obj->ih_data;
> +       struct ras_err_data err_data = {0, 0, 0, NULL};
> +       int ret;
> +
> +       if (!data->cb)
> +               return;
> +
> +       /* Let IP handle its data, maybe we need get the output
> +        * from the callback to update the error type/count, etc
> +        */
> +       ret = data->cb(obj->adev, &err_data, entry);
> +       /* ue will trigger an interrupt, and in that case
> +        * we need do a reset to recovery the whole system.
> +        * But leave IP do that recovery, here we just dispatch
> +        * the error.
> +        */
> +       if (ret == AMDGPU_RAS_SUCCESS) {
> +               /* these counts could be left as 0 if
> +                * some blocks do not count error number
> +                */
> +               obj->err_data.ue_count += err_data.ue_count;
> +               obj->err_data.ce_count += err_data.ce_count;
> +       }
> +}
> +
>  static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)  {
>         struct ras_ih_data *data = &obj->ih_data;
>         struct amdgpu_iv_entry entry;
> -       int ret;
> -       struct ras_err_data err_data = {0, 0, 0, NULL};
> 
>         while (data->rptr != data->wptr) {
>                 rmb();
> @@ -1531,30 +1564,15 @@ static void amdgpu_ras_interrupt_handler(struct
> ras_manager *obj)
>                 data->rptr = (data->aligned_element_size +
>                                 data->rptr) % data->ring_size;
> 
> -               if (data->cb) {
> -                       if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
> -                           obj->head.block == AMDGPU_RAS_BLOCK__UMC)
> -                               dev_info(obj->adev->dev,
> -                                               "Poison is created, no user action is needed.\n");
> -                       else {
> -                               /* Let IP handle its data, maybe we need get the output
> -                                * from the callback to udpate the error type/count, etc
> -                                */
> -                               memset(&err_data, 0, sizeof(err_data));
> -                               ret = data->cb(obj->adev, &err_data, &entry);
> -                               /* ue will trigger an interrupt, and in that case
> -                                * we need do a reset to recovery the whole system.
> -                                * But leave IP do that recovery, here we just dispatch
> -                                * the error.
> -                                */
> -                               if (ret == AMDGPU_RAS_SUCCESS) {
> -                                       /* these counts could be left as 0 if
> -                                        * some blocks do not count error number
> -                                        */
> -                                       obj->err_data.ue_count += err_data.ue_count;
> -                                       obj->err_data.ce_count += err_data.ce_count;
> -                               }
> -                       }
> +               if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
> +                       if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
> +                               amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
> +               } else {
> +                       if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
> +                               amdgpu_ras_interrupt_umc_handler(obj, &entry);
> +                       else
> +                               dev_warn(obj->adev->dev,
> +                                       "No RAS interrupt handler for
> +non-UMC block with poison disabled.\n");
>                 }
>         }
>  }
> --
> 2.35.1
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)
  2022-04-22  4:03   ` Zhou1, Tao
@ 2022-04-22  4:11     ` Zhang, Hawking
  0 siblings, 0 replies; 7+ messages in thread
From: Zhang, Hawking @ 2022-04-22  4:11 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Yang, Stanley, Lazar, Lijo, Ziya,
	Mohammad zafar, Chai, Thomas

Hi Tao,

I'm thinking of checking ip block first because we might want to leverage this interrupt handler as a common entry for other RAS interrupt as well. In such case, it looks more clearer if we check the ip block first.

I agree with you either way looks good. 

You can have my RB for pushing the patches - We are always on the way to improve code quality.

The series is

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: Friday, April 22, 2022 12:04
To: Zhang, Hawking <Hawking.Zhang@amd.com>; amd-gfx@lists.freedesktop.org; Yang, Stanley <Stanley.Yang@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>; Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: RE: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)

[AMD Official Use Only]

Hi Hawking,

The logic in my patch is:

if (poison)
    if (umc)
        poison_creation_handler
    else
        poison_consumption_handler
else
    if (umc)
        umc_handler
    else
        not supported

I think your suggestion is:

if (umc)
    if (poison)
        poison_creation_handler
    else
        umc_handler
else
    if (poiosn)
        poison_consumption_handler
    else
        not supported

Either way is OK for me, I don't think one approach is better than another.

Regards,
Tao

> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang@amd.com>
> Sent: Thursday, April 21, 2022 10:54 PM
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org; 
> Yang, Stanley <Stanley.Yang@amd.com>; Lazar, Lijo 
> <Lijo.Lazar@amd.com>; Ziya, Mohammad zafar 
> <Mohammadzafar.Ziya@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
> Subject: RE: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler 
> (v2)
> 
> [AMD Official Use Only]
> 
> Hi Tao,
> 
> I was thinking more aggressive change - current 
> amdgpu_ras_interrupt_handler only serves as umc poison (poison mode) 
> or uncorrectable error handler (fue mode).
> 
> We can still keep it as a unified entry point, but how about check ip 
> block first, then if it is umc, then check poison mode to decide 
> poison creation handling or legacy uncorrectable error handling.
> 
> As discussed before, we'll optimize the poison creation handling 
> later. so keeping poison_creation_handler and legacy umc ue handler in 
> separated functions seems right direction to me.
> 
> Thoughts?
> 
> Regards,
> Hawking
> 
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao 
> Zhou
> Sent: Wednesday, April 20, 2022 19:30
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking 
> <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Lazar, 
> Lijo <Lijo.Lazar@amd.com>; Ziya, Mohammad zafar 
> <Mohammadzafar.Ziya@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
> Subject: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)
> 
> Prepare for the implementation of poison consumption handler.
> 
> v2: separate umc handler from poison creation.
> 
> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 70 
> ++++++++++++++++---------
>  1 file changed, 44 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 4bbed76b79c8..22477f76913a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1515,12 +1515,45 @@ static int amdgpu_ras_fs_fini(struct 
> amdgpu_device *adev)
>  /* ras fs end */
> 
>  /* ih begin */
> +static void amdgpu_ras_interrupt_poison_creation_handler(struct
> ras_manager *obj,
> +                               struct amdgpu_iv_entry *entry) {
> +       dev_info(obj->adev->dev,
> +               "Poison is created, no user action is needed.\n"); }
> +
> +static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
> +                               struct amdgpu_iv_entry *entry) {
> +       struct ras_ih_data *data = &obj->ih_data;
> +       struct ras_err_data err_data = {0, 0, 0, NULL};
> +       int ret;
> +
> +       if (!data->cb)
> +               return;
> +
> +       /* Let IP handle its data, maybe we need get the output
> +        * from the callback to update the error type/count, etc
> +        */
> +       ret = data->cb(obj->adev, &err_data, entry);
> +       /* ue will trigger an interrupt, and in that case
> +        * we need do a reset to recovery the whole system.
> +        * But leave IP do that recovery, here we just dispatch
> +        * the error.
> +        */
> +       if (ret == AMDGPU_RAS_SUCCESS) {
> +               /* these counts could be left as 0 if
> +                * some blocks do not count error number
> +                */
> +               obj->err_data.ue_count += err_data.ue_count;
> +               obj->err_data.ce_count += err_data.ce_count;
> +       }
> +}
> +
>  static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)  {
>         struct ras_ih_data *data = &obj->ih_data;
>         struct amdgpu_iv_entry entry;
> -       int ret;
> -       struct ras_err_data err_data = {0, 0, 0, NULL};
> 
>         while (data->rptr != data->wptr) {
>                 rmb();
> @@ -1531,30 +1564,15 @@ static void 
> amdgpu_ras_interrupt_handler(struct
> ras_manager *obj)
>                 data->rptr = (data->aligned_element_size +
>                                 data->rptr) % data->ring_size;
> 
> -               if (data->cb) {
> -                       if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
> -                           obj->head.block == AMDGPU_RAS_BLOCK__UMC)
> -                               dev_info(obj->adev->dev,
> -                                               "Poison is created, no user action is needed.\n");
> -                       else {
> -                               /* Let IP handle its data, maybe we need get the output
> -                                * from the callback to udpate the error type/count, etc
> -                                */
> -                               memset(&err_data, 0, sizeof(err_data));
> -                               ret = data->cb(obj->adev, &err_data, &entry);
> -                               /* ue will trigger an interrupt, and in that case
> -                                * we need do a reset to recovery the whole system.
> -                                * But leave IP do that recovery, here we just dispatch
> -                                * the error.
> -                                */
> -                               if (ret == AMDGPU_RAS_SUCCESS) {
> -                                       /* these counts could be left as 0 if
> -                                        * some blocks do not count error number
> -                                        */
> -                                       obj->err_data.ue_count += err_data.ue_count;
> -                                       obj->err_data.ce_count += err_data.ce_count;
> -                               }
> -                       }
> +               if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
> +                       if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
> +                               amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
> +               } else {
> +                       if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
> +                               amdgpu_ras_interrupt_umc_handler(obj, &entry);
> +                       else
> +                               dev_warn(obj->adev->dev,
> +                                       "No RAS interrupt handler for 
> +non-UMC block with poison disabled.\n");
>                 }
>         }
>  }
> --
> 2.35.1
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 3/3] drm/amdgpu: add RAS fatal error interrupt handler
  2022-04-20  3:53 [PATCH 1/3] drm/amdgpu: implement RAS interrupt handler for poison creation Tao Zhou
@ 2022-04-20  3:53 ` Tao Zhou
  0 siblings, 0 replies; 7+ messages in thread
From: Tao Zhou @ 2022-04-20  3:53 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, Lijo.Lazar,
	Mohammadzafar.Ziya, YiPeng.Chai
  Cc: Tao Zhou

The fatal error handler is independent from general ras interrupt
handler since there is no related IH ring.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 15 +--------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
 3 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index ea3e8c66211f..b4cf8717f554 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -193,20 +193,7 @@ static irqreturn_t amdgpu_irq_handler(int irq, void *arg)
 	if (ret == IRQ_HANDLED)
 		pm_runtime_mark_last_busy(dev->dev);
 
-	/* For the hardware that cannot enable bif ring for both ras_controller_irq
-         * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status
-	 * register to check whether the interrupt is triggered or not, and properly
-	 * ack the interrupt if it is there
-	 */
-	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) {
-		if (adev->nbio.ras &&
-		    adev->nbio.ras->handle_ras_controller_intr_no_bifring)
-			adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
-
-		if (adev->nbio.ras &&
-		    adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
-			adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
-	}
+	amdgpu_ras_interrupt_fatal_error_handler(adev);
 
 	return ret;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4bd3c25be809..a93024f72255 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1515,6 +1515,26 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 /* ras fs end */
 
 /* ih begin */
+
+/* For the hardware that cannot enable bif ring for both ras_controller_irq
+ * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status
+ * register to check whether the interrupt is triggered or not, and properly
+ * ack the interrupt if it is there
+ */
+void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
+{
+	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
+		return;
+
+	if (adev->nbio.ras &&
+	    adev->nbio.ras->handle_ras_controller_intr_no_bifring)
+		adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
+
+	if (adev->nbio.ras &&
+	    adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
+		adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
+}
+
 static void ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
 				struct amdgpu_iv_entry *entry)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index c4b61785ab5c..b9a6fac2b8b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -683,4 +683,5 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co
 
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
 				struct amdgpu_ras_block_object *ras_block_obj);
+void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev);
 #endif
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-04-22  4:11 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-20 11:30 [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Tao Zhou
2022-04-20 11:30 ` [PATCH 2/3] drm/amdgpu: add RAS poison consumption " Tao Zhou
2022-04-20 11:30 ` [PATCH 3/3] drm/amdgpu: add RAS fatal error interrupt handler Tao Zhou
2022-04-21 14:54 ` [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Zhang, Hawking
2022-04-22  4:03   ` Zhou1, Tao
2022-04-22  4:11     ` Zhang, Hawking
  -- strict thread matches above, loose matches on Subject: below --
2022-04-20  3:53 [PATCH 1/3] drm/amdgpu: implement RAS interrupt handler for poison creation Tao Zhou
2022-04-20  3:53 ` [PATCH 3/3] drm/amdgpu: add RAS fatal error interrupt handler Tao Zhou

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.