* [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) @ 2022-04-20 11:30 Tao Zhou 2022-04-20 11:30 ` [PATCH 2/3] drm/amdgpu: add RAS poison consumption " Tao Zhou ` (2 more replies) 0 siblings, 3 replies; 6+ messages in thread From: Tao Zhou @ 2022-04-20 11:30 UTC (permalink / raw) To: amd-gfx, hawking.zhang, stanley.yang, Lijo.Lazar, Mohammadzafar.Ziya, YiPeng.Chai Cc: Tao Zhou Prepare for the implementation of poison consumption handler. v2: separate umc handler from poison creation. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 70 ++++++++++++++++--------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 4bbed76b79c8..22477f76913a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1515,12 +1515,45 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) /* ras fs end */ /* ih begin */ +static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj, + struct amdgpu_iv_entry *entry) +{ + dev_info(obj->adev->dev, + "Poison is created, no user action is needed.\n"); +} + +static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, + struct amdgpu_iv_entry *entry) +{ + struct ras_ih_data *data = &obj->ih_data; + struct ras_err_data err_data = {0, 0, 0, NULL}; + int ret; + + if (!data->cb) + return; + + /* Let IP handle its data, maybe we need get the output + * from the callback to update the error type/count, etc + */ + ret = data->cb(obj->adev, &err_data, entry); + /* ue will trigger an interrupt, and in that case + * we need do a reset to recovery the whole system. + * But leave IP do that recovery, here we just dispatch + * the error. + */ + if (ret == AMDGPU_RAS_SUCCESS) { + /* these counts could be left as 0 if + * some blocks do not count error number + */ + obj->err_data.ue_count += err_data.ue_count; + obj->err_data.ce_count += err_data.ce_count; + } +} + static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) { struct ras_ih_data *data = &obj->ih_data; struct amdgpu_iv_entry entry; - int ret; - struct ras_err_data err_data = {0, 0, 0, NULL}; while (data->rptr != data->wptr) { rmb(); @@ -1531,30 +1564,15 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) data->rptr = (data->aligned_element_size + data->rptr) % data->ring_size; - if (data->cb) { - if (amdgpu_ras_is_poison_mode_supported(obj->adev) && - obj->head.block == AMDGPU_RAS_BLOCK__UMC) - dev_info(obj->adev->dev, - "Poison is created, no user action is needed.\n"); - else { - /* Let IP handle its data, maybe we need get the output - * from the callback to udpate the error type/count, etc - */ - memset(&err_data, 0, sizeof(err_data)); - ret = data->cb(obj->adev, &err_data, &entry); - /* ue will trigger an interrupt, and in that case - * we need do a reset to recovery the whole system. - * But leave IP do that recovery, here we just dispatch - * the error. - */ - if (ret == AMDGPU_RAS_SUCCESS) { - /* these counts could be left as 0 if - * some blocks do not count error number - */ - obj->err_data.ue_count += err_data.ue_count; - obj->err_data.ce_count += err_data.ce_count; - } - } + if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { + if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) + amdgpu_ras_interrupt_poison_creation_handler(obj, &entry); + } else { + if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) + amdgpu_ras_interrupt_umc_handler(obj, &entry); + else + dev_warn(obj->adev->dev, + "No RAS interrupt handler for non-UMC block with poison disabled.\n"); } } } -- 2.35.1 ^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 2/3] drm/amdgpu: add RAS poison consumption handler (v2) 2022-04-20 11:30 [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Tao Zhou @ 2022-04-20 11:30 ` Tao Zhou 2022-04-20 11:30 ` [PATCH 3/3] drm/amdgpu: add RAS fatal error interrupt handler Tao Zhou 2022-04-21 14:54 ` [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Zhang, Hawking 2 siblings, 0 replies; 6+ messages in thread From: Tao Zhou @ 2022-04-20 11:30 UTC (permalink / raw) To: amd-gfx, hawking.zhang, stanley.yang, Lijo.Lazar, Mohammadzafar.Ziya, YiPeng.Chai Cc: Tao Zhou Add support for general RAS poison consumption handler. v2: remove callback function for poison consumption. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 34 +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + 2 files changed, 35 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 22477f76913a..ad3b8560b2ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1515,6 +1515,38 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) /* ras fs end */ /* ih begin */ +static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj, + struct amdgpu_iv_entry *entry) +{ + bool poison_stat = true, need_reset = true; + struct amdgpu_device *adev = obj->adev; + struct ras_err_data err_data = {0, 0, 0, NULL}; + struct amdgpu_ras_block_object *block_obj = + amdgpu_ras_get_ras_block(adev, obj->head.block, 0); + + if (!adev->gmc.xgmi.connected_to_cpu) + amdgpu_umc_poison_handler(adev, &err_data, false); + + /* both query_poison_status and handle_poison_consumption are optional */ + if (block_obj && block_obj->hw_ops) { + if (block_obj->hw_ops->query_poison_status) { + poison_stat = block_obj->hw_ops->query_poison_status(adev); + if (!poison_stat) + dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", + block_obj->ras_comm.name); + } + + if (poison_stat && block_obj->hw_ops->handle_poison_consumption) { + poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); + need_reset = poison_stat; + } + } + + /* gpu reset is fallback for all failed cases */ + if (need_reset) + amdgpu_ras_reset_gpu(adev); +} + static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj, struct amdgpu_iv_entry *entry) { @@ -1567,6 +1599,8 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) amdgpu_ras_interrupt_poison_creation_handler(obj, &entry); + else + amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry); } else { if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) amdgpu_ras_interrupt_umc_handler(obj, &entry); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 606df8869b89..c4b61785ab5c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -509,6 +509,7 @@ struct amdgpu_ras_block_hw_ops { void (*reset_ras_error_count)(struct amdgpu_device *adev); void (*reset_ras_error_status)(struct amdgpu_device *adev); bool (*query_poison_status)(struct amdgpu_device *adev); + bool (*handle_poison_consumption)(struct amdgpu_device *adev); }; /* work flow -- 2.35.1 ^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 3/3] drm/amdgpu: add RAS fatal error interrupt handler 2022-04-20 11:30 [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Tao Zhou 2022-04-20 11:30 ` [PATCH 2/3] drm/amdgpu: add RAS poison consumption " Tao Zhou @ 2022-04-20 11:30 ` Tao Zhou 2022-04-21 14:54 ` [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Zhang, Hawking 2 siblings, 0 replies; 6+ messages in thread From: Tao Zhou @ 2022-04-20 11:30 UTC (permalink / raw) To: amd-gfx, hawking.zhang, stanley.yang, Lijo.Lazar, Mohammadzafar.Ziya, YiPeng.Chai Cc: Tao Zhou The fatal error handler is independent from general ras interrupt handler since there is no related IH ring. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 15 +-------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c index ea3e8c66211f..b4cf8717f554 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c @@ -193,20 +193,7 @@ static irqreturn_t amdgpu_irq_handler(int irq, void *arg) if (ret == IRQ_HANDLED) pm_runtime_mark_last_busy(dev->dev); - /* For the hardware that cannot enable bif ring for both ras_controller_irq - * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status - * register to check whether the interrupt is triggered or not, and properly - * ack the interrupt if it is there - */ - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) { - if (adev->nbio.ras && - adev->nbio.ras->handle_ras_controller_intr_no_bifring) - adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); - - if (adev->nbio.ras && - adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) - adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); - } + amdgpu_ras_interrupt_fatal_error_handler(adev); return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ad3b8560b2ca..eaf7fd0bd5d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1515,6 +1515,26 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) /* ras fs end */ /* ih begin */ + +/* For the hardware that cannot enable bif ring for both ras_controller_irq + * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status + * register to check whether the interrupt is triggered or not, and properly + * ack the interrupt if it is there + */ +void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) +{ + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) + return; + + if (adev->nbio.ras && + adev->nbio.ras->handle_ras_controller_intr_no_bifring) + adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); + + if (adev->nbio.ras && + adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) + adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); +} + static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj, struct amdgpu_iv_entry *entry) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index c4b61785ab5c..b9a6fac2b8b2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -683,4 +683,5 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object *ras_block_obj); +void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev); #endif -- 2.35.1 ^ permalink raw reply related [flat|nested] 6+ messages in thread
* RE: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) 2022-04-20 11:30 [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Tao Zhou 2022-04-20 11:30 ` [PATCH 2/3] drm/amdgpu: add RAS poison consumption " Tao Zhou 2022-04-20 11:30 ` [PATCH 3/3] drm/amdgpu: add RAS fatal error interrupt handler Tao Zhou @ 2022-04-21 14:54 ` Zhang, Hawking 2022-04-22 4:03 ` Zhou1, Tao 2 siblings, 1 reply; 6+ messages in thread From: Zhang, Hawking @ 2022-04-21 14:54 UTC (permalink / raw) To: Zhou1, Tao, amd-gfx, Yang, Stanley, Lazar, Lijo, Ziya, Mohammad zafar, Chai, Thomas Cc: Zhou1, Tao [AMD Official Use Only] Hi Tao, I was thinking more aggressive change - current amdgpu_ras_interrupt_handler only serves as umc poison (poison mode) or uncorrectable error handler (fue mode). We can still keep it as a unified entry point, but how about check ip block first, then if it is umc, then check poison mode to decide poison creation handling or legacy uncorrectable error handling. As discussed before, we'll optimize the poison creation handling later. so keeping poison_creation_handler and legacy umc ue handler in separated functions seems right direction to me. Thoughts? Regards, Hawking -----Original Message----- From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao Zhou Sent: Wednesday, April 20, 2022 19:30 To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>; Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com> Cc: Zhou1, Tao <Tao.Zhou1@amd.com> Subject: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Prepare for the implementation of poison consumption handler. v2: separate umc handler from poison creation. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 70 ++++++++++++++++--------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 4bbed76b79c8..22477f76913a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1515,12 +1515,45 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) /* ras fs end */ /* ih begin */ +static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj, + struct amdgpu_iv_entry *entry) +{ + dev_info(obj->adev->dev, + "Poison is created, no user action is needed.\n"); } + +static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, + struct amdgpu_iv_entry *entry) +{ + struct ras_ih_data *data = &obj->ih_data; + struct ras_err_data err_data = {0, 0, 0, NULL}; + int ret; + + if (!data->cb) + return; + + /* Let IP handle its data, maybe we need get the output + * from the callback to update the error type/count, etc + */ + ret = data->cb(obj->adev, &err_data, entry); + /* ue will trigger an interrupt, and in that case + * we need do a reset to recovery the whole system. + * But leave IP do that recovery, here we just dispatch + * the error. + */ + if (ret == AMDGPU_RAS_SUCCESS) { + /* these counts could be left as 0 if + * some blocks do not count error number + */ + obj->err_data.ue_count += err_data.ue_count; + obj->err_data.ce_count += err_data.ce_count; + } +} + static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) { struct ras_ih_data *data = &obj->ih_data; struct amdgpu_iv_entry entry; - int ret; - struct ras_err_data err_data = {0, 0, 0, NULL}; while (data->rptr != data->wptr) { rmb(); @@ -1531,30 +1564,15 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) data->rptr = (data->aligned_element_size + data->rptr) % data->ring_size; - if (data->cb) { - if (amdgpu_ras_is_poison_mode_supported(obj->adev) && - obj->head.block == AMDGPU_RAS_BLOCK__UMC) - dev_info(obj->adev->dev, - "Poison is created, no user action is needed.\n"); - else { - /* Let IP handle its data, maybe we need get the output - * from the callback to udpate the error type/count, etc - */ - memset(&err_data, 0, sizeof(err_data)); - ret = data->cb(obj->adev, &err_data, &entry); - /* ue will trigger an interrupt, and in that case - * we need do a reset to recovery the whole system. - * But leave IP do that recovery, here we just dispatch - * the error. - */ - if (ret == AMDGPU_RAS_SUCCESS) { - /* these counts could be left as 0 if - * some blocks do not count error number - */ - obj->err_data.ue_count += err_data.ue_count; - obj->err_data.ce_count += err_data.ce_count; - } - } + if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { + if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) + amdgpu_ras_interrupt_poison_creation_handler(obj, &entry); + } else { + if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) + amdgpu_ras_interrupt_umc_handler(obj, &entry); + else + dev_warn(obj->adev->dev, + "No RAS interrupt handler for non-UMC block with poison +disabled.\n"); } } } -- 2.35.1 ^ permalink raw reply related [flat|nested] 6+ messages in thread
* RE: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) 2022-04-21 14:54 ` [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Zhang, Hawking @ 2022-04-22 4:03 ` Zhou1, Tao 2022-04-22 4:11 ` Zhang, Hawking 0 siblings, 1 reply; 6+ messages in thread From: Zhou1, Tao @ 2022-04-22 4:03 UTC (permalink / raw) To: Zhang, Hawking, amd-gfx, Yang, Stanley, Lazar, Lijo, Ziya, Mohammad zafar, Chai, Thomas [AMD Official Use Only] Hi Hawking, The logic in my patch is: if (poison) if (umc) poison_creation_handler else poison_consumption_handler else if (umc) umc_handler else not supported I think your suggestion is: if (umc) if (poison) poison_creation_handler else umc_handler else if (poiosn) poison_consumption_handler else not supported Either way is OK for me, I don't think one approach is better than another. Regards, Tao > -----Original Message----- > From: Zhang, Hawking <Hawking.Zhang@amd.com> > Sent: Thursday, April 21, 2022 10:54 PM > To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org; Yang, > Stanley <Stanley.Yang@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>; Ziya, > Mohammad zafar <Mohammadzafar.Ziya@amd.com>; Chai, Thomas > <YiPeng.Chai@amd.com> > Cc: Zhou1, Tao <Tao.Zhou1@amd.com> > Subject: RE: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) > > [AMD Official Use Only] > > Hi Tao, > > I was thinking more aggressive change - current amdgpu_ras_interrupt_handler > only serves as umc poison (poison mode) or uncorrectable error handler (fue > mode). > > We can still keep it as a unified entry point, but how about check ip block first, > then if it is umc, then check poison mode to decide poison creation handling or > legacy uncorrectable error handling. > > As discussed before, we'll optimize the poison creation handling later. so > keeping poison_creation_handler and legacy umc ue handler in separated > functions seems right direction to me. > > Thoughts? > > Regards, > Hawking > > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao > Zhou > Sent: Wednesday, April 20, 2022 19:30 > To: amd-gfx@lists.freedesktop.org; Zhang, Hawking > <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Lazar, > Lijo <Lijo.Lazar@amd.com>; Ziya, Mohammad zafar > <Mohammadzafar.Ziya@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com> > Cc: Zhou1, Tao <Tao.Zhou1@amd.com> > Subject: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) > > Prepare for the implementation of poison consumption handler. > > v2: separate umc handler from poison creation. > > Signed-off-by: Tao Zhou <tao.zhou1@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 70 ++++++++++++++++--------- > 1 file changed, 44 insertions(+), 26 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 4bbed76b79c8..22477f76913a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -1515,12 +1515,45 @@ static int amdgpu_ras_fs_fini(struct > amdgpu_device *adev) > /* ras fs end */ > > /* ih begin */ > +static void amdgpu_ras_interrupt_poison_creation_handler(struct > ras_manager *obj, > + struct amdgpu_iv_entry *entry) { > + dev_info(obj->adev->dev, > + "Poison is created, no user action is needed.\n"); } > + > +static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, > + struct amdgpu_iv_entry *entry) { > + struct ras_ih_data *data = &obj->ih_data; > + struct ras_err_data err_data = {0, 0, 0, NULL}; > + int ret; > + > + if (!data->cb) > + return; > + > + /* Let IP handle its data, maybe we need get the output > + * from the callback to update the error type/count, etc > + */ > + ret = data->cb(obj->adev, &err_data, entry); > + /* ue will trigger an interrupt, and in that case > + * we need do a reset to recovery the whole system. > + * But leave IP do that recovery, here we just dispatch > + * the error. > + */ > + if (ret == AMDGPU_RAS_SUCCESS) { > + /* these counts could be left as 0 if > + * some blocks do not count error number > + */ > + obj->err_data.ue_count += err_data.ue_count; > + obj->err_data.ce_count += err_data.ce_count; > + } > +} > + > static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) { > struct ras_ih_data *data = &obj->ih_data; > struct amdgpu_iv_entry entry; > - int ret; > - struct ras_err_data err_data = {0, 0, 0, NULL}; > > while (data->rptr != data->wptr) { > rmb(); > @@ -1531,30 +1564,15 @@ static void amdgpu_ras_interrupt_handler(struct > ras_manager *obj) > data->rptr = (data->aligned_element_size + > data->rptr) % data->ring_size; > > - if (data->cb) { > - if (amdgpu_ras_is_poison_mode_supported(obj->adev) && > - obj->head.block == AMDGPU_RAS_BLOCK__UMC) > - dev_info(obj->adev->dev, > - "Poison is created, no user action is needed.\n"); > - else { > - /* Let IP handle its data, maybe we need get the output > - * from the callback to udpate the error type/count, etc > - */ > - memset(&err_data, 0, sizeof(err_data)); > - ret = data->cb(obj->adev, &err_data, &entry); > - /* ue will trigger an interrupt, and in that case > - * we need do a reset to recovery the whole system. > - * But leave IP do that recovery, here we just dispatch > - * the error. > - */ > - if (ret == AMDGPU_RAS_SUCCESS) { > - /* these counts could be left as 0 if > - * some blocks do not count error number > - */ > - obj->err_data.ue_count += err_data.ue_count; > - obj->err_data.ce_count += err_data.ce_count; > - } > - } > + if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { > + if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) > + amdgpu_ras_interrupt_poison_creation_handler(obj, &entry); > + } else { > + if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) > + amdgpu_ras_interrupt_umc_handler(obj, &entry); > + else > + dev_warn(obj->adev->dev, > + "No RAS interrupt handler for > +non-UMC block with poison disabled.\n"); > } > } > } > -- > 2.35.1 > ^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) 2022-04-22 4:03 ` Zhou1, Tao @ 2022-04-22 4:11 ` Zhang, Hawking 0 siblings, 0 replies; 6+ messages in thread From: Zhang, Hawking @ 2022-04-22 4:11 UTC (permalink / raw) To: Zhou1, Tao, amd-gfx, Yang, Stanley, Lazar, Lijo, Ziya, Mohammad zafar, Chai, Thomas Hi Tao, I'm thinking of checking ip block first because we might want to leverage this interrupt handler as a common entry for other RAS interrupt as well. In such case, it looks more clearer if we check the ip block first. I agree with you either way looks good. You can have my RB for pushing the patches - We are always on the way to improve code quality. The series is Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Regards, Hawking -----Original Message----- From: Zhou1, Tao <Tao.Zhou1@amd.com> Sent: Friday, April 22, 2022 12:04 To: Zhang, Hawking <Hawking.Zhang@amd.com>; amd-gfx@lists.freedesktop.org; Yang, Stanley <Stanley.Yang@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>; Ziya, Mohammad zafar <Mohammadzafar.Ziya@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com> Subject: RE: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) [AMD Official Use Only] Hi Hawking, The logic in my patch is: if (poison) if (umc) poison_creation_handler else poison_consumption_handler else if (umc) umc_handler else not supported I think your suggestion is: if (umc) if (poison) poison_creation_handler else umc_handler else if (poiosn) poison_consumption_handler else not supported Either way is OK for me, I don't think one approach is better than another. Regards, Tao > -----Original Message----- > From: Zhang, Hawking <Hawking.Zhang@amd.com> > Sent: Thursday, April 21, 2022 10:54 PM > To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org; > Yang, Stanley <Stanley.Yang@amd.com>; Lazar, Lijo > <Lijo.Lazar@amd.com>; Ziya, Mohammad zafar > <Mohammadzafar.Ziya@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com> > Cc: Zhou1, Tao <Tao.Zhou1@amd.com> > Subject: RE: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler > (v2) > > [AMD Official Use Only] > > Hi Tao, > > I was thinking more aggressive change - current > amdgpu_ras_interrupt_handler only serves as umc poison (poison mode) > or uncorrectable error handler (fue mode). > > We can still keep it as a unified entry point, but how about check ip > block first, then if it is umc, then check poison mode to decide > poison creation handling or legacy uncorrectable error handling. > > As discussed before, we'll optimize the poison creation handling > later. so keeping poison_creation_handler and legacy umc ue handler in > separated functions seems right direction to me. > > Thoughts? > > Regards, > Hawking > > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao > Zhou > Sent: Wednesday, April 20, 2022 19:30 > To: amd-gfx@lists.freedesktop.org; Zhang, Hawking > <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Lazar, > Lijo <Lijo.Lazar@amd.com>; Ziya, Mohammad zafar > <Mohammadzafar.Ziya@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com> > Cc: Zhou1, Tao <Tao.Zhou1@amd.com> > Subject: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) > > Prepare for the implementation of poison consumption handler. > > v2: separate umc handler from poison creation. > > Signed-off-by: Tao Zhou <tao.zhou1@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 70 > ++++++++++++++++--------- > 1 file changed, 44 insertions(+), 26 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 4bbed76b79c8..22477f76913a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -1515,12 +1515,45 @@ static int amdgpu_ras_fs_fini(struct > amdgpu_device *adev) > /* ras fs end */ > > /* ih begin */ > +static void amdgpu_ras_interrupt_poison_creation_handler(struct > ras_manager *obj, > + struct amdgpu_iv_entry *entry) { > + dev_info(obj->adev->dev, > + "Poison is created, no user action is needed.\n"); } > + > +static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, > + struct amdgpu_iv_entry *entry) { > + struct ras_ih_data *data = &obj->ih_data; > + struct ras_err_data err_data = {0, 0, 0, NULL}; > + int ret; > + > + if (!data->cb) > + return; > + > + /* Let IP handle its data, maybe we need get the output > + * from the callback to update the error type/count, etc > + */ > + ret = data->cb(obj->adev, &err_data, entry); > + /* ue will trigger an interrupt, and in that case > + * we need do a reset to recovery the whole system. > + * But leave IP do that recovery, here we just dispatch > + * the error. > + */ > + if (ret == AMDGPU_RAS_SUCCESS) { > + /* these counts could be left as 0 if > + * some blocks do not count error number > + */ > + obj->err_data.ue_count += err_data.ue_count; > + obj->err_data.ce_count += err_data.ce_count; > + } > +} > + > static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) { > struct ras_ih_data *data = &obj->ih_data; > struct amdgpu_iv_entry entry; > - int ret; > - struct ras_err_data err_data = {0, 0, 0, NULL}; > > while (data->rptr != data->wptr) { > rmb(); > @@ -1531,30 +1564,15 @@ static void > amdgpu_ras_interrupt_handler(struct > ras_manager *obj) > data->rptr = (data->aligned_element_size + > data->rptr) % data->ring_size; > > - if (data->cb) { > - if (amdgpu_ras_is_poison_mode_supported(obj->adev) && > - obj->head.block == AMDGPU_RAS_BLOCK__UMC) > - dev_info(obj->adev->dev, > - "Poison is created, no user action is needed.\n"); > - else { > - /* Let IP handle its data, maybe we need get the output > - * from the callback to udpate the error type/count, etc > - */ > - memset(&err_data, 0, sizeof(err_data)); > - ret = data->cb(obj->adev, &err_data, &entry); > - /* ue will trigger an interrupt, and in that case > - * we need do a reset to recovery the whole system. > - * But leave IP do that recovery, here we just dispatch > - * the error. > - */ > - if (ret == AMDGPU_RAS_SUCCESS) { > - /* these counts could be left as 0 if > - * some blocks do not count error number > - */ > - obj->err_data.ue_count += err_data.ue_count; > - obj->err_data.ce_count += err_data.ce_count; > - } > - } > + if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { > + if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) > + amdgpu_ras_interrupt_poison_creation_handler(obj, &entry); > + } else { > + if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) > + amdgpu_ras_interrupt_umc_handler(obj, &entry); > + else > + dev_warn(obj->adev->dev, > + "No RAS interrupt handler for > +non-UMC block with poison disabled.\n"); > } > } > } > -- > 2.35.1 > ^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2022-04-22 4:11 UTC | newest] Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2022-04-20 11:30 [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Tao Zhou 2022-04-20 11:30 ` [PATCH 2/3] drm/amdgpu: add RAS poison consumption " Tao Zhou 2022-04-20 11:30 ` [PATCH 3/3] drm/amdgpu: add RAS fatal error interrupt handler Tao Zhou 2022-04-21 14:54 ` [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2) Zhang, Hawking 2022-04-22 4:03 ` Zhou1, Tao 2022-04-22 4:11 ` Zhang, Hawking
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).