All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Lazar, Lijo" <Lijo.Lazar@amd.com>
To: "Tuikov, Luben" <Luben.Tuikov@amd.com>,
	"amd-gfx@lists.freedesktop.org" <amd-gfx@lists.freedesktop.org>
Cc: "Deucher, Alexander" <Alexander.Deucher@amd.com>,
	"Tuikov, Luben" <Luben.Tuikov@amd.com>,
	"Clements, John" <John.Clements@amd.com>,
	"Koenig,  Christian" <Christian.Koenig@amd.com>,
	"Zhang, Hawking" <Hawking.Zhang@amd.com>
Subject: RE: [PATCH 3/3] drm/amdgpu: Use delayed work to collect RAS error counters
Date: Wed, 26 May 2021 11:00:52 +0000	[thread overview]
Message-ID: <CH0PR12MB53487EE4BD75A8B5493F742D97249@CH0PR12MB5348.namprd12.prod.outlook.com> (raw)
In-Reply-To: <20210521211836.4839-3-luben.tuikov@amd.com>

[AMD Official Use Only]

Scheduling an error status query just based on IOCTL doesn't sound like a sound approach. What if driver needs to handle errors based on that - for ex: if the number of correctable errors exceed a certain threshold?

IMO, I'm more aligned to Luben's original approach of having something waiting in the background - instead of a periodic timer based trigger, it could be an event based trigger.  Event may be an ioctl, error handler timer ticks or something else.

Thanks,
Lijo

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Luben Tuikov
Sent: Saturday, May 22, 2021 2:49 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Clements, John <John.Clements@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: [PATCH 3/3] drm/amdgpu: Use delayed work to collect RAS error counters

On Context Query2 IOCTL return the correctable and uncorrectable errors in O(1) fashion, from cached values, and schedule a delayed work function to calculate and cache them for the next such IOCTL.

Cc: Alexander Deucher <Alexander.Deucher@amd.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: John Clements <john.clements@amd.com>
Cc: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 32 +++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 +++++++++++++++++++++++++  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 ++++
 3 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index bb0cfe871aba..4e95d255960b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -331,10 +331,13 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev,
 	return 0;
 }
 
+#define AMDGPU_RAS_COUNTE_DELAY_MS 3000
+
 static int amdgpu_ctx_query2(struct amdgpu_device *adev,
-	struct amdgpu_fpriv *fpriv, uint32_t id,
-	union drm_amdgpu_ctx_out *out)
+			     struct amdgpu_fpriv *fpriv, uint32_t id,
+			     union drm_amdgpu_ctx_out *out)
 {
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct amdgpu_ctx *ctx;
 	struct amdgpu_ctx_mgr *mgr;
 
@@ -361,6 +364,31 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
 	if (atomic_read(&ctx->guilty))
 		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
 
+	if (adev->ras_enabled && con) {
+		/* Return the cached values in O(1),
+		 * and schedule delayed work to cache
+		 * new vaues.
+		 */
+		int ce_count, ue_count;
+
+		ce_count = atomic_read(&con->ras_ce_count);
+		ue_count = atomic_read(&con->ras_ue_count);
+
+		if (ce_count != ctx->ras_counter_ce) {
+			ctx->ras_counter_ce = ce_count;
+			out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
+		}
+
+		if (ue_count != ctx->ras_counter_ue) {
+			ctx->ras_counter_ue = ue_count;
+			out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
+		}
+
+		if (!delayed_work_pending(&con->ras_counte_delay_work))
+			schedule_delayed_work(&con->ras_counte_delay_work,
+				  msecs_to_jiffies(AMDGPU_RAS_COUNTE_DELAY_MS));
+	}
+
 	mutex_unlock(&mgr->lock);
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ed3c43e8b0b5..80f576098318 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -27,6 +27,7 @@
 #include <linux/uaccess.h>
 #include <linux/reboot.h>
 #include <linux/syscalls.h>
+#include <linux/pm_runtime.h>
 
 #include "amdgpu.h"
 #include "amdgpu_ras.h"
@@ -2116,6 +2117,30 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 		adev->ras_hw_enabled & amdgpu_ras_mask;  }
 
+static void amdgpu_ras_counte_dw(struct work_struct *work) {
+	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
+					      ras_counte_delay_work.work);
+	struct amdgpu_device *adev = con->adev;
+	struct drm_device *dev = &adev->ddev;
+	unsigned long ce_count, ue_count;
+	int res;
+
+	res = pm_runtime_get_sync(dev->dev);
+	if (res < 0)
+		goto Out;
+
+	/* Cache new values.
+	 */
+	amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
+	atomic_set(&con->ras_ce_count, ce_count);
+	atomic_set(&con->ras_ue_count, ue_count);
+
+	pm_runtime_mark_last_busy(dev->dev);
+Out:
+	pm_runtime_put_autosuspend(dev->dev);
+}
+
 int amdgpu_ras_init(struct amdgpu_device *adev)  {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -2130,6 +2155,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	if (!con)
 		return -ENOMEM;
 
+	con->adev = adev;
+	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
+	atomic_set(&con->ras_ce_count, 0);
+	atomic_set(&con->ras_ue_count, 0);
+
 	con->objs = (struct ras_manager *)(con + 1);
 
 	amdgpu_ras_set_context(adev, con);
@@ -2233,6 +2263,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 			 struct ras_fs_if *fs_info,
 			 struct ras_ih_if *ih_info)
 {
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	unsigned long ue_count, ce_count;
 	int r;
 
 	/* disable RAS feature per IP block if it is not supported */ @@ -2273,6 +2305,12 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 	if (r)
 		goto sysfs;
 
+	/* Those are the cached values at init.
+	 */
+	amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
+	atomic_set(&con->ras_ce_count, ce_count);
+	atomic_set(&con->ras_ue_count, ue_count);
+
 	return 0;
 cleanup:
 	amdgpu_ras_sysfs_remove(adev, ras_block); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 10fca0393106..256cea5d34f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -340,6 +340,11 @@ struct amdgpu_ras {
 
 	/* disable ras error count harvest in recovery */
 	bool disable_ras_err_cnt_harvest;
+
+	/* RAS count errors delayed work */
+	struct delayed_work ras_counte_delay_work;
+	atomic_t ras_ue_count;
+	atomic_t ras_ce_count;
 };
 
 struct ras_fs_data {
--
2.31.1.527.g2d677e5b15

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C3686015f68f84c9088ab08d91c9e0fcf%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637572287465788021%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=HbdtryTNLwzF862vg6E%2BwKBHmrw8NFz3gKQsU9ggdOk%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

  parent reply	other threads:[~2021-05-26 11:00 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-05-21 21:18 [PATCH 1/3] drm/amdgpu: Don't query CE and UE errors Luben Tuikov
2021-05-21 21:18 ` Luben Tuikov
2021-05-21 21:18 ` [PATCH 2/3] drm/amdgpu: Fix RAS function interface Luben Tuikov
2021-05-21 21:18 ` [PATCH 3/3] drm/amdgpu: Use delayed work to collect RAS error counters Luben Tuikov
2021-05-25 22:03   ` Alex Deucher
2021-05-25 23:56     ` Luben Tuikov
2021-05-26 11:00   ` Lazar, Lijo [this message]
2021-05-26 15:12     ` Luben Tuikov
2021-05-26 16:05       ` Lazar, Lijo
2021-05-26 16:11         ` Alex Deucher
2021-05-26  0:40 [PATCH 1/3] drm/amdgpu: Don't query CE and UE errors Luben Tuikov
2021-05-26  0:40 ` [PATCH 3/3] drm/amdgpu: Use delayed work to collect RAS error counters Luben Tuikov
2021-05-26 10:15   ` Christian König
2021-05-26 14:50     ` Luben Tuikov
2021-05-26 16:43 [PATCH 1/3] drm/amdgpu: Don't query CE and UE errors Luben Tuikov
2021-05-26 16:43 ` [PATCH 3/3] drm/amdgpu: Use delayed work to collect RAS error counters Luben Tuikov
2021-05-26 20:53   ` Deucher, Alexander
2021-05-27  9:42   ` Christian König

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CH0PR12MB53487EE4BD75A8B5493F742D97249@CH0PR12MB5348.namprd12.prod.outlook.com \
    --to=lijo.lazar@amd.com \
    --cc=Alexander.Deucher@amd.com \
    --cc=Christian.Koenig@amd.com \
    --cc=Hawking.Zhang@amd.com \
    --cc=John.Clements@amd.com \
    --cc=Luben.Tuikov@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.