dri-devel.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
To: <dri-devel@lists.freedesktop.org>, <amd-gfx@lists.freedesktop.org>
Cc: horace.chen@amd.com, christian.koenig@amd.com, Monk.Liu@amd.com,
	Liu Shaoyun <Shaoyun.Liu@amd.com>
Subject: [RFC v3 5/8] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.
Date: Thu, 23 Dec 2021 13:29:41 -0500	[thread overview]
Message-ID: <20211223182941.888367-1-andrey.grodzovsky@amd.com> (raw)
In-Reply-To: <20211222221400.790842-1-andrey.grodzovsky@amd.com>

No need to to trigger another work queue inside the work queue.

v3:

Problem:
Extra reset caused by host side FLR notification
following guest side triggered reset.
Fix: Preven qeuing flr_work from mailbox irq if guest
already executing a reset.

Suggested-by: Liu Shaoyun <Shaoyun.Liu@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 9 ++++++---
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 9 ++++++---
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 9 ++++++---
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 23b066bcffb2..bdeb8e933bb4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -276,7 +276,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	if (amdgpu_device_should_recover_gpu(adev)
 		&& (!amdgpu_device_has_job_running(adev) ||
 		adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
-		amdgpu_device_gpu_recover(adev, NULL);
+		amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -301,8 +301,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
 
 	switch (event) {
 		case IDH_FLR_NOTIFICATION:
-		if (amdgpu_sriov_runtime(adev))
-			schedule_work(&adev->virt.flr_work);
+		if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+			WARN_ONCE(!queue_work(adev->reset_domain.wq,
+					      &adev->virt.flr_work),
+				  "Failed to queue work! at %s",
+				  __FUNCTION__ );
 		break;
 		case IDH_QUERY_ALIVE:
 			xgpu_ai_mailbox_send_ack(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index a35e6d87e537..dd8dc0f6028c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -308,7 +308,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 		adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
 		adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
 		adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
-		amdgpu_device_gpu_recover(adev, NULL);
+		amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -336,8 +336,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
 
 	switch (event) {
 	case IDH_FLR_NOTIFICATION:
-		if (amdgpu_sriov_runtime(adev))
-			schedule_work(&adev->virt.flr_work);
+		if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+			WARN_ONCE(!queue_work(adev->reset_domain.wq,
+					      &adev->virt.flr_work),
+				  "Failed to queue work! at %s",
+				  __FUNCTION__ );
 		break;
 		/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
 		 * it byfar since that polling thread will handle it,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index aef9d059ae52..c2afb72f97ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
 
 	/* Trigger recovery due to world switch failure */
 	if (amdgpu_device_should_recover_gpu(adev))
-		amdgpu_device_gpu_recover(adev, NULL);
+		amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -550,8 +550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
 		r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
 
 		/* only handle FLR_NOTIFY now */
-		if (!r)
-			schedule_work(&adev->virt.flr_work);
+		if (!r && !amdgpu_in_reset(adev))
+			WARN_ONCE(!queue_work(adev->reset_domain.wq,
+					      &adev->virt.flr_work),
+				  "Failed to queue work! at %s",
+				  __FUNCTION__ );
 	}
 
 	return 0;
-- 
2.25.1


      parent reply	other threads:[~2021-12-23 18:30 UTC|newest]

Thread overview: 55+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-12-22 22:04 [RFC v2 0/8] Define and use reset domain for GPU recovery in amdgpu Andrey Grodzovsky
2021-12-22 22:04 ` [RFC v2 1/8] drm/amdgpu: Introduce reset domain Andrey Grodzovsky
2021-12-22 22:05 ` [RFC v2 2/8] drm/amdgpu: Move scheduler init to after XGMI is ready Andrey Grodzovsky
2021-12-23  8:39   ` Christian König
2021-12-22 22:05 ` [RFC v2 3/8] drm/amdgpu: Fix crash on modprobe Andrey Grodzovsky
2021-12-23  8:40   ` Christian König
2021-12-22 22:05 ` [RFC v2 4/8] drm/amdgpu: Serialize non TDR gpu recovery with TDRs Andrey Grodzovsky
2021-12-23  8:41   ` Christian König
2022-01-05  9:54   ` Lazar, Lijo
2022-01-05 12:31     ` Christian König
2022-01-05 13:11       ` Lazar, Lijo
2022-01-05 13:15         ` Christian König
2022-01-05 13:26           ` Lazar, Lijo
2022-01-05 13:41             ` Christian König
2022-01-05 18:11       ` Andrey Grodzovsky
2022-01-17 19:14         ` Andrey Grodzovsky
2022-01-17 19:17           ` Christian König
2022-01-17 19:21             ` Andrey Grodzovsky
2022-01-26 15:52               ` Andrey Grodzovsky
2022-01-28 16:57                 ` Grodzovsky, Andrey
2022-02-07  2:41                   ` JingWen Chen
2022-02-07  3:08                     ` Grodzovsky, Andrey
2021-12-22 22:13 ` [RFC v2 5/8] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue Andrey Grodzovsky
2021-12-22 22:13   ` [RFC v2 6/8] drm/amdgpu: Drop hive->in_reset Andrey Grodzovsky
2021-12-22 22:13   ` [RFC v2 7/8] drm/amdgpu: Drop concurrent GPU reset protection for device Andrey Grodzovsky
2021-12-22 22:14   ` [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV Andrey Grodzovsky
2021-12-23  8:42     ` Christian König
2021-12-23 10:14       ` Liu, Monk
2021-12-24  8:58         ` Deng, Emily
2021-12-24  9:57           ` JingWen Chen
2021-12-30 18:45             ` Andrey Grodzovsky
2022-01-03 10:17               ` Christian König
2022-01-04  9:07                 ` JingWen Chen
2022-01-04 10:18                   ` Christian König
2022-01-04 10:49                     ` Liu, Monk
2022-01-04 11:36                       ` Christian König
2022-01-04 16:56                         ` Andrey Grodzovsky
2022-01-05  7:34                           ` JingWen Chen
2022-01-05  7:59                             ` Christian König
2022-01-05 18:24                               ` Andrey Grodzovsky
2022-01-06  4:59                                 ` JingWen Chen
2022-01-06  5:18                                   ` JingWen Chen
2022-01-06  9:13                                     ` Christian König
2022-01-06 19:13                                     ` Andrey Grodzovsky
2022-01-07  3:57                                       ` JingWen Chen
2022-01-07  5:46                                         ` JingWen Chen
2022-01-07 16:02                                           ` Andrey Grodzovsky
2022-01-12  6:28                                             ` JingWen Chen
2022-01-04 17:13                         ` Liu, Shaoyun
2022-01-04 20:54                           ` Andrey Grodzovsky
2022-01-05  0:01                             ` Liu, Shaoyun
2022-01-05  7:25                         ` JingWen Chen
2021-12-30 18:39           ` Andrey Grodzovsky
2021-12-23 18:07     ` Liu, Shaoyun
2021-12-23 18:29   ` Andrey Grodzovsky [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20211223182941.888367-1-andrey.grodzovsky@amd.com \
    --to=andrey.grodzovsky@amd.com \
    --cc=Monk.Liu@amd.com \
    --cc=Shaoyun.Liu@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=christian.koenig@amd.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=horace.chen@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).