All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
To: <dri-devel@lists.freedesktop.org>, <amd-gfx@lists.freedesktop.org>
Cc: horace.chen@amd.com, lijo.lazar@amd.com, jingwech@amd.com,
	christian.koenig@amd.com, Monk.Liu@amd.com,
	Liu Shaoyun <Shaoyun.Liu@amd.com>
Subject: [RFC v3 05/12] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.
Date: Tue, 25 Jan 2022 17:37:45 -0500	[thread overview]
Message-ID: <20220125223752.200211-6-andrey.grodzovsky@amd.com> (raw)
In-Reply-To: <20220125223752.200211-1-andrey.grodzovsky@amd.com>

No need to trigger another work queue inside the work queue.

v3:

Problem:
Extra reset caused by host side FLR notification
following guest side triggered reset.
Fix: Prevent queuing flr_work from mailbox irq if guest
is already executing a reset.

Suggested-by: Liu Shaoyun <Shaoyun.Liu@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 9 ++++++---
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 9 ++++++---
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 9 ++++++---
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 23b066bcffb2..b2b40e169342 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -276,7 +276,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	if (amdgpu_device_should_recover_gpu(adev)
 		&& (!amdgpu_device_has_job_running(adev) ||
 		adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
-		amdgpu_device_gpu_recover(adev, NULL);
+		amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -301,8 +301,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
 
 	switch (event) {
 		case IDH_FLR_NOTIFICATION:
-		if (amdgpu_sriov_runtime(adev))
-			schedule_work(&adev->virt.flr_work);
+		if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+			WARN_ONCE(!queue_work(adev->reset_domain.wq,
+					      &adev->virt.flr_work),
+				  "Failed to queue work! at %s",
+				  __func__);
 		break;
 		case IDH_QUERY_ALIVE:
 			xgpu_ai_mailbox_send_ack(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index a35e6d87e537..08411924150d 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -308,7 +308,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 		adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
 		adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
 		adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
-		amdgpu_device_gpu_recover(adev, NULL);
+		amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -336,8 +336,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
 
 	switch (event) {
 	case IDH_FLR_NOTIFICATION:
-		if (amdgpu_sriov_runtime(adev))
-			schedule_work(&adev->virt.flr_work);
+		if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+			WARN_ONCE(!queue_work(adev->reset_domain.wq,
+					      &adev->virt.flr_work),
+				  "Failed to queue work! at %s",
+				  __func__);
 		break;
 		/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
 		 * it byfar since that polling thread will handle it,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index aef9d059ae52..02290febfcf4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
 
 	/* Trigger recovery due to world switch failure */
 	if (amdgpu_device_should_recover_gpu(adev))
-		amdgpu_device_gpu_recover(adev, NULL);
+		amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -550,8 +550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
 		r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
 
 		/* only handle FLR_NOTIFY now */
-		if (!r)
-			schedule_work(&adev->virt.flr_work);
+		if (!r && !amdgpu_in_reset(adev))
+			WARN_ONCE(!queue_work(adev->reset_domain.wq,
+					      &adev->virt.flr_work),
+				  "Failed to queue work! at %s",
+				  __func__);
 	}
 
 	return 0;
-- 
2.25.1


WARNING: multiple messages have this Message-ID (diff)
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
To: <dri-devel@lists.freedesktop.org>, <amd-gfx@lists.freedesktop.org>
Cc: Andrey Grodzovsky <andrey.grodzovsky@amd.com>,
	horace.chen@amd.com, lijo.lazar@amd.com, jingwech@amd.com,
	daniel@ffwll.ch, christian.koenig@amd.com, Monk.Liu@amd.com,
	Liu Shaoyun <Shaoyun.Liu@amd.com>
Subject: [RFC v3 05/12] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.
Date: Tue, 25 Jan 2022 17:37:45 -0500	[thread overview]
Message-ID: <20220125223752.200211-6-andrey.grodzovsky@amd.com> (raw)
In-Reply-To: <20220125223752.200211-1-andrey.grodzovsky@amd.com>

No need to trigger another work queue inside the work queue.

v3:

Problem:
Extra reset caused by host side FLR notification
following guest side triggered reset.
Fix: Prevent queuing flr_work from mailbox irq if guest
is already executing a reset.

Suggested-by: Liu Shaoyun <Shaoyun.Liu@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 9 ++++++---
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 9 ++++++---
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 9 ++++++---
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 23b066bcffb2..b2b40e169342 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -276,7 +276,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	if (amdgpu_device_should_recover_gpu(adev)
 		&& (!amdgpu_device_has_job_running(adev) ||
 		adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
-		amdgpu_device_gpu_recover(adev, NULL);
+		amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -301,8 +301,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
 
 	switch (event) {
 		case IDH_FLR_NOTIFICATION:
-		if (amdgpu_sriov_runtime(adev))
-			schedule_work(&adev->virt.flr_work);
+		if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+			WARN_ONCE(!queue_work(adev->reset_domain.wq,
+					      &adev->virt.flr_work),
+				  "Failed to queue work! at %s",
+				  __func__);
 		break;
 		case IDH_QUERY_ALIVE:
 			xgpu_ai_mailbox_send_ack(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index a35e6d87e537..08411924150d 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -308,7 +308,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 		adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
 		adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
 		adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
-		amdgpu_device_gpu_recover(adev, NULL);
+		amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -336,8 +336,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
 
 	switch (event) {
 	case IDH_FLR_NOTIFICATION:
-		if (amdgpu_sriov_runtime(adev))
-			schedule_work(&adev->virt.flr_work);
+		if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+			WARN_ONCE(!queue_work(adev->reset_domain.wq,
+					      &adev->virt.flr_work),
+				  "Failed to queue work! at %s",
+				  __func__);
 		break;
 		/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
 		 * it byfar since that polling thread will handle it,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index aef9d059ae52..02290febfcf4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
 
 	/* Trigger recovery due to world switch failure */
 	if (amdgpu_device_should_recover_gpu(adev))
-		amdgpu_device_gpu_recover(adev, NULL);
+		amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -550,8 +550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
 		r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
 
 		/* only handle FLR_NOTIFY now */
-		if (!r)
-			schedule_work(&adev->virt.flr_work);
+		if (!r && !amdgpu_in_reset(adev))
+			WARN_ONCE(!queue_work(adev->reset_domain.wq,
+					      &adev->virt.flr_work),
+				  "Failed to queue work! at %s",
+				  __func__);
 	}
 
 	return 0;
-- 
2.25.1


  parent reply	other threads:[~2022-01-25 22:38 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-01-25 22:37 [RFC v3 00/12] Define and use reset domain for GPU recovery in amdgpu Andrey Grodzovsky
2022-01-25 22:37 ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 01/12] drm/amdgpu: Introduce reset domain Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-26 12:07   ` Christian König
2022-01-26 12:07     ` Christian König
2022-01-26 15:47     ` Andrey Grodzovsky
2022-01-26 15:47       ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 02/12] drm/amdgpu: Move scheduler init to after XGMI is ready Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 03/12] drm/amdgpu: Fix crash on modprobe Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 04/12] drm/amdgpu: Serialize non TDR gpu recovery with TDRs Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-25 22:37 ` Andrey Grodzovsky [this message]
2022-01-25 22:37   ` [RFC v3 05/12] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 06/12] drm/amdgpu: Drop hive->in_reset Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-02-08  6:33   ` Lazar, Lijo
2022-02-08  6:33     ` Lazar, Lijo
2022-02-08 15:39     ` Andrey Grodzovsky
2022-02-08 15:39       ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 07/12] drm/amdgpu: Drop concurrent GPU reset protection for device Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 08/12] drm/amdgpu: Rework reset domain to be refcounted Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-26 12:12   ` Christian König
2022-01-26 12:12     ` Christian König
2022-02-02 17:26   ` [RFC v4] " Andrey Grodzovsky
2022-02-02 17:26     ` Andrey Grodzovsky
2022-02-08 11:25     ` Lazar, Lijo
2022-02-08 11:25       ` Lazar, Lijo
2022-02-08 16:19       ` Andrey Grodzovsky
2022-02-08 16:19         ` Andrey Grodzovsky
2022-02-09  7:51         ` Christian König
2022-02-09  7:51           ` Christian König
2022-01-25 22:37 ` [RFC v3 09/12] drm/amdgpu: Move reset sem into reset_domain Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 10/12] drm/amdgpu: Move in_gpu_reset " Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-02-08 10:49   ` Lazar, Lijo
2022-02-08 10:49     ` Lazar, Lijo
2022-01-25 22:37 ` [RFC v3 11/12] drm/amdgpu: Rework amdgpu_device_lock_adev Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 12/12] Revert 'drm/amdgpu: annotate a false positive recursive locking' Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-28 19:36 ` [RFC v3 00/12] Define and use reset domain for GPU recovery in amdgpu Andrey Grodzovsky
2022-01-28 19:36   ` Andrey Grodzovsky
2022-02-02 18:57   ` Andrey Grodzovsky
2022-02-02 18:57     ` Andrey Grodzovsky
2022-02-09  6:06     ` JingWen Chen
2022-02-09  6:06       ` JingWen Chen
2022-02-09 16:08       ` Andrey Grodzovsky
2022-02-09 16:08         ` Andrey Grodzovsky

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220125223752.200211-6-andrey.grodzovsky@amd.com \
    --to=andrey.grodzovsky@amd.com \
    --cc=Monk.Liu@amd.com \
    --cc=Shaoyun.Liu@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=christian.koenig@amd.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=horace.chen@amd.com \
    --cc=jingwech@amd.com \
    --cc=lijo.lazar@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.