From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
To: <dri-devel@lists.freedesktop.org>, <amd-gfx@lists.freedesktop.org>
Cc: Monk.Liu@amd.com, horace.chen@amd.com, christian.koenig@amd.com
Subject: [RFC 6/6] drm/amdgpu: Drop concurrent GPU reset protection for device
Date: Fri, 17 Dec 2021 17:27:45 -0500 [thread overview]
Message-ID: <20211217222745.881637-7-andrey.grodzovsky@amd.com> (raw)
In-Reply-To: <20211217222745.881637-1-andrey.grodzovsky@amd.com>
Since now all GPU resets are serialzied there is no need for this.
This patch also reverts 'drm/amdgpu: race issue when jobs on 2 ring timeout'
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 89 ++--------------------
1 file changed, 7 insertions(+), 82 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d2701e4d0622..311e0b9e1e4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4763,11 +4763,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
return r;
}
-static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
+static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
struct amdgpu_hive_info *hive)
{
- if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
- return false;
+ atomic_set(&adev->in_gpu_reset, 1);
if (hive) {
down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
@@ -4786,8 +4785,6 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
adev->mp1_state = PP_MP1_STATE_NONE;
break;
}
-
- return true;
}
static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
@@ -4798,46 +4795,6 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
up_write(&adev->reset_sem);
}
-/*
- * to lockup a list of amdgpu devices in a hive safely, if not a hive
- * with multiple nodes, it will be similar as amdgpu_device_lock_adev.
- *
- * unlock won't require roll back.
- */
-static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
-{
- struct amdgpu_device *tmp_adev = NULL;
-
- if (adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!hive) {
- dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
- return -ENODEV;
- }
- list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
- if (!amdgpu_device_lock_adev(tmp_adev, hive))
- goto roll_back;
- }
- } else if (!amdgpu_device_lock_adev(adev, hive))
- return -EAGAIN;
-
- return 0;
-roll_back:
- if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
- /*
- * if the lockup iteration break in the middle of a hive,
- * it may means there may has a race issue,
- * or a hive device locked up independently.
- * we may be in trouble and may not, so will try to roll back
- * the lock and give out a warnning.
- */
- dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
- list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
- amdgpu_device_unlock_adev(tmp_adev);
- }
- }
- return -EAGAIN;
-}
-
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
struct pci_dev *p = NULL;
@@ -5023,22 +4980,6 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
reset_context.hive = hive;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
- /*
- * lock the device before we try to operate the linked list
- * if didn't get the device lock, don't touch the linked list since
- * others may iterating it.
- */
- r = amdgpu_device_lock_hive_adev(adev, hive);
- if (r) {
- dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
- job ? job->base.id : -1);
-
- /* even we skipped this reset, still need to set the job to guilty */
- if (job && job->vm)
- drm_sched_increase_karma(&job->base);
- goto skip_recovery;
- }
-
/*
* Build list of devices to reset.
* In case we are in XGMI hive mode, resort the device list
@@ -5058,6 +4999,9 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+
+ amdgpu_device_lock_adev(tmp_adev, hive);
+
/*
* Try to put the audio codec into suspend state
* before gpu reset started.
@@ -5208,13 +5152,12 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
amdgpu_device_unlock_adev(tmp_adev);
}
-skip_recovery:
if (hive) {
mutex_unlock(&hive->hive_lock);
amdgpu_put_xgmi_hive(hive);
}
- if (r && r != -EAGAIN)
+ if (r)
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
return r;
}
@@ -5437,20 +5380,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
return 0;
}
-static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
-{
- int i;
-
- for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
- struct amdgpu_ring *ring = adev->rings[i];
-
- if (!ring || !ring->sched.thread)
- continue;
-
- cancel_delayed_work_sync(&ring->sched.work_tdr);
- }
-}
-
/**
* amdgpu_pci_error_detected - Called when a PCI error is detected.
* @pdev: PCI device struct
@@ -5481,14 +5410,10 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
/* Fatal error, prepare for slot reset */
case pci_channel_io_frozen:
/*
- * Cancel and wait for all TDRs in progress if failing to
- * set adev->in_gpu_reset in amdgpu_device_lock_adev
- *
* Locking adev->reset_sem will prevent any external access
* to GPU during PCI error recovery
*/
- while (!amdgpu_device_lock_adev(adev, NULL))
- amdgpu_cancel_all_tdr(adev);
+ amdgpu_device_lock_adev(adev, NULL);
/*
* Block any work scheduling as we do for regular GPU reset
--
2.25.1
next prev parent reply other threads:[~2021-12-17 22:28 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-12-17 22:27 [RFC 0/6] Define and use reset domain for GPU recovery in amdgpu Andrey Grodzovsky
2021-12-17 22:27 ` [RFC 1/6] drm/amdgpu: Init GPU reset single threaded wq Andrey Grodzovsky
2021-12-17 22:27 ` [RFC 2/6] drm/amdgpu: Move scheduler init to after XGMI is ready Andrey Grodzovsky
2021-12-20 7:16 ` Christian König
2021-12-20 21:51 ` Andrey Grodzovsky
2021-12-21 7:05 ` Christian König
2021-12-17 22:27 ` [RFC 3/6] drm/amdgpu: Fix crash on modprobe Andrey Grodzovsky
2021-12-20 7:17 ` Christian König
2021-12-20 19:22 ` Andrey Grodzovsky
2021-12-21 7:02 ` Christian König
2021-12-21 16:03 ` Andrey Grodzovsky
2021-12-22 7:50 ` Christian König
2021-12-17 22:27 ` [RFC 4/6] drm/amdgpu: Serialize non TDR gpu recovery with TDRs Andrey Grodzovsky
2021-12-20 7:20 ` Christian König
2021-12-20 22:17 ` Andrey Grodzovsky
2021-12-21 7:59 ` Christian König
2021-12-21 16:10 ` Andrey Grodzovsky
2021-12-17 22:27 ` [RFC 5/6] drm/amdgpu: Drop hive->in_reset Andrey Grodzovsky
2021-12-17 22:27 ` Andrey Grodzovsky [this message]
2021-12-20 7:25 ` [RFC 0/6] Define and use reset domain for GPU recovery in amdgpu Christian König
2021-12-20 9:43 ` Daniel Vetter
2021-12-20 17:06 ` Liu, Shaoyun
2021-12-20 19:11 ` Andrey Grodzovsky
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20211217222745.881637-7-andrey.grodzovsky@amd.com \
--to=andrey.grodzovsky@amd.com \
--cc=Monk.Liu@amd.com \
--cc=amd-gfx@lists.freedesktop.org \
--cc=christian.koenig@amd.com \
--cc=dri-devel@lists.freedesktop.org \
--cc=horace.chen@amd.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).