All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
To: <dri-devel@lists.freedesktop.org>, <amd-gfx@lists.freedesktop.org>
Cc: "Daniel Vetter" <daniel.vetter@ffwll.ch>,
	horace.chen@amd.com, lijo.lazar@amd.com, jingwech@amd.com,
	"Christian König" <ckoenig.leichtzumerken@gmail.com>,
	christian.koenig@amd.com, Monk.Liu@amd.com
Subject: [RFC v3 01/12] drm/amdgpu: Introduce reset domain
Date: Tue, 25 Jan 2022 17:37:41 -0500	[thread overview]
Message-ID: <20220125223752.200211-2-andrey.grodzovsky@amd.com> (raw)
In-Reply-To: <20220125223752.200211-1-andrey.grodzovsky@amd.com>

Defined a reset_domain struct such that
all the entities that go through reset
together will be serialized one against
another. Do it for both single device and
XGMI hive cases.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Suggested-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Suggested-by: Christian König <ckoenig.leichtzumerken@gmail.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  9 +++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 ++
 4 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9f017663ac50..b5ff76aae7e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -812,6 +812,11 @@ struct amd_powerplay {
 
 #define AMDGPU_RESET_MAGIC_NUM 64
 #define AMDGPU_MAX_DF_PERFMONS 4
+
+struct amdgpu_reset_domain {
+	struct workqueue_struct *wq;
+};
+
 struct amdgpu_device {
 	struct device			*dev;
 	struct pci_dev			*pdev;
@@ -1096,6 +1101,8 @@ struct amdgpu_device {
 
 	struct amdgpu_reset_control     *reset_cntl;
 	uint32_t                        ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+
+	struct amdgpu_reset_domain	reset_domain;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 90d22a376632..0f3e6c078f88 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2391,9 +2391,27 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 	if (r)
 		goto init_failed;
 
-	if (adev->gmc.xgmi.num_physical_nodes > 1)
+	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+		struct amdgpu_hive_info *hive;
+
 		amdgpu_xgmi_add_device(adev);
 
+		hive = amdgpu_get_xgmi_hive(adev);
+		if (!hive || !hive->reset_domain.wq) {
+			DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive->hive_id);
+			r = -EINVAL;
+			goto init_failed;
+		}
+
+		adev->reset_domain.wq = hive->reset_domain.wq;
+	} else {
+		adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0);
+		if (!adev->reset_domain.wq) {
+			r = -ENOMEM;
+			goto init_failed;
+		}
+	}
+
 	/* Don't init kfd if whole hive need to be reset during init */
 	if (!adev->gmc.xgmi.pending_reset)
 		amdgpu_amdkfd_device_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 567df2db23ac..a858e3457c5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -392,6 +392,14 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
 		goto pro_end;
 	}
 
+	hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0);
+	if (!hive->reset_domain.wq) {
+		dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n");
+		kfree(hive);
+		hive = NULL;
+		goto pro_end;
+	}
+
 	hive->hive_id = adev->gmc.xgmi.hive_id;
 	INIT_LIST_HEAD(&hive->device_list);
 	INIT_LIST_HEAD(&hive->node);
@@ -401,6 +409,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
 	task_barrier_init(&hive->tb);
 	hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
 	hive->hi_req_gpu = NULL;
+
 	/*
 	 * hive pstate on boot is high in vega20 so we have to go to low
 	 * pstate on after boot.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index d2189bf7d428..6121aaa292cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -42,6 +42,8 @@ struct amdgpu_hive_info {
 		AMDGPU_XGMI_PSTATE_MAX_VEGA20,
 		AMDGPU_XGMI_PSTATE_UNKNOWN
 	} pstate;
+
+	struct amdgpu_reset_domain reset_domain;
 };
 
 struct amdgpu_pcs_ras_field {
-- 
2.25.1


WARNING: multiple messages have this Message-ID (diff)
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
To: <dri-devel@lists.freedesktop.org>, <amd-gfx@lists.freedesktop.org>
Cc: "Andrey Grodzovsky" <andrey.grodzovsky@amd.com>,
	"Daniel Vetter" <daniel.vetter@ffwll.ch>,
	horace.chen@amd.com, lijo.lazar@amd.com, jingwech@amd.com,
	daniel@ffwll.ch,
	"Christian König" <ckoenig.leichtzumerken@gmail.com>,
	christian.koenig@amd.com, Monk.Liu@amd.com
Subject: [RFC v3 01/12] drm/amdgpu: Introduce reset domain
Date: Tue, 25 Jan 2022 17:37:41 -0500	[thread overview]
Message-ID: <20220125223752.200211-2-andrey.grodzovsky@amd.com> (raw)
In-Reply-To: <20220125223752.200211-1-andrey.grodzovsky@amd.com>

Defined a reset_domain struct such that
all the entities that go through reset
together will be serialized one against
another. Do it for both single device and
XGMI hive cases.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Suggested-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Suggested-by: Christian König <ckoenig.leichtzumerken@gmail.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  9 +++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 ++
 4 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9f017663ac50..b5ff76aae7e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -812,6 +812,11 @@ struct amd_powerplay {
 
 #define AMDGPU_RESET_MAGIC_NUM 64
 #define AMDGPU_MAX_DF_PERFMONS 4
+
+struct amdgpu_reset_domain {
+	struct workqueue_struct *wq;
+};
+
 struct amdgpu_device {
 	struct device			*dev;
 	struct pci_dev			*pdev;
@@ -1096,6 +1101,8 @@ struct amdgpu_device {
 
 	struct amdgpu_reset_control     *reset_cntl;
 	uint32_t                        ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+
+	struct amdgpu_reset_domain	reset_domain;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 90d22a376632..0f3e6c078f88 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2391,9 +2391,27 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 	if (r)
 		goto init_failed;
 
-	if (adev->gmc.xgmi.num_physical_nodes > 1)
+	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+		struct amdgpu_hive_info *hive;
+
 		amdgpu_xgmi_add_device(adev);
 
+		hive = amdgpu_get_xgmi_hive(adev);
+		if (!hive || !hive->reset_domain.wq) {
+			DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive->hive_id);
+			r = -EINVAL;
+			goto init_failed;
+		}
+
+		adev->reset_domain.wq = hive->reset_domain.wq;
+	} else {
+		adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0);
+		if (!adev->reset_domain.wq) {
+			r = -ENOMEM;
+			goto init_failed;
+		}
+	}
+
 	/* Don't init kfd if whole hive need to be reset during init */
 	if (!adev->gmc.xgmi.pending_reset)
 		amdgpu_amdkfd_device_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 567df2db23ac..a858e3457c5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -392,6 +392,14 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
 		goto pro_end;
 	}
 
+	hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0);
+	if (!hive->reset_domain.wq) {
+		dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n");
+		kfree(hive);
+		hive = NULL;
+		goto pro_end;
+	}
+
 	hive->hive_id = adev->gmc.xgmi.hive_id;
 	INIT_LIST_HEAD(&hive->device_list);
 	INIT_LIST_HEAD(&hive->node);
@@ -401,6 +409,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
 	task_barrier_init(&hive->tb);
 	hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
 	hive->hi_req_gpu = NULL;
+
 	/*
 	 * hive pstate on boot is high in vega20 so we have to go to low
 	 * pstate on after boot.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index d2189bf7d428..6121aaa292cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -42,6 +42,8 @@ struct amdgpu_hive_info {
 		AMDGPU_XGMI_PSTATE_MAX_VEGA20,
 		AMDGPU_XGMI_PSTATE_UNKNOWN
 	} pstate;
+
+	struct amdgpu_reset_domain reset_domain;
 };
 
 struct amdgpu_pcs_ras_field {
-- 
2.25.1


  reply	other threads:[~2022-01-25 22:38 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-01-25 22:37 [RFC v3 00/12] Define and use reset domain for GPU recovery in amdgpu Andrey Grodzovsky
2022-01-25 22:37 ` Andrey Grodzovsky
2022-01-25 22:37 ` Andrey Grodzovsky [this message]
2022-01-25 22:37   ` [RFC v3 01/12] drm/amdgpu: Introduce reset domain Andrey Grodzovsky
2022-01-26 12:07   ` Christian König
2022-01-26 12:07     ` Christian König
2022-01-26 15:47     ` Andrey Grodzovsky
2022-01-26 15:47       ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 02/12] drm/amdgpu: Move scheduler init to after XGMI is ready Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 03/12] drm/amdgpu: Fix crash on modprobe Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 04/12] drm/amdgpu: Serialize non TDR gpu recovery with TDRs Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 05/12] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 06/12] drm/amdgpu: Drop hive->in_reset Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-02-08  6:33   ` Lazar, Lijo
2022-02-08  6:33     ` Lazar, Lijo
2022-02-08 15:39     ` Andrey Grodzovsky
2022-02-08 15:39       ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 07/12] drm/amdgpu: Drop concurrent GPU reset protection for device Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 08/12] drm/amdgpu: Rework reset domain to be refcounted Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-26 12:12   ` Christian König
2022-01-26 12:12     ` Christian König
2022-02-02 17:26   ` [RFC v4] " Andrey Grodzovsky
2022-02-02 17:26     ` Andrey Grodzovsky
2022-02-08 11:25     ` Lazar, Lijo
2022-02-08 11:25       ` Lazar, Lijo
2022-02-08 16:19       ` Andrey Grodzovsky
2022-02-08 16:19         ` Andrey Grodzovsky
2022-02-09  7:51         ` Christian König
2022-02-09  7:51           ` Christian König
2022-01-25 22:37 ` [RFC v3 09/12] drm/amdgpu: Move reset sem into reset_domain Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 10/12] drm/amdgpu: Move in_gpu_reset " Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-02-08 10:49   ` Lazar, Lijo
2022-02-08 10:49     ` Lazar, Lijo
2022-01-25 22:37 ` [RFC v3 11/12] drm/amdgpu: Rework amdgpu_device_lock_adev Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-25 22:37 ` [RFC v3 12/12] Revert 'drm/amdgpu: annotate a false positive recursive locking' Andrey Grodzovsky
2022-01-25 22:37   ` Andrey Grodzovsky
2022-01-28 19:36 ` [RFC v3 00/12] Define and use reset domain for GPU recovery in amdgpu Andrey Grodzovsky
2022-01-28 19:36   ` Andrey Grodzovsky
2022-02-02 18:57   ` Andrey Grodzovsky
2022-02-02 18:57     ` Andrey Grodzovsky
2022-02-09  6:06     ` JingWen Chen
2022-02-09  6:06       ` JingWen Chen
2022-02-09 16:08       ` Andrey Grodzovsky
2022-02-09 16:08         ` Andrey Grodzovsky

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220125223752.200211-2-andrey.grodzovsky@amd.com \
    --to=andrey.grodzovsky@amd.com \
    --cc=Monk.Liu@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=christian.koenig@amd.com \
    --cc=ckoenig.leichtzumerken@gmail.com \
    --cc=daniel.vetter@ffwll.ch \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=horace.chen@amd.com \
    --cc=jingwech@amd.com \
    --cc=lijo.lazar@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.