amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
@ 2022-05-04 16:18 Andrey Grodzovsky
  2022-05-05 10:09 ` Christian König
  0 siblings, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-04 16:18 UTC (permalink / raw)
  To: amd-gfx; +Cc: Bai Zoy, Andrey Grodzovsky, lijo.lazar, Christian.Koenig

Problem:
During hive reset caused by command timing out on a ring
extra resets are triggered by KFD, which is
unable to access registers on the resetting ASIC.

Fix: Rework GPU reset to use a list of pending reset jobs
such that the first reset job that actually resets the entire
reset domain will cancel all those pending redundant resets.

This is in line with what we already do for redundant TDRs
in scheduler code.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Tested-by: Bai Zoy <Zoy.Bai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 11 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 73 +++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  3 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  7 ++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  7 ++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  7 ++-
 8 files changed, 104 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4264abc5604d..99efd8317547 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -109,6 +109,7 @@
 #include "amdgpu_fdinfo.h"
 #include "amdgpu_mca.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
 
 #define MAX_GPU_INSTANCE		16
 
@@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
 	bool grbm_indexed;
 };
 
-enum amd_reset_method {
-	AMD_RESET_METHOD_NONE = -1,
-	AMD_RESET_METHOD_LEGACY = 0,
-	AMD_RESET_METHOD_MODE0,
-	AMD_RESET_METHOD_MODE1,
-	AMD_RESET_METHOD_MODE2,
-	AMD_RESET_METHOD_BACO,
-	AMD_RESET_METHOD_PCI,
-};
-
 struct amdgpu_video_codec_info {
 	u32 codec_type;
 	u32 max_width;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e582f1044c0f..7fa82269c30f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 	}
 
 	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
+
+	/* Drop all pending resets since we will reset now anyway */
+	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+					    reset_list);
+	amdgpu_reset_pending_list(tmp_adev->reset_domain);
+
 	/* Actual ASIC resets if needed.*/
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
@@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 }
 
 struct amdgpu_recover_work_struct {
-	struct work_struct base;
+	struct amdgpu_reset_work_struct base;
 	struct amdgpu_device *adev;
 	struct amdgpu_job *job;
 	int ret;
@@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
 
 static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
 {
-	struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base);
+	struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base.base.work);
 
 	recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
 }
@@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 {
 	struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
 
-	INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
+	INIT_DELAYED_WORK(&work.base.base, amdgpu_device_queue_gpu_recover_work);
+	INIT_LIST_HEAD(&work.base.node);
 
 	if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
 		return -EAGAIN;
 
-	flush_work(&work.base);
+	flush_delayed_work(&work.base.base);
+
+	amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &work.base);
 
 	return work.ret;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index c80af0889773..ffddd419c351 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -134,6 +134,9 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
 	atomic_set(&reset_domain->in_gpu_reset, 0);
 	init_rwsem(&reset_domain->sem);
 
+	INIT_LIST_HEAD(&reset_domain->pending_works);
+	mutex_init(&reset_domain->reset_lock);
+
 	return reset_domain;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 1949dbe28a86..863ec5720fc1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -24,7 +24,18 @@
 #ifndef __AMDGPU_RESET_H__
 #define __AMDGPU_RESET_H__
 
-#include "amdgpu.h"
+
+#include <linux/atomic.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/kref.h>
+#include <linux/rwsem.h>
+#include <linux/workqueue.h>
+
+struct amdgpu_device;
+struct amdgpu_job;
+struct amdgpu_hive_info;
+
 
 enum AMDGPU_RESET_FLAGS {
 
@@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
 	AMDGPU_SKIP_HW_RESET = 1,
 };
 
+
+enum amd_reset_method {
+	AMD_RESET_METHOD_NONE = -1,
+	AMD_RESET_METHOD_LEGACY = 0,
+	AMD_RESET_METHOD_MODE0,
+	AMD_RESET_METHOD_MODE1,
+	AMD_RESET_METHOD_MODE2,
+	AMD_RESET_METHOD_BACO,
+	AMD_RESET_METHOD_PCI,
+};
+
 struct amdgpu_reset_context {
 	enum amd_reset_method method;
 	struct amdgpu_device *reset_req_dev;
@@ -40,6 +62,8 @@ struct amdgpu_reset_context {
 	unsigned long flags;
 };
 
+struct amdgpu_reset_control;
+
 struct amdgpu_reset_handler {
 	enum amd_reset_method reset_method;
 	struct list_head handler_list;
@@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
 	XGMI_HIVE
 };
 
+
+struct amdgpu_reset_work_struct {
+	struct delayed_work base;
+	struct list_head node;
+};
+
 struct amdgpu_reset_domain {
 	struct kref refcount;
 	struct workqueue_struct *wq;
 	enum amdgpu_reset_domain_type type;
 	struct rw_semaphore sem;
 	atomic_t in_gpu_reset;
+
+	struct list_head pending_works;
+	struct mutex reset_lock;
 };
 
 
@@ -113,9 +146,43 @@ static inline void amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
 }
 
 static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain,
-						struct work_struct *work)
+						struct amdgpu_reset_work_struct *work)
 {
-	return queue_work(domain->wq, work);
+	mutex_lock(&domain->reset_lock);
+
+	if (!queue_delayed_work(domain->wq, &work->base, 0)) {
+		mutex_unlock(&domain->reset_lock);
+		return false;
+	}
+
+	list_add_tail(&work->node, &domain->pending_works);
+	mutex_unlock(&domain->reset_lock);
+
+	return true;
+}
+
+static inline void amdgpu_reset_domain_del_pendning_work(struct amdgpu_reset_domain *domain,
+				  struct amdgpu_reset_work_struct *work)
+{
+	mutex_lock(&domain->reset_lock);
+	list_del_init(&work->node);
+	mutex_unlock(&domain->reset_lock);
+}
+
+static inline void amdgpu_reset_pending_list(struct amdgpu_reset_domain *domain)
+{
+	struct amdgpu_reset_work_struct *entry, *tmp;
+
+	mutex_lock(&domain->reset_lock);
+	list_for_each_entry_safe(entry, tmp, &domain->pending_works, node) {
+
+		list_del_init(&entry->node);
+
+		/* Stop any other related pending resets */
+		cancel_delayed_work(&entry->base);
+	}
+
+	mutex_unlock(&domain->reset_lock);
 }
 
 void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 239f232f9c02..574e870d3064 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -25,6 +25,7 @@
 #define AMDGPU_VIRT_H
 
 #include "amdgv_sriovmsg.h"
+#include "amdgpu_reset.h"
 
 #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS  (1 << 0) /* vBIOS is sr-iov ready */
 #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov is enabled on this GPU */
@@ -230,7 +231,7 @@ struct amdgpu_virt {
 	uint32_t			reg_val_offs;
 	struct amdgpu_irq_src		ack_irq;
 	struct amdgpu_irq_src		rcv_irq;
-	struct work_struct		flr_work;
+	struct amdgpu_reset_work_struct flr_work;
 	struct amdgpu_mm_table		mm_table;
 	const struct amdgpu_virt_ops	*ops;
 	struct amdgpu_vf_error_buffer	vf_errors;
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index b81acf59870c..f3d1c2be9292 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
 
 static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 {
-	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
+	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work);
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
 
@@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
 		return r;
 	}
 
-	INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
+	INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_ai_mailbox_flr_work);
+	INIT_LIST_HEAD(&adev->virt.flr_work.node);
 
 	return 0;
 }
@@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
 {
 	amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
 	amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
+
+	amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work);
 }
 
 static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 22c10b97ea81..927b3d5bb1d0 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
 
 static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 {
-	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
+	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work);
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
 
@@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
 		return r;
 	}
 
-	INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
+	INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_nv_mailbox_flr_work);
+	INIT_LIST_HEAD(&adev->virt.flr_work.node);
 
 	return 0;
 }
@@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
 {
 	amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
 	amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
+
+	amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work);
 }
 
 const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 7b63d30b9b79..1d4ef5c70730 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
 
 static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
 {
-	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
+	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work);
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 
 	/* wait until RCV_MSG become 3 */
@@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
 		return r;
 	}
 
-	INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
+	INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_vi_mailbox_flr_work);
+	INIT_LIST_HEAD(&adev->virt.flr_work.node);
 
 	return 0;
 }
@@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
 {
 	amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
 	amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
+
+	amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work);
 }
 
 const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-04 16:18 [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive Andrey Grodzovsky
@ 2022-05-05 10:09 ` Christian König
  2022-05-05 13:15   ` Andrey Grodzovsky
  0 siblings, 1 reply; 40+ messages in thread
From: Christian König @ 2022-05-05 10:09 UTC (permalink / raw)
  To: Andrey Grodzovsky, amd-gfx; +Cc: Bai Zoy, lijo.lazar

Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky:
> Problem:
> During hive reset caused by command timing out on a ring
> extra resets are triggered by KFD, which is
> unable to access registers on the resetting ASIC.
>
> Fix: Rework GPU reset to use a list of pending reset jobs
> such that the first reset job that actually resets the entire
> reset domain will cancel all those pending redundant resets.
>
> This is in line with what we already do for redundant TDRs
> in scheduler code.

Mhm, why exactly do you need the extra linked list then?

Let's talk about that on our call today.

Regards,
Christian.

>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 11 +---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  3 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 73 +++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  3 +-
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  7 ++-
>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  7 ++-
>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  7 ++-
>   8 files changed, 104 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 4264abc5604d..99efd8317547 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -109,6 +109,7 @@
>   #include "amdgpu_fdinfo.h"
>   #include "amdgpu_mca.h"
>   #include "amdgpu_ras.h"
> +#include "amdgpu_reset.h"
>   
>   #define MAX_GPU_INSTANCE		16
>   
> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>   	bool grbm_indexed;
>   };
>   
> -enum amd_reset_method {
> -	AMD_RESET_METHOD_NONE = -1,
> -	AMD_RESET_METHOD_LEGACY = 0,
> -	AMD_RESET_METHOD_MODE0,
> -	AMD_RESET_METHOD_MODE1,
> -	AMD_RESET_METHOD_MODE2,
> -	AMD_RESET_METHOD_BACO,
> -	AMD_RESET_METHOD_PCI,
> -};
> -
>   struct amdgpu_video_codec_info {
>   	u32 codec_type;
>   	u32 max_width;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e582f1044c0f..7fa82269c30f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>   	}
>   
>   	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
> +
> +	/* Drop all pending resets since we will reset now anyway */
> +	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
> +					    reset_list);
> +	amdgpu_reset_pending_list(tmp_adev->reset_domain);
> +
>   	/* Actual ASIC resets if needed.*/
>   	/* Host driver will handle XGMI hive reset for SRIOV */
>   	if (amdgpu_sriov_vf(adev)) {
> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>   }
>   
>   struct amdgpu_recover_work_struct {
> -	struct work_struct base;
> +	struct amdgpu_reset_work_struct base;
>   	struct amdgpu_device *adev;
>   	struct amdgpu_job *job;
>   	int ret;
> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>   
>   static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
>   {
> -	struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base);
> +	struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base.base.work);
>   
>   	recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
>   }
> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   {
>   	struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
>   
> -	INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
> +	INIT_DELAYED_WORK(&work.base.base, amdgpu_device_queue_gpu_recover_work);
> +	INIT_LIST_HEAD(&work.base.node);
>   
>   	if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
>   		return -EAGAIN;
>   
> -	flush_work(&work.base);
> +	flush_delayed_work(&work.base.base);
> +
> +	amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &work.base);
>   
>   	return work.ret;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index c80af0889773..ffddd419c351 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>   	atomic_set(&reset_domain->in_gpu_reset, 0);
>   	init_rwsem(&reset_domain->sem);
>   
> +	INIT_LIST_HEAD(&reset_domain->pending_works);
> +	mutex_init(&reset_domain->reset_lock);
> +
>   	return reset_domain;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 1949dbe28a86..863ec5720fc1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -24,7 +24,18 @@
>   #ifndef __AMDGPU_RESET_H__
>   #define __AMDGPU_RESET_H__
>   
> -#include "amdgpu.h"
> +
> +#include <linux/atomic.h>
> +#include <linux/mutex.h>
> +#include <linux/list.h>
> +#include <linux/kref.h>
> +#include <linux/rwsem.h>
> +#include <linux/workqueue.h>
> +
> +struct amdgpu_device;
> +struct amdgpu_job;
> +struct amdgpu_hive_info;
> +
>   
>   enum AMDGPU_RESET_FLAGS {
>   
> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>   	AMDGPU_SKIP_HW_RESET = 1,
>   };
>   
> +
> +enum amd_reset_method {
> +	AMD_RESET_METHOD_NONE = -1,
> +	AMD_RESET_METHOD_LEGACY = 0,
> +	AMD_RESET_METHOD_MODE0,
> +	AMD_RESET_METHOD_MODE1,
> +	AMD_RESET_METHOD_MODE2,
> +	AMD_RESET_METHOD_BACO,
> +	AMD_RESET_METHOD_PCI,
> +};
> +
>   struct amdgpu_reset_context {
>   	enum amd_reset_method method;
>   	struct amdgpu_device *reset_req_dev;
> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>   	unsigned long flags;
>   };
>   
> +struct amdgpu_reset_control;
> +
>   struct amdgpu_reset_handler {
>   	enum amd_reset_method reset_method;
>   	struct list_head handler_list;
> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>   	XGMI_HIVE
>   };
>   
> +
> +struct amdgpu_reset_work_struct {
> +	struct delayed_work base;
> +	struct list_head node;
> +};
> +
>   struct amdgpu_reset_domain {
>   	struct kref refcount;
>   	struct workqueue_struct *wq;
>   	enum amdgpu_reset_domain_type type;
>   	struct rw_semaphore sem;
>   	atomic_t in_gpu_reset;
> +
> +	struct list_head pending_works;
> +	struct mutex reset_lock;
>   };
>   
>   
> @@ -113,9 +146,43 @@ static inline void amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>   }
>   
>   static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain,
> -						struct work_struct *work)
> +						struct amdgpu_reset_work_struct *work)
>   {
> -	return queue_work(domain->wq, work);
> +	mutex_lock(&domain->reset_lock);
> +
> +	if (!queue_delayed_work(domain->wq, &work->base, 0)) {
> +		mutex_unlock(&domain->reset_lock);
> +		return false;
> +	}
> +
> +	list_add_tail(&work->node, &domain->pending_works);
> +	mutex_unlock(&domain->reset_lock);
> +
> +	return true;
> +}
> +
> +static inline void amdgpu_reset_domain_del_pendning_work(struct amdgpu_reset_domain *domain,
> +				  struct amdgpu_reset_work_struct *work)
> +{
> +	mutex_lock(&domain->reset_lock);
> +	list_del_init(&work->node);
> +	mutex_unlock(&domain->reset_lock);
> +}
> +
> +static inline void amdgpu_reset_pending_list(struct amdgpu_reset_domain *domain)
> +{
> +	struct amdgpu_reset_work_struct *entry, *tmp;
> +
> +	mutex_lock(&domain->reset_lock);
> +	list_for_each_entry_safe(entry, tmp, &domain->pending_works, node) {
> +
> +		list_del_init(&entry->node);
> +
> +		/* Stop any other related pending resets */
> +		cancel_delayed_work(&entry->base);
> +	}
> +
> +	mutex_unlock(&domain->reset_lock);
>   }
>   
>   void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 239f232f9c02..574e870d3064 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -25,6 +25,7 @@
>   #define AMDGPU_VIRT_H
>   
>   #include "amdgv_sriovmsg.h"
> +#include "amdgpu_reset.h"
>   
>   #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS  (1 << 0) /* vBIOS is sr-iov ready */
>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov is enabled on this GPU */
> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>   	uint32_t			reg_val_offs;
>   	struct amdgpu_irq_src		ack_irq;
>   	struct amdgpu_irq_src		rcv_irq;
> -	struct work_struct		flr_work;
> +	struct amdgpu_reset_work_struct flr_work;
>   	struct amdgpu_mm_table		mm_table;
>   	const struct amdgpu_virt_ops	*ops;
>   	struct amdgpu_vf_error_buffer	vf_errors;
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index b81acf59870c..f3d1c2be9292 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>   
>   static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   {
> -	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
> +	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work);
>   	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
>   	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>   
> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>   		return r;
>   	}
>   
> -	INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
> +	INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_ai_mailbox_flr_work);
> +	INIT_LIST_HEAD(&adev->virt.flr_work.node);
>   
>   	return 0;
>   }
> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>   {
>   	amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>   	amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
> +
> +	amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work);
>   }
>   
>   static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index 22c10b97ea81..927b3d5bb1d0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>   
>   static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>   {
> -	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
> +	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work);
>   	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
>   	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>   
> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>   		return r;
>   	}
>   
> -	INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
> +	INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_nv_mailbox_flr_work);
> +	INIT_LIST_HEAD(&adev->virt.flr_work.node);
>   
>   	return 0;
>   }
> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>   {
>   	amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>   	amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
> +
> +	amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work);
>   }
>   
>   const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index 7b63d30b9b79..1d4ef5c70730 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>   
>   static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>   {
> -	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
> +	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work);
>   	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
>   
>   	/* wait until RCV_MSG become 3 */
> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
>   		return r;
>   	}
>   
> -	INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
> +	INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_vi_mailbox_flr_work);
> +	INIT_LIST_HEAD(&adev->virt.flr_work.node);
>   
>   	return 0;
>   }
> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>   {
>   	amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>   	amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
> +
> +	amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work);
>   }
>   
>   const struct amdgpu_virt_ops xgpu_vi_virt_ops = {


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-05 10:09 ` Christian König
@ 2022-05-05 13:15   ` Andrey Grodzovsky
  2022-05-05 13:23     ` Christian König
  0 siblings, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-05 13:15 UTC (permalink / raw)
  To: Christian König, amd-gfx; +Cc: Bai Zoy, lijo.lazar

On 2022-05-05 06:09, Christian König wrote:

> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky:
>> Problem:
>> During hive reset caused by command timing out on a ring
>> extra resets are triggered by KFD, which is
>> unable to access registers on the resetting ASIC.
>>
>> Fix: Rework GPU reset to use a list of pending reset jobs
>> such that the first reset job that actually resets the entire
>> reset domain will cancel all those pending redundant resets.
>>
>> This is in line with what we already do for redundant TDRs
>> in scheduler code.
>
> Mhm, why exactly do you need the extra linked list then?
>
> Let's talk about that on our call today.


Going to miss it as you know, and also this is the place to discuss 
technical questions anyway so -
It's needed because those other resets are not time out handlers that 
are governed by the scheduler
but rather external resets that are triggered by such clients as KFD, 
RAS and sysfs. Scheduler has no
knowledge of them (and should not have) but they are serialized into 
same wq as the TO handlers
from the scheduler. It just happens that TO triggered reset causes in 
turn another reset (from KFD in
this case) and we want to prevent this second reset from taking place 
just as we want to avoid multiple
TO resets to take place in scheduler code.

Andrey


>
> Regards,
> Christian.
>
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 11 +---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  3 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 73 +++++++++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  3 +-
>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  7 ++-
>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  7 ++-
>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  7 ++-
>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 4264abc5604d..99efd8317547 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -109,6 +109,7 @@
>>   #include "amdgpu_fdinfo.h"
>>   #include "amdgpu_mca.h"
>>   #include "amdgpu_ras.h"
>> +#include "amdgpu_reset.h"
>>     #define MAX_GPU_INSTANCE        16
>>   @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>>       bool grbm_indexed;
>>   };
>>   -enum amd_reset_method {
>> -    AMD_RESET_METHOD_NONE = -1,
>> -    AMD_RESET_METHOD_LEGACY = 0,
>> -    AMD_RESET_METHOD_MODE0,
>> -    AMD_RESET_METHOD_MODE1,
>> -    AMD_RESET_METHOD_MODE2,
>> -    AMD_RESET_METHOD_BACO,
>> -    AMD_RESET_METHOD_PCI,
>> -};
>> -
>>   struct amdgpu_video_codec_info {
>>       u32 codec_type;
>>       u32 max_width;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index e582f1044c0f..7fa82269c30f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct 
>> amdgpu_device *adev,
>>       }
>>         tmp_vram_lost_counter = 
>> atomic_read(&((adev)->vram_lost_counter));
>> +
>> +    /* Drop all pending resets since we will reset now anyway */
>> +    tmp_adev = list_first_entry(device_list_handle, struct 
>> amdgpu_device,
>> +                        reset_list);
>> +    amdgpu_reset_pending_list(tmp_adev->reset_domain);
>> +
>>       /* Actual ASIC resets if needed.*/
>>       /* Host driver will handle XGMI hive reset for SRIOV */
>>       if (amdgpu_sriov_vf(adev)) {
>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct 
>> amdgpu_device *adev,
>>   }
>>     struct amdgpu_recover_work_struct {
>> -    struct work_struct base;
>> +    struct amdgpu_reset_work_struct base;
>>       struct amdgpu_device *adev;
>>       struct amdgpu_job *job;
>>       int ret;
>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>     static void amdgpu_device_queue_gpu_recover_work(struct 
>> work_struct *work)
>>   {
>> -    struct amdgpu_recover_work_struct *recover_work = 
>> container_of(work, struct amdgpu_recover_work_struct, base);
>> +    struct amdgpu_recover_work_struct *recover_work = 
>> container_of(work, struct amdgpu_recover_work_struct, base.base.work);
>>         recover_work->ret = 
>> amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
>>   }
>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct 
>> amdgpu_device *adev,
>>   {
>>       struct amdgpu_recover_work_struct work = {.adev = adev, .job = 
>> job};
>>   -    INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
>> +    INIT_DELAYED_WORK(&work.base.base, 
>> amdgpu_device_queue_gpu_recover_work);
>> +    INIT_LIST_HEAD(&work.base.node);
>>         if (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>> &work.base))
>>           return -EAGAIN;
>>   -    flush_work(&work.base);
>> +    flush_delayed_work(&work.base.base);
>> +
>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &work.base);
>>         return work.ret;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> index c80af0889773..ffddd419c351 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>       atomic_set(&reset_domain->in_gpu_reset, 0);
>>       init_rwsem(&reset_domain->sem);
>>   +    INIT_LIST_HEAD(&reset_domain->pending_works);
>> +    mutex_init(&reset_domain->reset_lock);
>> +
>>       return reset_domain;
>>   }
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> index 1949dbe28a86..863ec5720fc1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> @@ -24,7 +24,18 @@
>>   #ifndef __AMDGPU_RESET_H__
>>   #define __AMDGPU_RESET_H__
>>   -#include "amdgpu.h"
>> +
>> +#include <linux/atomic.h>
>> +#include <linux/mutex.h>
>> +#include <linux/list.h>
>> +#include <linux/kref.h>
>> +#include <linux/rwsem.h>
>> +#include <linux/workqueue.h>
>> +
>> +struct amdgpu_device;
>> +struct amdgpu_job;
>> +struct amdgpu_hive_info;
>> +
>>     enum AMDGPU_RESET_FLAGS {
>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>       AMDGPU_SKIP_HW_RESET = 1,
>>   };
>>   +
>> +enum amd_reset_method {
>> +    AMD_RESET_METHOD_NONE = -1,
>> +    AMD_RESET_METHOD_LEGACY = 0,
>> +    AMD_RESET_METHOD_MODE0,
>> +    AMD_RESET_METHOD_MODE1,
>> +    AMD_RESET_METHOD_MODE2,
>> +    AMD_RESET_METHOD_BACO,
>> +    AMD_RESET_METHOD_PCI,
>> +};
>> +
>>   struct amdgpu_reset_context {
>>       enum amd_reset_method method;
>>       struct amdgpu_device *reset_req_dev;
>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>       unsigned long flags;
>>   };
>>   +struct amdgpu_reset_control;
>> +
>>   struct amdgpu_reset_handler {
>>       enum amd_reset_method reset_method;
>>       struct list_head handler_list;
>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>       XGMI_HIVE
>>   };
>>   +
>> +struct amdgpu_reset_work_struct {
>> +    struct delayed_work base;
>> +    struct list_head node;
>> +};
>> +
>>   struct amdgpu_reset_domain {
>>       struct kref refcount;
>>       struct workqueue_struct *wq;
>>       enum amdgpu_reset_domain_type type;
>>       struct rw_semaphore sem;
>>       atomic_t in_gpu_reset;
>> +
>> +    struct list_head pending_works;
>> +    struct mutex reset_lock;
>>   };
>>     @@ -113,9 +146,43 @@ static inline void 
>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>>   }
>>     static inline bool amdgpu_reset_domain_schedule(struct 
>> amdgpu_reset_domain *domain,
>> -                        struct work_struct *work)
>> +                        struct amdgpu_reset_work_struct *work)
>>   {
>> -    return queue_work(domain->wq, work);
>> +    mutex_lock(&domain->reset_lock);
>> +
>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>> +        mutex_unlock(&domain->reset_lock);
>> +        return false;
>> +    }
>> +
>> +    list_add_tail(&work->node, &domain->pending_works);
>> +    mutex_unlock(&domain->reset_lock);
>> +
>> +    return true;
>> +}
>> +
>> +static inline void amdgpu_reset_domain_del_pendning_work(struct 
>> amdgpu_reset_domain *domain,
>> +                  struct amdgpu_reset_work_struct *work)
>> +{
>> +    mutex_lock(&domain->reset_lock);
>> +    list_del_init(&work->node);
>> +    mutex_unlock(&domain->reset_lock);
>> +}
>> +
>> +static inline void amdgpu_reset_pending_list(struct 
>> amdgpu_reset_domain *domain)
>> +{
>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>> +
>> +    mutex_lock(&domain->reset_lock);
>> +    list_for_each_entry_safe(entry, tmp, &domain->pending_works, 
>> node) {
>> +
>> +        list_del_init(&entry->node);
>> +
>> +        /* Stop any other related pending resets */
>> +        cancel_delayed_work(&entry->base);
>> +    }
>> +
>> +    mutex_unlock(&domain->reset_lock);
>>   }
>>     void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain 
>> *reset_domain);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index 239f232f9c02..574e870d3064 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -25,6 +25,7 @@
>>   #define AMDGPU_VIRT_H
>>     #include "amdgv_sriovmsg.h"
>> +#include "amdgpu_reset.h"
>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS  (1 << 0) /* vBIOS is 
>> sr-iov ready */
>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov is 
>> enabled on this GPU */
>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>       uint32_t            reg_val_offs;
>>       struct amdgpu_irq_src        ack_irq;
>>       struct amdgpu_irq_src        rcv_irq;
>> -    struct work_struct        flr_work;
>> +    struct amdgpu_reset_work_struct flr_work;
>>       struct amdgpu_mm_table        mm_table;
>>       const struct amdgpu_virt_ops    *ops;
>>       struct amdgpu_vf_error_buffer    vf_errors;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> index b81acf59870c..f3d1c2be9292 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> @@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct 
>> amdgpu_device *adev,
>>     static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>>   {
>> -    struct amdgpu_virt *virt = container_of(work, struct 
>> amdgpu_virt, flr_work);
>> +    struct amdgpu_virt *virt = container_of(work, struct 
>> amdgpu_virt, flr_work.base.work);
>>       struct amdgpu_device *adev = container_of(virt, struct 
>> amdgpu_device, virt);
>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>> amdgpu_device *adev)
>>           return r;
>>       }
>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>> xgpu_ai_mailbox_flr_work);
>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>         return 0;
>>   }
>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct amdgpu_device 
>> *adev)
>>   {
>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>> +
>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>> &adev->virt.flr_work);
>>   }
>>     static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> index 22c10b97ea81..927b3d5bb1d0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> @@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct 
>> amdgpu_device *adev,
>>     static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>>   {
>> -    struct amdgpu_virt *virt = container_of(work, struct 
>> amdgpu_virt, flr_work);
>> +    struct amdgpu_virt *virt = container_of(work, struct 
>> amdgpu_virt, flr_work.base.work);
>>       struct amdgpu_device *adev = container_of(virt, struct 
>> amdgpu_device, virt);
>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>> amdgpu_device *adev)
>>           return r;
>>       }
>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>> xgpu_nv_mailbox_flr_work);
>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>         return 0;
>>   }
>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device 
>> *adev)
>>   {
>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>> +
>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>> &adev->virt.flr_work);
>>   }
>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>> index 7b63d30b9b79..1d4ef5c70730 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>> @@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct 
>> amdgpu_device *adev,
>>     static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>>   {
>> -    struct amdgpu_virt *virt = container_of(work, struct 
>> amdgpu_virt, flr_work);
>> +    struct amdgpu_virt *virt = container_of(work, struct 
>> amdgpu_virt, flr_work.base.work);
>>       struct amdgpu_device *adev = container_of(virt, struct 
>> amdgpu_device, virt);
>>         /* wait until RCV_MSG become 3 */
>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct amdgpu_device 
>> *adev)
>>           return r;
>>       }
>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>> xgpu_vi_mailbox_flr_work);
>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>         return 0;
>>   }
>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct amdgpu_device 
>> *adev)
>>   {
>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>> +
>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>> &adev->virt.flr_work);
>>   }
>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-05 13:15   ` Andrey Grodzovsky
@ 2022-05-05 13:23     ` Christian König
  2022-05-05 13:54       ` Andrey Grodzovsky
  0 siblings, 1 reply; 40+ messages in thread
From: Christian König @ 2022-05-05 13:23 UTC (permalink / raw)
  To: Andrey Grodzovsky, amd-gfx; +Cc: Bai Zoy, lijo.lazar

Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky:
> On 2022-05-05 06:09, Christian König wrote:
>
>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky:
>>> Problem:
>>> During hive reset caused by command timing out on a ring
>>> extra resets are triggered by KFD, which is
>>> unable to access registers on the resetting ASIC.
>>>
>>> Fix: Rework GPU reset to use a list of pending reset jobs
>>> such that the first reset job that actually resets the entire
>>> reset domain will cancel all those pending redundant resets.
>>>
>>> This is in line with what we already do for redundant TDRs
>>> in scheduler code.
>>
>> Mhm, why exactly do you need the extra linked list then?
>>
>> Let's talk about that on our call today.
>
>
> Going to miss it as you know, and also this is the place to discuss 
> technical questions anyway so -

Good point.

> It's needed because those other resets are not time out handlers that 
> are governed by the scheduler
> but rather external resets that are triggered by such clients as KFD, 
> RAS and sysfs. Scheduler has no
> knowledge of them (and should not have) but they are serialized into 
> same wq as the TO handlers
> from the scheduler. It just happens that TO triggered reset causes in 
> turn another reset (from KFD in
> this case) and we want to prevent this second reset from taking place 
> just as we want to avoid multiple
> TO resets to take place in scheduler code.

Yeah, but why do you need multiple workers?

You have a single worker for the GPU reset not triggered by the 
scheduler in you adev and cancel that at the end of the reset procedure.

If anybody thinks it needs to trigger another reset while in reset 
(which is actually a small design bug separately) the reset will just be 
canceled in the same way we cancel the scheduler resets.

Christian.

>
> Andrey
>
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 11 +---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  3 +
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 73 
>>> +++++++++++++++++++++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  3 +-
>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  7 ++-
>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  7 ++-
>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  7 ++-
>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 4264abc5604d..99efd8317547 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -109,6 +109,7 @@
>>>   #include "amdgpu_fdinfo.h"
>>>   #include "amdgpu_mca.h"
>>>   #include "amdgpu_ras.h"
>>> +#include "amdgpu_reset.h"
>>>     #define MAX_GPU_INSTANCE        16
>>>   @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>>>       bool grbm_indexed;
>>>   };
>>>   -enum amd_reset_method {
>>> -    AMD_RESET_METHOD_NONE = -1,
>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>> -    AMD_RESET_METHOD_MODE0,
>>> -    AMD_RESET_METHOD_MODE1,
>>> -    AMD_RESET_METHOD_MODE2,
>>> -    AMD_RESET_METHOD_BACO,
>>> -    AMD_RESET_METHOD_PCI,
>>> -};
>>> -
>>>   struct amdgpu_video_codec_info {
>>>       u32 codec_type;
>>>       u32 max_width;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index e582f1044c0f..7fa82269c30f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct 
>>> amdgpu_device *adev,
>>>       }
>>>         tmp_vram_lost_counter = 
>>> atomic_read(&((adev)->vram_lost_counter));
>>> +
>>> +    /* Drop all pending resets since we will reset now anyway */
>>> +    tmp_adev = list_first_entry(device_list_handle, struct 
>>> amdgpu_device,
>>> +                        reset_list);
>>> +    amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>> +
>>>       /* Actual ASIC resets if needed.*/
>>>       /* Host driver will handle XGMI hive reset for SRIOV */
>>>       if (amdgpu_sriov_vf(adev)) {
>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct 
>>> amdgpu_device *adev,
>>>   }
>>>     struct amdgpu_recover_work_struct {
>>> -    struct work_struct base;
>>> +    struct amdgpu_reset_work_struct base;
>>>       struct amdgpu_device *adev;
>>>       struct amdgpu_job *job;
>>>       int ret;
>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>     static void amdgpu_device_queue_gpu_recover_work(struct 
>>> work_struct *work)
>>>   {
>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>> container_of(work, struct amdgpu_recover_work_struct, base);
>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>> container_of(work, struct amdgpu_recover_work_struct, base.base.work);
>>>         recover_work->ret = 
>>> amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
>>>   }
>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct 
>>> amdgpu_device *adev,
>>>   {
>>>       struct amdgpu_recover_work_struct work = {.adev = adev, .job = 
>>> job};
>>>   -    INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
>>> +    INIT_DELAYED_WORK(&work.base.base, 
>>> amdgpu_device_queue_gpu_recover_work);
>>> +    INIT_LIST_HEAD(&work.base.node);
>>>         if (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>> &work.base))
>>>           return -EAGAIN;
>>>   -    flush_work(&work.base);
>>> +    flush_delayed_work(&work.base.base);
>>> +
>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>> &work.base);
>>>         return work.ret;
>>>   }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> index c80af0889773..ffddd419c351 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>       atomic_set(&reset_domain->in_gpu_reset, 0);
>>>       init_rwsem(&reset_domain->sem);
>>>   +    INIT_LIST_HEAD(&reset_domain->pending_works);
>>> +    mutex_init(&reset_domain->reset_lock);
>>> +
>>>       return reset_domain;
>>>   }
>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>> index 1949dbe28a86..863ec5720fc1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>> @@ -24,7 +24,18 @@
>>>   #ifndef __AMDGPU_RESET_H__
>>>   #define __AMDGPU_RESET_H__
>>>   -#include "amdgpu.h"
>>> +
>>> +#include <linux/atomic.h>
>>> +#include <linux/mutex.h>
>>> +#include <linux/list.h>
>>> +#include <linux/kref.h>
>>> +#include <linux/rwsem.h>
>>> +#include <linux/workqueue.h>
>>> +
>>> +struct amdgpu_device;
>>> +struct amdgpu_job;
>>> +struct amdgpu_hive_info;
>>> +
>>>     enum AMDGPU_RESET_FLAGS {
>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>   };
>>>   +
>>> +enum amd_reset_method {
>>> +    AMD_RESET_METHOD_NONE = -1,
>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>> +    AMD_RESET_METHOD_MODE0,
>>> +    AMD_RESET_METHOD_MODE1,
>>> +    AMD_RESET_METHOD_MODE2,
>>> +    AMD_RESET_METHOD_BACO,
>>> +    AMD_RESET_METHOD_PCI,
>>> +};
>>> +
>>>   struct amdgpu_reset_context {
>>>       enum amd_reset_method method;
>>>       struct amdgpu_device *reset_req_dev;
>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>       unsigned long flags;
>>>   };
>>>   +struct amdgpu_reset_control;
>>> +
>>>   struct amdgpu_reset_handler {
>>>       enum amd_reset_method reset_method;
>>>       struct list_head handler_list;
>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>       XGMI_HIVE
>>>   };
>>>   +
>>> +struct amdgpu_reset_work_struct {
>>> +    struct delayed_work base;
>>> +    struct list_head node;
>>> +};
>>> +
>>>   struct amdgpu_reset_domain {
>>>       struct kref refcount;
>>>       struct workqueue_struct *wq;
>>>       enum amdgpu_reset_domain_type type;
>>>       struct rw_semaphore sem;
>>>       atomic_t in_gpu_reset;
>>> +
>>> +    struct list_head pending_works;
>>> +    struct mutex reset_lock;
>>>   };
>>>     @@ -113,9 +146,43 @@ static inline void 
>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>>>   }
>>>     static inline bool amdgpu_reset_domain_schedule(struct 
>>> amdgpu_reset_domain *domain,
>>> -                        struct work_struct *work)
>>> +                        struct amdgpu_reset_work_struct *work)
>>>   {
>>> -    return queue_work(domain->wq, work);
>>> +    mutex_lock(&domain->reset_lock);
>>> +
>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>> +        mutex_unlock(&domain->reset_lock);
>>> +        return false;
>>> +    }
>>> +
>>> +    list_add_tail(&work->node, &domain->pending_works);
>>> +    mutex_unlock(&domain->reset_lock);
>>> +
>>> +    return true;
>>> +}
>>> +
>>> +static inline void amdgpu_reset_domain_del_pendning_work(struct 
>>> amdgpu_reset_domain *domain,
>>> +                  struct amdgpu_reset_work_struct *work)
>>> +{
>>> +    mutex_lock(&domain->reset_lock);
>>> +    list_del_init(&work->node);
>>> +    mutex_unlock(&domain->reset_lock);
>>> +}
>>> +
>>> +static inline void amdgpu_reset_pending_list(struct 
>>> amdgpu_reset_domain *domain)
>>> +{
>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>> +
>>> +    mutex_lock(&domain->reset_lock);
>>> +    list_for_each_entry_safe(entry, tmp, &domain->pending_works, 
>>> node) {
>>> +
>>> +        list_del_init(&entry->node);
>>> +
>>> +        /* Stop any other related pending resets */
>>> +        cancel_delayed_work(&entry->base);
>>> +    }
>>> +
>>> +    mutex_unlock(&domain->reset_lock);
>>>   }
>>>     void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain 
>>> *reset_domain);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> index 239f232f9c02..574e870d3064 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> @@ -25,6 +25,7 @@
>>>   #define AMDGPU_VIRT_H
>>>     #include "amdgv_sriovmsg.h"
>>> +#include "amdgpu_reset.h"
>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS  (1 << 0) /* vBIOS is 
>>> sr-iov ready */
>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov is 
>>> enabled on this GPU */
>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>       uint32_t            reg_val_offs;
>>>       struct amdgpu_irq_src        ack_irq;
>>>       struct amdgpu_irq_src        rcv_irq;
>>> -    struct work_struct        flr_work;
>>> +    struct amdgpu_reset_work_struct flr_work;
>>>       struct amdgpu_mm_table        mm_table;
>>>       const struct amdgpu_virt_ops    *ops;
>>>       struct amdgpu_vf_error_buffer    vf_errors;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>> index b81acf59870c..f3d1c2be9292 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>> @@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct 
>>> amdgpu_device *adev,
>>>     static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>>>   {
>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>> amdgpu_virt, flr_work);
>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>> amdgpu_virt, flr_work.base.work);
>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>> amdgpu_device, virt);
>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>> amdgpu_device *adev)
>>>           return r;
>>>       }
>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
>>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>> xgpu_ai_mailbox_flr_work);
>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>         return 0;
>>>   }
>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>> amdgpu_device *adev)
>>>   {
>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>> +
>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>> &adev->virt.flr_work);
>>>   }
>>>     static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>> @@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct 
>>> amdgpu_device *adev,
>>>     static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>>>   {
>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>> amdgpu_virt, flr_work);
>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>> amdgpu_virt, flr_work.base.work);
>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>> amdgpu_device, virt);
>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>> amdgpu_device *adev)
>>>           return r;
>>>       }
>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
>>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>> xgpu_nv_mailbox_flr_work);
>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>         return 0;
>>>   }
>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>> amdgpu_device *adev)
>>>   {
>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>> +
>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>> &adev->virt.flr_work);
>>>   }
>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>> @@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct 
>>> amdgpu_device *adev,
>>>     static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>>>   {
>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>> amdgpu_virt, flr_work);
>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>> amdgpu_virt, flr_work.base.work);
>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>> amdgpu_device, virt);
>>>         /* wait until RCV_MSG become 3 */
>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct amdgpu_device 
>>> *adev)
>>>           return r;
>>>       }
>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
>>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>> xgpu_vi_mailbox_flr_work);
>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>         return 0;
>>>   }
>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>> amdgpu_device *adev)
>>>   {
>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>> +
>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>> &adev->virt.flr_work);
>>>   }
>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-05 13:23     ` Christian König
@ 2022-05-05 13:54       ` Andrey Grodzovsky
  2022-05-05 15:06         ` Christian König
  0 siblings, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-05 13:54 UTC (permalink / raw)
  To: Christian König, amd-gfx; +Cc: Bai Zoy, lijo.lazar


On 2022-05-05 09:23, Christian König wrote:
> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky:
>> On 2022-05-05 06:09, Christian König wrote:
>>
>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky:
>>>> Problem:
>>>> During a hive reset caused by a command timing out on a ring,
>>>> extra resets are triggered by KFD, which is
>>>> unable to access registers on the resetting ASIC.
>>>>
>>>> Fix: Rework GPU reset to use a list of pending reset jobs
>>>> such that the first reset job that actually resets the entire
>>>> reset domain will cancel all those pending redundant resets.
>>>>
>>>> This is in line with what we already do for redundant TDRs
>>>> in scheduler code.
>>>
>>> Mhm, why exactly do you need the extra linked list then?
>>>
>>> Let's talk about that on our call today.
>>
>>
>> Going to miss it as you know, and also this is the place to discuss 
>> technical questions anyway so -
>
> Good point.
>
>> It's needed because those other resets are not time out handlers that 
>> are governed by the scheduler
>> but rather external resets that are triggered by such clients as KFD, 
>> RAS and sysfs. Scheduler has no
>> knowledge of them (and should not have) but they are serialized into 
>> same wq as the TO handlers
>> from the scheduler. It just happens that TO triggered reset causes in 
>> turn another reset (from KFD in
>> this case) and we want to prevent this second reset from taking place 
>> just as we want to avoid multiple
>> TO resets to take place in scheduler code.
>
> Yeah, but why do you need multiple workers?
>
> You have a single worker for the GPU reset not triggered by the 
> scheduler in your adev and cancel that at the end of the reset procedure.
>
> If anybody thinks it needs to trigger another reset while in reset 
> (which is actually a small design bug separately), the reset will just 
> be canceled in the same way we cancel the scheduler resets.
>
> Christian.


Had this in mind at first, but then I realized that each client (RAS, KFD 
and sysfs) will want to fill its own data for the work (see 
amdgpu_device_gpu_recover) - for an XGMI hive each will want to set its own 
adev (which is fine if you set a work per adev as you suggest), but each 
client might also want to set its own bad job value (they all put NULL 
there now, but perhaps not in the future), and here you might have a collision.
Also, in general it seems to me a cleaner approach where this logic (the 
work items) is held and handled in reset_domain and is not split across 
each adev or any other entity. In the future we might even want to move 
the scheduler handling into the reset domain, since the reset domain is 
supposed to be a generic thing and not only for AMD.

Andrey


>
>>
>> Andrey
>>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 11 +---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  3 +
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 73 
>>>> +++++++++++++++++++++-
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  3 +-
>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  7 ++-
>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  7 ++-
>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  7 ++-
>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index 4264abc5604d..99efd8317547 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -109,6 +109,7 @@
>>>>   #include "amdgpu_fdinfo.h"
>>>>   #include "amdgpu_mca.h"
>>>>   #include "amdgpu_ras.h"
>>>> +#include "amdgpu_reset.h"
>>>>     #define MAX_GPU_INSTANCE        16
>>>>   @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>>>>       bool grbm_indexed;
>>>>   };
>>>>   -enum amd_reset_method {
>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>> -    AMD_RESET_METHOD_MODE0,
>>>> -    AMD_RESET_METHOD_MODE1,
>>>> -    AMD_RESET_METHOD_MODE2,
>>>> -    AMD_RESET_METHOD_BACO,
>>>> -    AMD_RESET_METHOD_PCI,
>>>> -};
>>>> -
>>>>   struct amdgpu_video_codec_info {
>>>>       u32 codec_type;
>>>>       u32 max_width;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index e582f1044c0f..7fa82269c30f 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>> amdgpu_device *adev,
>>>>       }
>>>>         tmp_vram_lost_counter = 
>>>> atomic_read(&((adev)->vram_lost_counter));
>>>> +
>>>> +    /* Drop all pending resets since we will reset now anyway */
>>>> +    tmp_adev = list_first_entry(device_list_handle, struct 
>>>> amdgpu_device,
>>>> +                        reset_list);
>>>> +    amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>> +
>>>>       /* Actual ASIC resets if needed.*/
>>>>       /* Host driver will handle XGMI hive reset for SRIOV */
>>>>       if (amdgpu_sriov_vf(adev)) {
>>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>> amdgpu_device *adev,
>>>>   }
>>>>     struct amdgpu_recover_work_struct {
>>>> -    struct work_struct base;
>>>> +    struct amdgpu_reset_work_struct base;
>>>>       struct amdgpu_device *adev;
>>>>       struct amdgpu_job *job;
>>>>       int ret;
>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>     static void amdgpu_device_queue_gpu_recover_work(struct 
>>>> work_struct *work)
>>>>   {
>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>> container_of(work, struct amdgpu_recover_work_struct, base);
>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>> container_of(work, struct amdgpu_recover_work_struct, base.base.work);
>>>>         recover_work->ret = 
>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
>>>>   }
>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct 
>>>> amdgpu_device *adev,
>>>>   {
>>>>       struct amdgpu_recover_work_struct work = {.adev = adev, .job 
>>>> = job};
>>>>   -    INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
>>>> +    INIT_DELAYED_WORK(&work.base.base, 
>>>> amdgpu_device_queue_gpu_recover_work);
>>>> +    INIT_LIST_HEAD(&work.base.node);
>>>>         if (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>> &work.base))
>>>>           return -EAGAIN;
>>>>   -    flush_work(&work.base);
>>>> +    flush_delayed_work(&work.base.base);
>>>> +
>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>> &work.base);
>>>>         return work.ret;
>>>>   }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>> index c80af0889773..ffddd419c351 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>       atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>       init_rwsem(&reset_domain->sem);
>>>>   +    INIT_LIST_HEAD(&reset_domain->pending_works);
>>>> +    mutex_init(&reset_domain->reset_lock);
>>>> +
>>>>       return reset_domain;
>>>>   }
>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>> @@ -24,7 +24,18 @@
>>>>   #ifndef __AMDGPU_RESET_H__
>>>>   #define __AMDGPU_RESET_H__
>>>>   -#include "amdgpu.h"
>>>> +
>>>> +#include <linux/atomic.h>
>>>> +#include <linux/mutex.h>
>>>> +#include <linux/list.h>
>>>> +#include <linux/kref.h>
>>>> +#include <linux/rwsem.h>
>>>> +#include <linux/workqueue.h>
>>>> +
>>>> +struct amdgpu_device;
>>>> +struct amdgpu_job;
>>>> +struct amdgpu_hive_info;
>>>> +
>>>>     enum AMDGPU_RESET_FLAGS {
>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>   };
>>>>   +
>>>> +enum amd_reset_method {
>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>> +    AMD_RESET_METHOD_MODE0,
>>>> +    AMD_RESET_METHOD_MODE1,
>>>> +    AMD_RESET_METHOD_MODE2,
>>>> +    AMD_RESET_METHOD_BACO,
>>>> +    AMD_RESET_METHOD_PCI,
>>>> +};
>>>> +
>>>>   struct amdgpu_reset_context {
>>>>       enum amd_reset_method method;
>>>>       struct amdgpu_device *reset_req_dev;
>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>       unsigned long flags;
>>>>   };
>>>>   +struct amdgpu_reset_control;
>>>> +
>>>>   struct amdgpu_reset_handler {
>>>>       enum amd_reset_method reset_method;
>>>>       struct list_head handler_list;
>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>       XGMI_HIVE
>>>>   };
>>>>   +
>>>> +struct amdgpu_reset_work_struct {
>>>> +    struct delayed_work base;
>>>> +    struct list_head node;
>>>> +};
>>>> +
>>>>   struct amdgpu_reset_domain {
>>>>       struct kref refcount;
>>>>       struct workqueue_struct *wq;
>>>>       enum amdgpu_reset_domain_type type;
>>>>       struct rw_semaphore sem;
>>>>       atomic_t in_gpu_reset;
>>>> +
>>>> +    struct list_head pending_works;
>>>> +    struct mutex reset_lock;
>>>>   };
>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>>>>   }
>>>>     static inline bool amdgpu_reset_domain_schedule(struct 
>>>> amdgpu_reset_domain *domain,
>>>> -                        struct work_struct *work)
>>>> +                        struct amdgpu_reset_work_struct *work)
>>>>   {
>>>> -    return queue_work(domain->wq, work);
>>>> +    mutex_lock(&domain->reset_lock);
>>>> +
>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>>> +        mutex_unlock(&domain->reset_lock);
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    list_add_tail(&work->node, &domain->pending_works);
>>>> +    mutex_unlock(&domain->reset_lock);
>>>> +
>>>> +    return true;
>>>> +}
>>>> +
>>>> +static inline void amdgpu_reset_domain_del_pendning_work(struct 
>>>> amdgpu_reset_domain *domain,
>>>> +                  struct amdgpu_reset_work_struct *work)
>>>> +{
>>>> +    mutex_lock(&domain->reset_lock);
>>>> +    list_del_init(&work->node);
>>>> +    mutex_unlock(&domain->reset_lock);
>>>> +}
>>>> +
>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>> amdgpu_reset_domain *domain)
>>>> +{
>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>> +
>>>> +    mutex_lock(&domain->reset_lock);
>>>> +    list_for_each_entry_safe(entry, tmp, &domain->pending_works, 
>>>> node) {
>>>> +
>>>> +        list_del_init(&entry->node);
>>>> +
>>>> +        /* Stop any other related pending resets */
>>>> +        cancel_delayed_work(&entry->base);
>>>> +    }
>>>> +
>>>> +    mutex_unlock(&domain->reset_lock);
>>>>   }
>>>>     void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain 
>>>> *reset_domain);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> index 239f232f9c02..574e870d3064 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> @@ -25,6 +25,7 @@
>>>>   #define AMDGPU_VIRT_H
>>>>     #include "amdgv_sriovmsg.h"
>>>> +#include "amdgpu_reset.h"
>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS  (1 << 0) /* vBIOS is 
>>>> sr-iov ready */
>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov is 
>>>> enabled on this GPU */
>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>       uint32_t            reg_val_offs;
>>>>       struct amdgpu_irq_src        ack_irq;
>>>>       struct amdgpu_irq_src        rcv_irq;
>>>> -    struct work_struct        flr_work;
>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>       struct amdgpu_mm_table        mm_table;
>>>>       const struct amdgpu_virt_ops    *ops;
>>>>       struct amdgpu_vf_error_buffer    vf_errors;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>> index b81acf59870c..f3d1c2be9292 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>> @@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct 
>>>> amdgpu_device *adev,
>>>>     static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>>>>   {
>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>> amdgpu_virt, flr_work);
>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>> amdgpu_virt, flr_work.base.work);
>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>> amdgpu_device, virt);
>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>>> amdgpu_device *adev)
>>>>           return r;
>>>>       }
>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
>>>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>> xgpu_ai_mailbox_flr_work);
>>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>         return 0;
>>>>   }
>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>> amdgpu_device *adev)
>>>>   {
>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>> +
>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>> &adev->virt.flr_work);
>>>>   }
>>>>     static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>> @@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct 
>>>> amdgpu_device *adev,
>>>>     static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>>>>   {
>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>> amdgpu_virt, flr_work);
>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>> amdgpu_virt, flr_work.base.work);
>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>> amdgpu_device, virt);
>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>>> amdgpu_device *adev)
>>>>           return r;
>>>>       }
>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
>>>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>> xgpu_nv_mailbox_flr_work);
>>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>         return 0;
>>>>   }
>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>> amdgpu_device *adev)
>>>>   {
>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>> +
>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>> &adev->virt.flr_work);
>>>>   }
>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>> @@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct 
>>>> amdgpu_device *adev,
>>>>     static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>>>>   {
>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>> amdgpu_virt, flr_work);
>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>> amdgpu_virt, flr_work.base.work);
>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>> amdgpu_device, virt);
>>>>         /* wait until RCV_MSG become 3 */
>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>> amdgpu_device *adev)
>>>>           return r;
>>>>       }
>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
>>>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>> xgpu_vi_mailbox_flr_work);
>>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>         return 0;
>>>>   }
>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>> amdgpu_device *adev)
>>>>   {
>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>> +
>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>> &adev->virt.flr_work);
>>>>   }
>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-05 13:54       ` Andrey Grodzovsky
@ 2022-05-05 15:06         ` Christian König
  2022-05-05 18:57           ` Andrey Grodzovsky
  0 siblings, 1 reply; 40+ messages in thread
From: Christian König @ 2022-05-05 15:06 UTC (permalink / raw)
  To: Andrey Grodzovsky, amd-gfx; +Cc: Bai Zoy, lijo.lazar

Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky:
>
> On 2022-05-05 09:23, Christian König wrote:
>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky:
>>> On 2022-05-05 06:09, Christian König wrote:
>>>
>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky:
>>>>> Problem:
>>>>> During hive reset caused by command timing out on a ring
>>>>> extra resets are generated by triggered by KFD which is
>>>>> unable to accesses registers on the resetting ASIC.
>>>>>
>>>>> Fix: Rework GPU reset to use a list of pending reset jobs
>>>>> such that the first reset jobs that actaully resets the entire
>>>>> reset domain will cancel all those pending redundant resets.
>>>>>
>>>>> This is in line with what we already do for redundant TDRs
>>>>> in scheduler code.
>>>>
>>>> Mhm, why exactly do you need the extra linked list then?
>>>>
>>>> Let's talk about that on our call today.
>>>
>>>
>>> Going to miss it as you know, and also this is the place to discuss 
>>> technical questions anyway so -
>>
>> Good point.
>>
>>> It's needed because those other resets are not time out handlers 
>>> that are governed by the scheduler
>>> but rather external resets that are triggered by such clients as 
>>> KFD, RAS and sysfs. Scheduler has no
>>> knowledge of them (and should not have) but they are serialized into 
>>> same wq as the TO handlers
>>> from the scheduler. It just happens that TO triggered reset causes 
>>> in turn another reset (from KFD in
>>> this case) and we want to prevent this second reset from taking 
>>> place just as we want to avoid multiple
>>> TO resets to take place in scheduler code.
>>
>> Yeah, but why do you need multiple workers?
>>
>> You have a single worker for the GPU reset not triggered by the 
>> scheduler in you adev and cancel that at the end of the reset procedure.
>>
>> If anybody things it needs to trigger another reset while in reset 
>> (which is actually a small design bug separately) the reset will just 
>> be canceled in the same way we cancel the scheduler resets.
>>
>> Christian.
>
>
> Had this in mind at first but then I realized that each client (RAS, 
> KFD and sysfs) will want to fill his own data for the work (see 
> amdgpu_device_gpu_recover) - for XGMI hive each will want to set his 
> own adev (which is fine if you set a work per adev as you suggest) but 
> also each client might want (they all put NULL there but in theory in 
> the future) also set his own bad job value and here you might have a 
> collision.

Yeah, but that is intentional. See when we have a job that needs to be 
consumed by the reset handler and not overwritten or something.

In addition to that, keep in mind that you can't allocate any memory 
before or during the GPU reset, nor wait for the reset to complete (so 
you can't allocate anything on the stack either).

I don't think that concept you try here will work.

Regards,
Christian.

> Also in general seems to me it's cleaner approach where this logic 
> (the work items) are held and handled in reset_domain and are not 
> split in each adev or any other entity. We might want in the future to 
> even move the scheduler handling into reset domain since reset domain 
> is supposed to be a generic things and not only or AMD.
>
> Andrey
>
>
>>
>>>
>>> Andrey
>>>
>>>
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 11 +---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  3 +
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 73 
>>>>> +++++++++++++++++++++-
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  3 +-
>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  7 ++-
>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  7 ++-
>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  7 ++-
>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> index 4264abc5604d..99efd8317547 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> @@ -109,6 +109,7 @@
>>>>>   #include "amdgpu_fdinfo.h"
>>>>>   #include "amdgpu_mca.h"
>>>>>   #include "amdgpu_ras.h"
>>>>> +#include "amdgpu_reset.h"
>>>>>     #define MAX_GPU_INSTANCE        16
>>>>>   @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>>>>>       bool grbm_indexed;
>>>>>   };
>>>>>   -enum amd_reset_method {
>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>> -    AMD_RESET_METHOD_BACO,
>>>>> -    AMD_RESET_METHOD_PCI,
>>>>> -};
>>>>> -
>>>>>   struct amdgpu_video_codec_info {
>>>>>       u32 codec_type;
>>>>>       u32 max_width;
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>>> amdgpu_device *adev,
>>>>>       }
>>>>>         tmp_vram_lost_counter = 
>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>> +
>>>>> +    /* Drop all pending resets since we will reset now anyway */
>>>>> +    tmp_adev = list_first_entry(device_list_handle, struct 
>>>>> amdgpu_device,
>>>>> +                        reset_list);
>>>>> +    amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>> +
>>>>>       /* Actual ASIC resets if needed.*/
>>>>>       /* Host driver will handle XGMI hive reset for SRIOV */
>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>>> amdgpu_device *adev,
>>>>>   }
>>>>>     struct amdgpu_recover_work_struct {
>>>>> -    struct work_struct base;
>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>       struct amdgpu_device *adev;
>>>>>       struct amdgpu_job *job;
>>>>>       int ret;
>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>     static void amdgpu_device_queue_gpu_recover_work(struct 
>>>>> work_struct *work)
>>>>>   {
>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>> container_of(work, struct amdgpu_recover_work_struct, base);
>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>> base.base.work);
>>>>>         recover_work->ret = 
>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
>>>>>   }
>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct 
>>>>> amdgpu_device *adev,
>>>>>   {
>>>>>       struct amdgpu_recover_work_struct work = {.adev = adev, .job 
>>>>> = job};
>>>>>   -    INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
>>>>> +    INIT_DELAYED_WORK(&work.base.base, 
>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>> +    INIT_LIST_HEAD(&work.base.node);
>>>>>         if (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>> &work.base))
>>>>>           return -EAGAIN;
>>>>>   -    flush_work(&work.base);
>>>>> +    flush_delayed_work(&work.base.base);
>>>>> +
>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>> &work.base);
>>>>>         return work.ret;
>>>>>   }
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>> index c80af0889773..ffddd419c351 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>       atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>       init_rwsem(&reset_domain->sem);
>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>> +    mutex_init(&reset_domain->reset_lock);
>>>>> +
>>>>>       return reset_domain;
>>>>>   }
>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>> @@ -24,7 +24,18 @@
>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>   #define __AMDGPU_RESET_H__
>>>>>   -#include "amdgpu.h"
>>>>> +
>>>>> +#include <linux/atomic.h>
>>>>> +#include <linux/mutex.h>
>>>>> +#include <linux/list.h>
>>>>> +#include <linux/kref.h>
>>>>> +#include <linux/rwsem.h>
>>>>> +#include <linux/workqueue.h>
>>>>> +
>>>>> +struct amdgpu_device;
>>>>> +struct amdgpu_job;
>>>>> +struct amdgpu_hive_info;
>>>>> +
>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>   };
>>>>>   +
>>>>> +enum amd_reset_method {
>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>> +    AMD_RESET_METHOD_BACO,
>>>>> +    AMD_RESET_METHOD_PCI,
>>>>> +};
>>>>> +
>>>>>   struct amdgpu_reset_context {
>>>>>       enum amd_reset_method method;
>>>>>       struct amdgpu_device *reset_req_dev;
>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>       unsigned long flags;
>>>>>   };
>>>>>   +struct amdgpu_reset_control;
>>>>> +
>>>>>   struct amdgpu_reset_handler {
>>>>>       enum amd_reset_method reset_method;
>>>>>       struct list_head handler_list;
>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>       XGMI_HIVE
>>>>>   };
>>>>>   +
>>>>> +struct amdgpu_reset_work_struct {
>>>>> +    struct delayed_work base;
>>>>> +    struct list_head node;
>>>>> +};
>>>>> +
>>>>>   struct amdgpu_reset_domain {
>>>>>       struct kref refcount;
>>>>>       struct workqueue_struct *wq;
>>>>>       enum amdgpu_reset_domain_type type;
>>>>>       struct rw_semaphore sem;
>>>>>       atomic_t in_gpu_reset;
>>>>> +
>>>>> +    struct list_head pending_works;
>>>>> +    struct mutex reset_lock;
>>>>>   };
>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>>>>>   }
>>>>>     static inline bool amdgpu_reset_domain_schedule(struct 
>>>>> amdgpu_reset_domain *domain,
>>>>> -                        struct work_struct *work)
>>>>> +                        struct amdgpu_reset_work_struct *work)
>>>>>   {
>>>>> -    return queue_work(domain->wq, work);
>>>>> +    mutex_lock(&domain->reset_lock);
>>>>> +
>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>>>> +        mutex_unlock(&domain->reset_lock);
>>>>> +        return false;
>>>>> +    }
>>>>> +
>>>>> +    list_add_tail(&work->node, &domain->pending_works);
>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>> +
>>>>> +    return true;
>>>>> +}
>>>>> +
>>>>> +static inline void amdgpu_reset_domain_del_pendning_work(struct 
>>>>> amdgpu_reset_domain *domain,
>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>> +{
>>>>> +    mutex_lock(&domain->reset_lock);
>>>>> +    list_del_init(&work->node);
>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>> +}
>>>>> +
>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>> amdgpu_reset_domain *domain)
>>>>> +{
>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>> +
>>>>> +    mutex_lock(&domain->reset_lock);
>>>>> +    list_for_each_entry_safe(entry, tmp, &domain->pending_works, 
>>>>> node) {
>>>>> +
>>>>> +        list_del_init(&entry->node);
>>>>> +
>>>>> +        /* Stop any other related pending resets */
>>>>> +        cancel_delayed_work(&entry->base);
>>>>> +    }
>>>>> +
>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>   }
>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>> amdgpu_reset_domain *reset_domain);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>> index 239f232f9c02..574e870d3064 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>> @@ -25,6 +25,7 @@
>>>>>   #define AMDGPU_VIRT_H
>>>>>     #include "amdgv_sriovmsg.h"
>>>>> +#include "amdgpu_reset.h"
>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS  (1 << 0) /* vBIOS is 
>>>>> sr-iov ready */
>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov is 
>>>>> enabled on this GPU */
>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>       uint32_t            reg_val_offs;
>>>>>       struct amdgpu_irq_src        ack_irq;
>>>>>       struct amdgpu_irq_src        rcv_irq;
>>>>> -    struct work_struct        flr_work;
>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>       struct amdgpu_mm_table        mm_table;
>>>>>       const struct amdgpu_virt_ops    *ops;
>>>>>       struct amdgpu_vf_error_buffer    vf_errors;
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>> @@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct 
>>>>> amdgpu_device *adev,
>>>>>     static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>>>>>   {
>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>> amdgpu_virt, flr_work);
>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>> amdgpu_virt, flr_work.base.work);
>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>> amdgpu_device, virt);
>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>>>> amdgpu_device *adev)
>>>>>           return r;
>>>>>       }
>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
>>>>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>> xgpu_ai_mailbox_flr_work);
>>>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>         return 0;
>>>>>   }
>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>>> amdgpu_device *adev)
>>>>>   {
>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>> +
>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>> &adev->virt.flr_work);
>>>>>   }
>>>>>     static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>> @@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct 
>>>>> amdgpu_device *adev,
>>>>>     static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>>>>>   {
>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>> amdgpu_virt, flr_work);
>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>> amdgpu_virt, flr_work.base.work);
>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>> amdgpu_device, virt);
>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>>>> amdgpu_device *adev)
>>>>>           return r;
>>>>>       }
>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
>>>>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>> xgpu_nv_mailbox_flr_work);
>>>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>         return 0;
>>>>>   }
>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>> amdgpu_device *adev)
>>>>>   {
>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>> +
>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>> &adev->virt.flr_work);
>>>>>   }
>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>> @@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct 
>>>>> amdgpu_device *adev,
>>>>>     static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>>>>>   {
>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>> amdgpu_virt, flr_work);
>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>> amdgpu_virt, flr_work.base.work);
>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>> amdgpu_device, virt);
>>>>>         /* wait until RCV_MSG become 3 */
>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>> amdgpu_device *adev)
>>>>>           return r;
>>>>>       }
>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
>>>>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>> xgpu_vi_mailbox_flr_work);
>>>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>         return 0;
>>>>>   }
>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>> amdgpu_device *adev)
>>>>>   {
>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>> +
>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>> &adev->virt.flr_work);
>>>>>   }
>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>
>>


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-05 15:06         ` Christian König
@ 2022-05-05 18:57           ` Andrey Grodzovsky
  2022-05-05 19:49             ` Felix Kuehling
  0 siblings, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-05 18:57 UTC (permalink / raw)
  To: Christian König, amd-gfx; +Cc: Bai Zoy, lijo.lazar


On 2022-05-05 11:06, Christian König wrote:
> Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky:
>>
>> On 2022-05-05 09:23, Christian König wrote:
>>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky:
>>>> On 2022-05-05 06:09, Christian König wrote:
>>>>
>>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky:
>>>>>> Problem:
>>>>>> During hive reset caused by command timing out on a ring
>>>>>> extra resets are triggered by KFD, which is
>>>>>> unable to access registers on the resetting ASIC.
>>>>>>
>>>>>> Fix: Rework GPU reset to use a list of pending reset jobs
>>>>>> such that the first reset job that actually resets the entire
>>>>>> reset domain will cancel all those pending redundant resets.
>>>>>>
>>>>>> This is in line with what we already do for redundant TDRs
>>>>>> in scheduler code.
>>>>>
>>>>> Mhm, why exactly do you need the extra linked list then?
>>>>>
>>>>> Let's talk about that on our call today.
>>>>
>>>>
>>>> Going to miss it as you know, and also this is the place to discuss 
>>>> technical questions anyway so -
>>>
>>> Good point.
>>>
>>>> It's needed because those other resets are not time out handlers 
>>>> that are governed by the scheduler
>>>> but rather external resets that are triggered by such clients as 
>>>> KFD, RAS and sysfs. Scheduler has no
>>>> knowledge of them (and should not have) but they are serialized 
>>>> into same wq as the TO handlers
>>>> from the scheduler. It just happens that a TO-triggered reset causes 
>>>> in turn another reset (from KFD in
>>>> this case) and we want to prevent this second reset from taking 
>>>> place just as we want to avoid multiple
>>>> TO resets to take place in scheduler code.
>>>
>>> Yeah, but why do you need multiple workers?
>>>
>>> You have a single worker for the GPU reset not triggered by the 
>>> scheduler in your adev and cancel that at the end of the reset 
>>> procedure.
>>>
>>> If anybody thinks it needs to trigger another reset while in reset 
>>> (which is actually a small design bug separately) the reset will 
>>> just be canceled in the same way we cancel the scheduler resets.
>>>
>>> Christian.
>>
>>
>> Had this in mind at first but then I realized that each client (RAS, 
>> KFD and sysfs) will want to fill his own data for the work (see 
>> amdgpu_device_gpu_recover) - for XGMI hive each will want to set his 
>> own adev (which is fine if you set a work per adev as you suggest) 
>> but also each client might want (they all put NULL there but in 
>> theory in the future) also set his own bad job value and here you 
>> might have a collision.
>
> Yeah, but that is intentional. See when we have a job that needs to be 
> consumed by the reset handler and not overwritten or something.


I am not sure why this is a requirement, multiple clients can decide 
concurrently to trigger a reset for some reason (possibly independent 
reasons), hence they cannot share the same work struct to pass their data to it.


>
>
> Additional to that keep in mind that you can't allocate any memory 
> before or during the GPU reset nor wait for the reset to complete (so 
> you can't allocate anything on the stack either).


There is no dynamic allocation here. Regarding stack allocations - we do 
them all the time when we call functions, even during GPU resets. How is 
the on-stack allocation of the work struct in amdgpu_device_gpu_recover 
different from any other local variable we allocate in any function we 
call?

I am also not sure why it's not allowed to wait for reset to complete ? 
Also, see in amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - the 
caller expects the reset to complete before he returns. I can probably 
work around it in RAS code by calling atomic_set(&ras->in_recovery, 0)  
from some callback within actual reset function but regarding sysfs it 
actually expects a result returned indicating whether the call was 
successful or not.

Andrey


>
> I don't think that concept you try here will work.
>
> Regards,
> Christian.
>
>> Also in general seems to me it's cleaner approach where this logic 
>> (the work items) are held and handled in reset_domain and are not 
>> split in each adev or any other entity. We might want in the future 
>> to even move the scheduler handling into reset domain since reset 
>> domain is supposed to be a generic thing and not only for AMD.
>>
>> Andrey
>>
>>
>>>
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>
>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>> ---
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 11 +---
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  3 +
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 73 
>>>>>> +++++++++++++++++++++-
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  3 +-
>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  7 ++-
>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  7 ++-
>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  7 ++-
>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> @@ -109,6 +109,7 @@
>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>   #include "amdgpu_mca.h"
>>>>>>   #include "amdgpu_ras.h"
>>>>>> +#include "amdgpu_reset.h"
>>>>>>     #define MAX_GPU_INSTANCE        16
>>>>>>   @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>>>>>>       bool grbm_indexed;
>>>>>>   };
>>>>>>   -enum amd_reset_method {
>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>> -};
>>>>>> -
>>>>>>   struct amdgpu_video_codec_info {
>>>>>>       u32 codec_type;
>>>>>>       u32 max_width;
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>>>> amdgpu_device *adev,
>>>>>>       }
>>>>>>         tmp_vram_lost_counter = 
>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>> +
>>>>>> +    /* Drop all pending resets since we will reset now anyway */
>>>>>> +    tmp_adev = list_first_entry(device_list_handle, struct 
>>>>>> amdgpu_device,
>>>>>> +                        reset_list);
>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>> +
>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>       /* Host driver will handle XGMI hive reset for SRIOV */
>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>>>> amdgpu_device *adev,
>>>>>>   }
>>>>>>     struct amdgpu_recover_work_struct {
>>>>>> -    struct work_struct base;
>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>       struct amdgpu_device *adev;
>>>>>>       struct amdgpu_job *job;
>>>>>>       int ret;
>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>>     static void amdgpu_device_queue_gpu_recover_work(struct 
>>>>>> work_struct *work)
>>>>>>   {
>>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>>> container_of(work, struct amdgpu_recover_work_struct, base);
>>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>> base.base.work);
>>>>>>         recover_work->ret = 
>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>> recover_work->job);
>>>>>>   }
>>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct 
>>>>>> amdgpu_device *adev,
>>>>>>   {
>>>>>>       struct amdgpu_recover_work_struct work = {.adev = adev, 
>>>>>> .job = job};
>>>>>>   -    INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
>>>>>> +    INIT_DELAYED_WORK(&work.base.base, 
>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>> +    INIT_LIST_HEAD(&work.base.node);
>>>>>>         if (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>> &work.base))
>>>>>>           return -EAGAIN;
>>>>>>   -    flush_work(&work.base);
>>>>>> +    flush_delayed_work(&work.base.base);
>>>>>> +
>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>> &work.base);
>>>>>>         return work.ret;
>>>>>>   }
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>       atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>       init_rwsem(&reset_domain->sem);
>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>> +    mutex_init(&reset_domain->reset_lock);
>>>>>> +
>>>>>>       return reset_domain;
>>>>>>   }
>>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>> @@ -24,7 +24,18 @@
>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>   -#include "amdgpu.h"
>>>>>> +
>>>>>> +#include <linux/atomic.h>
>>>>>> +#include <linux/mutex.h>
>>>>>> +#include <linux/list.h>
>>>>>> +#include <linux/kref.h>
>>>>>> +#include <linux/rwsem.h>
>>>>>> +#include <linux/workqueue.h>
>>>>>> +
>>>>>> +struct amdgpu_device;
>>>>>> +struct amdgpu_job;
>>>>>> +struct amdgpu_hive_info;
>>>>>> +
>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>   };
>>>>>>   +
>>>>>> +enum amd_reset_method {
>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>> +};
>>>>>> +
>>>>>>   struct amdgpu_reset_context {
>>>>>>       enum amd_reset_method method;
>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>       unsigned long flags;
>>>>>>   };
>>>>>>   +struct amdgpu_reset_control;
>>>>>> +
>>>>>>   struct amdgpu_reset_handler {
>>>>>>       enum amd_reset_method reset_method;
>>>>>>       struct list_head handler_list;
>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>       XGMI_HIVE
>>>>>>   };
>>>>>>   +
>>>>>> +struct amdgpu_reset_work_struct {
>>>>>> +    struct delayed_work base;
>>>>>> +    struct list_head node;
>>>>>> +};
>>>>>> +
>>>>>>   struct amdgpu_reset_domain {
>>>>>>       struct kref refcount;
>>>>>>       struct workqueue_struct *wq;
>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>       struct rw_semaphore sem;
>>>>>>       atomic_t in_gpu_reset;
>>>>>> +
>>>>>> +    struct list_head pending_works;
>>>>>> +    struct mutex reset_lock;
>>>>>>   };
>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>>>>>>   }
>>>>>>     static inline bool amdgpu_reset_domain_schedule(struct 
>>>>>> amdgpu_reset_domain *domain,
>>>>>> -                        struct work_struct *work)
>>>>>> +                        struct amdgpu_reset_work_struct *work)
>>>>>>   {
>>>>>> -    return queue_work(domain->wq, work);
>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>> +
>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>>>>> +        mutex_unlock(&domain->reset_lock);
>>>>>> +        return false;
>>>>>> +    }
>>>>>> +
>>>>>> +    list_add_tail(&work->node, &domain->pending_works);
>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>> +
>>>>>> +    return true;
>>>>>> +}
>>>>>> +
>>>>>> +static inline void amdgpu_reset_domain_del_pendning_work(struct 
>>>>>> amdgpu_reset_domain *domain,
>>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>>> +{
>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>> +    list_del_init(&work->node);
>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>> +}
>>>>>> +
>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>> amdgpu_reset_domain *domain)
>>>>>> +{
>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>> +
>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>> +    list_for_each_entry_safe(entry, tmp, &domain->pending_works, 
>>>>>> node) {
>>>>>> +
>>>>>> +        list_del_init(&entry->node);
>>>>>> +
>>>>>> +        /* Stop any other related pending resets */
>>>>>> +        cancel_delayed_work(&entry->base);
>>>>>> +    }
>>>>>> +
>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>   }
>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>> @@ -25,6 +25,7 @@
>>>>>>   #define AMDGPU_VIRT_H
>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>> +#include "amdgpu_reset.h"
>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS  (1 << 0) /* vBIOS is 
>>>>>> sr-iov ready */
>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov is 
>>>>>> enabled on this GPU */
>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>       uint32_t            reg_val_offs;
>>>>>>       struct amdgpu_irq_src        ack_irq;
>>>>>>       struct amdgpu_irq_src        rcv_irq;
>>>>>> -    struct work_struct        flr_work;
>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>       struct amdgpu_mm_table        mm_table;
>>>>>>       const struct amdgpu_virt_ops    *ops;
>>>>>>       struct amdgpu_vf_error_buffer    vf_errors;
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>> @@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct 
>>>>>> amdgpu_device *adev,
>>>>>>     static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>>>>>>   {
>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>> amdgpu_virt, flr_work);
>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>> amdgpu_device, virt);
>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>>>>> amdgpu_device *adev)
>>>>>>           return r;
>>>>>>       }
>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
>>>>>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>         return 0;
>>>>>>   }
>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>>>> amdgpu_device *adev)
>>>>>>   {
>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>> +
>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>> &adev->virt.flr_work);
>>>>>>   }
>>>>>>     static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>> @@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct 
>>>>>> amdgpu_device *adev,
>>>>>>     static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>>>>>>   {
>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>> amdgpu_virt, flr_work);
>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>> amdgpu_device, virt);
>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>>>>> amdgpu_device *adev)
>>>>>>           return r;
>>>>>>       }
>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
>>>>>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>         return 0;
>>>>>>   }
>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>>> amdgpu_device *adev)
>>>>>>   {
>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>> +
>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>> &adev->virt.flr_work);
>>>>>>   }
>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>> @@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct 
>>>>>> amdgpu_device *adev,
>>>>>>     static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>>>>>>   {
>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>> amdgpu_virt, flr_work);
>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>> amdgpu_device, virt);
>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>>> amdgpu_device *adev)
>>>>>>           return r;
>>>>>>       }
>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
>>>>>> +    INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>         return 0;
>>>>>>   }
>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>>> amdgpu_device *adev)
>>>>>>   {
>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>> +
>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>> &adev->virt.flr_work);
>>>>>>   }
>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>
>>>
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-05 18:57           ` Andrey Grodzovsky
@ 2022-05-05 19:49             ` Felix Kuehling
  2022-05-05 21:47               ` Andrey Grodzovsky
  0 siblings, 1 reply; 40+ messages in thread
From: Felix Kuehling @ 2022-05-05 19:49 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, amd-gfx; +Cc: Bai Zoy, lijo.lazar


Am 2022-05-05 um 14:57 schrieb Andrey Grodzovsky:
>
> On 2022-05-05 11:06, Christian König wrote:
>> Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky:
>>>
>>> On 2022-05-05 09:23, Christian König wrote:
>>>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky:
>>>>> On 2022-05-05 06:09, Christian König wrote:
>>>>>
>>>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky:
>>>>>>> Problem:
>>>>>>> During hive reset caused by command timing out on a ring
>>>>>>> extra resets are triggered by KFD, which is
>>>>>>> unable to access registers on the resetting ASIC.
>>>>>>>
>>>>>>> Fix: Rework GPU reset to use a list of pending reset jobs
>>>>>>> such that the first reset job that actually resets the entire
>>>>>>> reset domain will cancel all those pending redundant resets.
>>>>>>>
>>>>>>> This is in line with what we already do for redundant TDRs
>>>>>>> in scheduler code.
>>>>>>
>>>>>> Mhm, why exactly do you need the extra linked list then?
>>>>>>
>>>>>> Let's talk about that on our call today.
>>>>>
>>>>>
>>>>> Going to miss it as you know, and also this is the place to 
>>>>> discuss technical questions anyway so -
>>>>
>>>> Good point.
>>>>
>>>>> It's needed because those other resets are not time out handlers 
>>>>> that are governed by the scheduler
>>>>> but rather external resets that are triggered by such clients as 
>>>>> KFD, RAS and sysfs. Scheduler has no
>>>>> knowledge of them (and should not have) but they are serialized 
>>>>> into same wq as the TO handlers
>>>>> from the scheduler. It just happens that TO triggered reset causes 
>>>>> in turn another reset (from KFD in
>>>>> this case) and we want to prevent this second reset from taking 
>>>>> place just as we want to avoid multiple
>>>>> TO resets to take place in scheduler code.
>>>>
>>>> Yeah, but why do you need multiple workers?
>>>>
>>>> You have a single worker for the GPU reset not triggered by the 
>>>> scheduler in your adev and cancel that at the end of the reset 
>>>> procedure.
>>>>
>>>> If anybody thinks it needs to trigger another reset while in reset 
>>>> (which is actually a small design bug separately) the reset will 
>>>> just be canceled in the same way we cancel the scheduler resets.
>>>>
>>>> Christian.
>>>
>>>
>>> Had this in mind at first but then I realized that each client (RAS, 
>>> KFD and sysfs) will want to fill his own data for the work (see 
>>> amdgpu_device_gpu_recover) - for XGMI hive each will want to set his 
>>> own adev (which is fine if you set a work per adev as you suggest) 
>>> but also each client might want (they all put NULL there but in 
>>> theory in the future) also set his own bad job value and here you 
>>> might have a collision.
>>
>> Yeah, but that is intentional. See when we have a job that needs to 
>> be consumed by the reset handler and not overwritten or something.
>
>
> I am not sure why this is a requirement, multiple clients can decide 
> concurrently to trigger a reset for some reason (possibly independent 
> reasons), hence they cannot share the same work struct to pass their 
> data to it.

If those concurrent clients could detect that a reset was already in 
progress, you wouldn't need the complexity of multiple work structs 
being scheduled. You could simply return without triggering another reset.

I'd put the reset work struct into the reset_domain struct. That way 
you'd have exactly one worker for the reset domain. You could implement 
a lock-less scheme to decide whether you need to schedule a reset, e.g. 
using an atomic counter in the shared work struct that gets incremented 
when a client wants to trigger a reset (atomic_add_return). If that 
counter is exactly 1 after incrementing, you need to fill in the rest of 
the work struct and schedule the work. Otherwise, it's already scheduled 
(or another client is in the process of scheduling it) and you just 
return. When the worker finishes (after confirming a successful reset), 
it resets the counter to 0, so the next client requesting a reset will 
schedule the worker again.

Regards,
   Felix


>
>
>>
>>
>> Additional to that keep in mind that you can't allocate any memory 
>> before or during the GPU reset nor wait for the reset to complete (so 
>> you can't allocate anything on the stack either).
>
>
> There is no dynamic allocation here. Regarding stack allocations - we 
> do them all the time when we call functions, even during GPU resets. 
> How is the on-stack allocation of the work struct in 
> amdgpu_device_gpu_recover different from any other local variable we 
> allocate in any function we call?
>
> I am also not sure why it's not allowed to wait for reset to complete 
> ? Also, see in amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - 
> the caller expects the reset to complete before he returns. I can 
> probably work around it in RAS code by calling 
> atomic_set(&ras->in_recovery, 0)  from some callback within actual 
> reset function but regarding sysfs it actually expects a result 
> returned indicating whether the call was successful or not.
>
> Andrey
>
>
>>
>> I don't think that concept you try here will work.
>>
>> Regards,
>> Christian.
>>
>>> Also in general seems to me it's cleaner approach where this logic 
>>> (the work items) are held and handled in reset_domain and are not 
>>> split in each adev or any other entity. We might want in the future 
>>> to even move the scheduler handling into reset domain since reset 
>>> domain is supposed to be a generic thing and not only for AMD.
>>>
>>> Andrey
>>>
>>>
>>>>
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>> ---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 11 +---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  3 +
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 73 
>>>>>>> +++++++++++++++++++++-
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  3 +-
>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  7 ++-
>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  7 ++-
>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  7 ++-
>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>   #include "amdgpu_ras.h"
>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>     #define MAX_GPU_INSTANCE        16
>>>>>>>   @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>>>>>>>       bool grbm_indexed;
>>>>>>>   };
>>>>>>>   -enum amd_reset_method {
>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>> -};
>>>>>>> -
>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>       u32 codec_type;
>>>>>>>       u32 max_width;
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>>>>> amdgpu_device *adev,
>>>>>>>       }
>>>>>>>         tmp_vram_lost_counter = 
>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>> +
>>>>>>> +    /* Drop all pending resets since we will reset now anyway */
>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, struct 
>>>>>>> amdgpu_device,
>>>>>>> +                        reset_list);
>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>> +
>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>       /* Host driver will handle XGMI hive reset for SRIOV */
>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>>>>> amdgpu_device *adev,
>>>>>>>   }
>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>> -    struct work_struct base;
>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>       struct amdgpu_device *adev;
>>>>>>>       struct amdgpu_job *job;
>>>>>>>       int ret;
>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>>>     static void amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>> work_struct *work)
>>>>>>>   {
>>>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base);
>>>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>> base.base.work);
>>>>>>>         recover_work->ret = 
>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>> recover_work->job);
>>>>>>>   }
>>>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct 
>>>>>>> amdgpu_device *adev,
>>>>>>>   {
>>>>>>>       struct amdgpu_recover_work_struct work = {.adev = adev, 
>>>>>>> .job = job};
>>>>>>>   -    INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
>>>>>>> +    INIT_DELAYED_WORK(&work.base.base, 
>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>> +    INIT_LIST_HEAD(&work.base.node);
>>>>>>>         if (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>> &work.base))
>>>>>>>           return -EAGAIN;
>>>>>>>   -    flush_work(&work.base);
>>>>>>> +    flush_delayed_work(&work.base.base);
>>>>>>> +
>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>> &work.base);
>>>>>>>         return work.ret;
>>>>>>>   }
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>       atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>       init_rwsem(&reset_domain->sem);
>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>> +    mutex_init(&reset_domain->reset_lock);
>>>>>>> +
>>>>>>>       return reset_domain;
>>>>>>>   }
>>>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>   -#include "amdgpu.h"
>>>>>>> +
>>>>>>> +#include <linux/atomic.h>
>>>>>>> +#include <linux/mutex.h>
>>>>>>> +#include <linux/list.h>
>>>>>>> +#include <linux/kref.h>
>>>>>>> +#include <linux/rwsem.h>
>>>>>>> +#include <linux/workqueue.h>
>>>>>>> +
>>>>>>> +struct amdgpu_device;
>>>>>>> +struct amdgpu_job;
>>>>>>> +struct amdgpu_hive_info;
>>>>>>> +
>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>   };
>>>>>>>   +
>>>>>>> +enum amd_reset_method {
>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>> +};
>>>>>>> +
>>>>>>>   struct amdgpu_reset_context {
>>>>>>>       enum amd_reset_method method;
>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>       unsigned long flags;
>>>>>>>   };
>>>>>>>   +struct amdgpu_reset_control;
>>>>>>> +
>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>       struct list_head handler_list;
>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>       XGMI_HIVE
>>>>>>>   };
>>>>>>>   +
>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>> +    struct delayed_work base;
>>>>>>> +    struct list_head node;
>>>>>>> +};
>>>>>>> +
>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>       struct kref refcount;
>>>>>>>       struct workqueue_struct *wq;
>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>       struct rw_semaphore sem;
>>>>>>>       atomic_t in_gpu_reset;
>>>>>>> +
>>>>>>> +    struct list_head pending_works;
>>>>>>> +    struct mutex reset_lock;
>>>>>>>   };
>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>>>>>>>   }
>>>>>>>     static inline bool amdgpu_reset_domain_schedule(struct 
>>>>>>> amdgpu_reset_domain *domain,
>>>>>>> -                        struct work_struct *work)
>>>>>>> +                        struct amdgpu_reset_work_struct *work)
>>>>>>>   {
>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>> +
>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>>>>>> +        mutex_unlock(&domain->reset_lock);
>>>>>>> +        return false;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    list_add_tail(&work->node, &domain->pending_works);
>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>> +
>>>>>>> +    return true;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static inline void amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>> amdgpu_reset_domain *domain,
>>>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>>>> +{
>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>> +    list_del_init(&work->node);
>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>> amdgpu_reset_domain *domain)
>>>>>>> +{
>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>> +
>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>> +    list_for_each_entry_safe(entry, tmp, 
>>>>>>> &domain->pending_works, node) {
>>>>>>> +
>>>>>>> +        list_del_init(&entry->node);
>>>>>>> +
>>>>>>> +        /* Stop any other related pending resets */
>>>>>>> +        cancel_delayed_work(&entry->base);
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>   }
>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS  (1 << 0) /* vBIOS is 
>>>>>>> sr-iov ready */
>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov is 
>>>>>>> enabled on this GPU */
>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>       uint32_t            reg_val_offs;
>>>>>>>       struct amdgpu_irq_src        ack_irq;
>>>>>>>       struct amdgpu_irq_src        rcv_irq;
>>>>>>> -    struct work_struct        flr_work;
>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>       struct amdgpu_mm_table        mm_table;
>>>>>>>       const struct amdgpu_virt_ops    *ops;
>>>>>>>       struct amdgpu_vf_error_buffer    vf_errors;
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>>>>>>>   {
>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>> amdgpu_virt, flr_work);
>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>> amdgpu_device, virt);
>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>           return r;
>>>>>>>       }
>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>         return 0;
>>>>>>>   }
>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>   {
>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>> +
>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>> &adev->virt.flr_work);
>>>>>>>   }
>>>>>>>     static int xgpu_ai_request_init_data(struct amdgpu_device 
>>>>>>> *adev)
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>>>>>>>   {
>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>> amdgpu_virt, flr_work);
>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>> amdgpu_device, virt);
>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>           return r;
>>>>>>>       }
>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>         return 0;
>>>>>>>   }
>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>   {
>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>> +
>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>> &adev->virt.flr_work);
>>>>>>>   }
>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>>>>>>>   {
>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>> amdgpu_virt, flr_work);
>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>> amdgpu_device, virt);
>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>           return r;
>>>>>>>       }
>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>> +    INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>         return 0;
>>>>>>>   }
>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>   {
>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>> +
>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>> &adev->virt.flr_work);
>>>>>>>   }
>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>
>>>>
>>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-05 19:49             ` Felix Kuehling
@ 2022-05-05 21:47               ` Andrey Grodzovsky
  2022-05-06  5:41                 ` Luben Tuikov
  2022-05-06  6:02                 ` Lazar, Lijo
  0 siblings, 2 replies; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-05 21:47 UTC (permalink / raw)
  To: Felix Kuehling, Christian König, amd-gfx; +Cc: Bai Zoy, lijo.lazar


On 2022-05-05 15:49, Felix Kuehling wrote:
>
> Am 2022-05-05 um 14:57 schrieb Andrey Grodzovsky:
>>
>> On 2022-05-05 11:06, Christian König wrote:
>>> Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky:
>>>>
>>>> On 2022-05-05 09:23, Christian König wrote:
>>>>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky:
>>>>>> On 2022-05-05 06:09, Christian König wrote:
>>>>>>
>>>>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky:
>>>>>>>> Problem:
>>>>>>>> During hive reset caused by command timing out on a ring
>>>>>>>> extra resets are triggered by KFD, which is
>>>>>>>> unable to access registers on the resetting ASIC.
>>>>>>>>
>>>>>>>> Fix: Rework GPU reset to use a list of pending reset jobs
>>>>>>>> such that the first reset job that actually resets the entire
>>>>>>>> reset domain will cancel all those pending redundant resets.
>>>>>>>>
>>>>>>>> This is in line with what we already do for redundant TDRs
>>>>>>>> in scheduler code.
>>>>>>>
>>>>>>> Mhm, why exactly do you need the extra linked list then?
>>>>>>>
>>>>>>> Let's talk about that on our call today.
>>>>>>
>>>>>>
>>>>>> Going to miss it as you know, and also this is the place to 
>>>>>> discuss technical questions anyway so -
>>>>>
>>>>> Good point.
>>>>>
>>>>>> It's needed because those other resets are not time out handlers 
>>>>>> that are governed by the scheduler
>>>>>> but rather external resets that are triggered by such clients as 
>>>>>> KFD, RAS and sysfs. Scheduler has no
>>>>>> knowledge of them (and should not have) but they are serialized 
>>>>>> into same wq as the TO handlers
>>>>>> from the scheduler. It just happens that TO triggered reset 
>>>>>> causes in turn another reset (from KFD in
>>>>>> this case) and we want to prevent this second reset from taking 
>>>>>> place just as we want to avoid multiple
>>>>>> TO resets to take place in scheduler code.
>>>>>
>>>>> Yeah, but why do you need multiple workers?
>>>>>
>>>>> You have a single worker for the GPU reset not triggered by the 
>>>>> scheduler in you adev and cancel that at the end of the reset 
>>>>> procedure.
>>>>>
>>>>> If anybody things it needs to trigger another reset while in reset 
>>>>> (which is actually a small design bug separately) the reset will 
>>>>> just be canceled in the same way we cancel the scheduler resets.
>>>>>
>>>>> Christian.
>>>>
>>>>
>>>> Had this in mind at first but then I realized that each client 
>>>> (RAS, KFD and sysfs) will want to fill his own data for the work 
>>>> (see amdgpu_device_gpu_recover) - for XGMI hive each will want to 
>>>> set his own adev (which is fine if you set a work per adev as you 
>>>> suggest) but also each client might want (they all put NULL there 
>>>> but in theory in the future) also set his own bad job value and 
>>>> here you might have a collision.
>>>
>>> Yeah, but that is intentional. See when we have a job that needs to 
>>> be consumed by the reset handler and not overwritten or something.
>>
>>
>> I am not sure why this is a requirement, multiple clients can decide 
>> concurrently to trigger a reset for some reason (possibly independent 
>> reasons) hence they cannot share same work struct to pass to it their 
>> data.
>
> If those concurrent clients could detect that a reset was already in 
> progress, you wouldn't need the complexity of multiple work structs 
> being scheduled. You could simply return without triggering another 
> reset.


In my view, the main problem with a single work struct — either at the 
reset domain level or even at the adev level — is that in some cases we 
optimize resets and don't really perform an ASIC HW reset (see 
amdgpu_job_timedout with soft recovery, and skip_hw_reset in 
amdgpu_device_gpu_recover_imp for the case where the bad job gets 
signaled just before we start the HW reset, so we simply skip it). If 
many different reset sources share the same work struct, what can happen 
is that the first one to obtain the lock you describe below might opt 
out of a full HW reset — because its bad job signaled, for example, or 
because its hung IP block was able to recover through a SW reset — while 
in the meantime another reset source that needed an actual HW reset just 
silently returned, and we end up with an unhandled reset request. True, 
today this happens only to job timeout reset sources, which are handled 
from within the scheduler and won't use this single work struct, but 
nothing prevents a future case of this happening. Also, if we actually 
want to unify scheduler timeout handlers within the reset domain (which 
seems to me the right design approach), we won't be able to use just one 
work struct for this reason anyway.

Andrey


>
> I'd put the reset work struct into the reset_domain struct. That way 
> you'd have exactly one worker for the reset domain. You could 
> implement a lock-less scheme to decide whether you need to schedule a 
> reset, e.g. using an atomic counter in the shared work struct that 
> gets incremented when a client wants to trigger a reset 
> (atomic_add_return). If that counter is exactly 1 after incrementing, 
> you need to fill in the rest of the work struct and schedule the work. 
> Otherwise, it's already scheduled (or another client is in the process 
> of scheduling it) and you just return. When the worker finishes (after 
> confirming a successful reset), it resets the counter to 0, so the 
> next client requesting a reset will schedule the worker again.
>
> Regards,
>   Felix
>
>
>>
>>
>>>
>>>
>>> Additional to that keep in mind that you can't allocate any memory 
>>> before or during the GPU reset nor wait for the reset to complete 
>>> (so you can't allocate anything on the stack either).
>>
>>
>> There is no dynamic allocation here. Regarding stack allocations - we 
>> do them all the time when we call functions, even during GPU resets; 
>> how is on-stack allocation of a work struct in amdgpu_device_gpu_recover 
>> different from any other local variable we allocate in any 
>> function we call?
>>
>> I am also not sure why it's not allowed to wait for reset to complete 
>> ? Also, see in amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - 
>> the caller expects the reset to complete before he returns. I can 
>> probably work around it in RAS code by calling 
>> atomic_set(&ras->in_recovery, 0)  from some callback within actual 
>> reset function but regarding sysfs it actually expects a result 
>> returned indicating whether the call was successful or not.
>>
>> Andrey
>>
>>
>>>
>>> I don't think that concept you try here will work.
>>>
>>> Regards,
>>> Christian.
>>>
>>>> Also in general seems to me it's cleaner approach where this logic 
>>>> (the work items) are held and handled in reset_domain and are not 
>>>> split in each adev or any other entity. We might want in the future 
>>>> to even move the scheduler handling into reset domain since reset 
>>>> domain is supposed to be a generic things and not only or AMD.
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>>
>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>> ---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 11 +---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  3 +
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 73 
>>>>>>>> +++++++++++++++++++++-
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  3 +-
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  7 ++-
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  7 ++-
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  7 ++-
>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>     #define MAX_GPU_INSTANCE        16
>>>>>>>>   @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>>>>>>>>       bool grbm_indexed;
>>>>>>>>   };
>>>>>>>>   -enum amd_reset_method {
>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>> -};
>>>>>>>> -
>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>       u32 codec_type;
>>>>>>>>       u32 max_width;
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>>>>>> amdgpu_device *adev,
>>>>>>>>       }
>>>>>>>>         tmp_vram_lost_counter = 
>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>> +
>>>>>>>> +    /* Drop all pending resets since we will reset now anyway */
>>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, struct 
>>>>>>>> amdgpu_device,
>>>>>>>> +                        reset_list);
>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>> +
>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>       /* Host driver will handle XGMI hive reset for SRIOV */
>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>>>>>> amdgpu_device *adev,
>>>>>>>>   }
>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>> -    struct work_struct base;
>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>       int ret;
>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>>>>     static void amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>> work_struct *work)
>>>>>>>>   {
>>>>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base);
>>>>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>> base.base.work);
>>>>>>>>         recover_work->ret = 
>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>> recover_work->job);
>>>>>>>>   }
>>>>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct 
>>>>>>>> amdgpu_device *adev,
>>>>>>>>   {
>>>>>>>>       struct amdgpu_recover_work_struct work = {.adev = adev, 
>>>>>>>> .job = job};
>>>>>>>>   -    INIT_WORK(&work.base, 
>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>> +    INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>> +    INIT_LIST_HEAD(&work.base.node);
>>>>>>>>         if (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>> &work.base))
>>>>>>>>           return -EAGAIN;
>>>>>>>>   -    flush_work(&work.base);
>>>>>>>> +    flush_delayed_work(&work.base.base);
>>>>>>>> +
>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>> &work.base);
>>>>>>>>         return work.ret;
>>>>>>>>   }
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>       atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>       init_rwsem(&reset_domain->sem);
>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>> +    mutex_init(&reset_domain->reset_lock);
>>>>>>>> +
>>>>>>>>       return reset_domain;
>>>>>>>>   }
>>>>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>   -#include "amdgpu.h"
>>>>>>>> +
>>>>>>>> +#include <linux/atomic.h>
>>>>>>>> +#include <linux/mutex.h>
>>>>>>>> +#include <linux/list.h>
>>>>>>>> +#include <linux/kref.h>
>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>> +
>>>>>>>> +struct amdgpu_device;
>>>>>>>> +struct amdgpu_job;
>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>> +
>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>   };
>>>>>>>>   +
>>>>>>>> +enum amd_reset_method {
>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>> +};
>>>>>>>> +
>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>       enum amd_reset_method method;
>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>       unsigned long flags;
>>>>>>>>   };
>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>> +
>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>       struct list_head handler_list;
>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>       XGMI_HIVE
>>>>>>>>   };
>>>>>>>>   +
>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>> +    struct delayed_work base;
>>>>>>>> +    struct list_head node;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>       struct kref refcount;
>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>> +
>>>>>>>> +    struct list_head pending_works;
>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>   };
>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>>>>>>>>   }
>>>>>>>>     static inline bool amdgpu_reset_domain_schedule(struct 
>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>> -                        struct work_struct *work)
>>>>>>>> +                        struct amdgpu_reset_work_struct *work)
>>>>>>>>   {
>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>> +
>>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>>>>>>> +        mutex_unlock(&domain->reset_lock);
>>>>>>>> +        return false;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    list_add_tail(&work->node, &domain->pending_works);
>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>> +
>>>>>>>> +    return true;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static inline void 
>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>>>>> +{
>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>> +    list_del_init(&work->node);
>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>> +{
>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>> +
>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>> +    list_for_each_entry_safe(entry, tmp, 
>>>>>>>> &domain->pending_works, node) {
>>>>>>>> +
>>>>>>>> +        list_del_init(&entry->node);
>>>>>>>> +
>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>> +        cancel_delayed_work(&entry->base);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>>   }
>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS  (1 << 0) /* vBIOS is 
>>>>>>>> sr-iov ready */
>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov is 
>>>>>>>> enabled on this GPU */
>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>       uint32_t            reg_val_offs;
>>>>>>>>       struct amdgpu_irq_src        ack_irq;
>>>>>>>>       struct amdgpu_irq_src        rcv_irq;
>>>>>>>> -    struct work_struct        flr_work;
>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>       struct amdgpu_mm_table        mm_table;
>>>>>>>>       const struct amdgpu_virt_ops    *ops;
>>>>>>>>       struct amdgpu_vf_error_buffer    vf_errors;
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>>>>>>>>   {
>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>> amdgpu_device, virt);
>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>           return r;
>>>>>>>>       }
>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>         return 0;
>>>>>>>>   }
>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>   {
>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>> +
>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>> &adev->virt.flr_work);
>>>>>>>>   }
>>>>>>>>     static int xgpu_ai_request_init_data(struct amdgpu_device 
>>>>>>>> *adev)
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>>>>>>>>   {
>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>> amdgpu_device, virt);
>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>           return r;
>>>>>>>>       }
>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>         return 0;
>>>>>>>>   }
>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>   {
>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>> +
>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>> &adev->virt.flr_work);
>>>>>>>>   }
>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>>>>>>>>   {
>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>> amdgpu_device, virt);
>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>           return r;
>>>>>>>>       }
>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>         return 0;
>>>>>>>>   }
>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>   {
>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>> +
>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>> &adev->virt.flr_work);
>>>>>>>>   }
>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>
>>>>>
>>>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-05 21:47               ` Andrey Grodzovsky
@ 2022-05-06  5:41                 ` Luben Tuikov
  2022-05-06  6:02                 ` Lazar, Lijo
  1 sibling, 0 replies; 40+ messages in thread
From: Luben Tuikov @ 2022-05-06  5:41 UTC (permalink / raw)
  To: Andrey Grodzovsky, Felix Kuehling, Christian König, amd-gfx
  Cc: Bai Zoy, lijo.lazar



On 2022-05-05 17:47, Andrey Grodzovsky wrote:
> 
> On 2022-05-05 15:49, Felix Kuehling wrote:
>>
>> Am 2022-05-05 um 14:57 schrieb Andrey Grodzovsky:
>>>
>>> On 2022-05-05 11:06, Christian König wrote:
>>>> Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky:
>>>>>
>>>>> On 2022-05-05 09:23, Christian König wrote:
>>>>>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky:
>>>>>>> On 2022-05-05 06:09, Christian König wrote:
>>>>>>>
>>>>>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky:
>>>>>>>>> Problem:
>>>>>>>>> During hive reset caused by command timing out on a ring
>>>>>>>>> extra resets are generated by triggered by KFD which is
>>>>>>>>> unable to accesses registers on the resetting ASIC.
>>>>>>>>>
>>>>>>>>> Fix: Rework GPU reset to use a list of pending reset jobs
>>>>>>>>> such that the first reset jobs that actaully resets the entire
>>>>>>>>> reset domain will cancel all those pending redundant resets.
>>>>>>>>>
>>>>>>>>> This is in line with what we already do for redundant TDRs
>>>>>>>>> in scheduler code.
>>>>>>>>
>>>>>>>> Mhm, why exactly do you need the extra linked list then?
>>>>>>>>
>>>>>>>> Let's talk about that on our call today.
>>>>>>>
>>>>>>>
>>>>>>> Going to miss it as you know, and also this is the place to 
>>>>>>> discuss technical questions anyway so -
>>>>>>
>>>>>> Good point.
>>>>>>
>>>>>>> It's needed because those other resets are not time out handlers 
>>>>>>> that are governed by the scheduler
>>>>>>> but rather external resets that are triggered by such clients as 
>>>>>>> KFD, RAS and sysfs. Scheduler has no
>>>>>>> knowledge of them (and should not have) but they are serialized 
>>>>>>> into same wq as the TO handlers
>>>>>>> from the scheduler. It just happens that TO triggered reset 
>>>>>>> causes in turn another reset (from KFD in
>>>>>>> this case) and we want to prevent this second reset from taking 
>>>>>>> place just as we want to avoid multiple
>>>>>>> TO resets to take place in scheduler code.
>>>>>>
>>>>>> Yeah, but why do you need multiple workers?
>>>>>>
>>>>>> You have a single worker for the GPU reset not triggered by the 
>>>>>> scheduler in you adev and cancel that at the end of the reset 
>>>>>> procedure.
>>>>>>
>>>>>> If anybody things it needs to trigger another reset while in reset 
>>>>>> (which is actually a small design bug separately) the reset will 
>>>>>> just be canceled in the same way we cancel the scheduler resets.
>>>>>>
>>>>>> Christian.
>>>>>
>>>>>
>>>>> Had this in mind at first but then I realized that each client 
>>>>> (RAS, KFD and sysfs) will want to fill his own data for the work 
>>>>> (see amdgpu_device_gpu_recover) - for XGMI hive each will want to 
>>>>> set his own adev (which is fine if you set a work per adev as you 
>>>>> suggest) but also each client might want (they all put NULL there 
>>>>> but in theory in the future) also set his own bad job value and 
>>>>> here you might have a collision.
>>>>
>>>> Yeah, but that is intentional. See when we have a job that needs to 
>>>> be consumed by the reset handler and not overwritten or something.
>>>
>>>
>>> I am not sure why this is a requirement, multiple clients can decide 
>>> concurrently to trigger a reset for some reason (possibly independent 
>>> reasons) hence they cannot share same work struct to pass to it their 
>>> data.
>>
>> If those concurrent clients could detect that a reset was already in 
>> progress, you wouldn't need the complexity of multiple work structs 
>> being scheduled. You could simply return without triggering another 
>> reset.
> 
> 
> In my view main problem here with single work struct either at reset 
> domain level or even adev level is that in some cases we optimize resets 
> and don't really perform ASIC HW reset (see amdgpu_job_timedout with 
> soft recovery and skip_hw_reset in amdgpu_device_gpu_recover_imp for the 
> case the bad job does get signaled just before we start HW reset and we 
> just skip this). You can see that if many different reset sources share 
> the same work struct what can happen is that the first to obtain the lock 
> you describe below might opt out from full HW reset because his bad job 

The problem is this "opting out" of reset--meaning that the client didn't do
the complete recovery work, and was too quick to call for a GPU reset. So
they need to fix this locally in their code.

Generally when a GPU reset is scheduled, it should proceed regardless,
and there shouldn't be any opting out.

> did signal for example or because his hung IP block was able to 
> recover through SW reset but in the meantime another reset source who 
> needed an actual HW reset just silently returned and we end up with 
> unhandled reset request. True that today this happens only to job 
> timeout reset sources that are handled form within the scheduler and 
> won't use this single work struct but no one prevents a future case for 
> this to happen and also, if we actually want to unify scheduler time out 
> handlers within reset domain (which seems to me the right design 
> approach) we won't be able to use just one work struct for this reason 
> anyway.

Felix's idea is good, and here is another idea which I've implemented in
open source when having to handle an infinite number of wire events with a
finite number of types (5 types of event), in finite memory.

Instead of allocating a myriad of reset events when each client wants
a GPU reset, you'd have only one instance of a "reset" event, say an instance
of a linked list entry, call it R. Initially R is list_init(), so it points
to itself.

Anytime someone wants to do a reset, they do a list_move(R, event work list);
(or list_move_tail() if you want the events chronologically ordered)
and wake up a thread. If R is already on the list, meaning another client
has already scheduled the reset, then no change occurs. (You could check if R is non-empty,
and that check would tell you if R is scheduled to be processed, or you could just do
the list_move() and up(processing thread), always, to mitigate races).

Note that many clients can call list_move(R, event work list), and the result
is the same--a reset is pending to occur, regardless if it had already occurred,
or if it hadn't occurred and is about to.

Then when the processing thread wakes up, it starts consuming "events" from that
linked list, and removing them from it, and processing them. Obviously, access to
the linked list needs to be guarded by a lock.

If you have a set of different types of events, say "reset", "start", "stop", "load",
etc., you can create an array of linked list event structs and index them by a macro
whose value is the index into the array, and whose name is the type of the event.

Seeing that it's just a reset event, you could just have a single instance of a linked list.

Anyway, just an idea of an old old implementation of mine.

> 
> Andrey
> 
> 
>>
>> I'd put the reset work struct into the reset_domain struct. That way 
>> you'd have exactly one worker for the reset domain. You could 
>> implement a lock-less scheme to decide whether you need to schedule a 
>> reset, e.g. using an atomic counter in the shared work struct that 
>> gets incremented when a client wants to trigger a reset 
>> (atomic_add_return). If that counter is exactly 1 after incrementing, 
>> you need to fill in the rest of the work struct and schedule the work. 
>> Otherwise, it's already scheduled (or another client is in the process 
>> of scheduling it) and you just return. When the worker finishes (after 
>> confirming a successful reset), it resets the counter to 0, so the 
>> next client requesting a reset will schedule the worker again.
>>
>> Regards,
>>   Felix
>>
>>
>>>
>>>
>>>>
>>>>
>>>> Additional to that keep in mind that you can't allocate any memory 
>>>> before or during the GPU reset nor wait for the reset to complete 
>>>> (so you can't allocate anything on the stack either).
>>>
>>>
>>> There is no dynamic allocation here, regarding stack allocations - we 
>>> do it all the time when we call functions, even during GPU resets, 
>>> how on stack allocation of work struct in amdgpu_device_gpu_recover 
>>> is different from any other local variable we allocate in any 
>>> function we call ?
>>>
>>> I am also not sure why it's not allowed to wait for reset to complete 
>>> ? Also, see in amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - 
>>> the caller expects the reset to complete before he returns. I can 
>>> probably work around it in RAS code by calling 
>>> atomic_set(&ras->in_recovery, 0)  from some callback within actual 
>>> reset function but regarding sysfs it actually expects a result 
>>> returned indicating whether the call was successful or not.
>>>
>>> Andrey
>>>
>>>
>>>>
>>>> I don't think that concept you try here will work.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Also in general seems to me it's cleaner approach where this logic 
>>>>> (the work items) are held and handled in reset_domain and are not 
>>>>> split in each adev or any other entity. We might want in the future 
>>>>> to even move the scheduler handling into reset domain since reset 
>>>>> domain is supposed to be a generic things and not only or AMD.
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>> ---
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 11 +---
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  3 +
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 73 
>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  3 +-
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  7 ++-
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  7 ++-
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  7 ++-
>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>     #define MAX_GPU_INSTANCE        16
>>>>>>>>>   @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>   };
>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>> -};
>>>>>>>>> -
>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>       u32 codec_type;
>>>>>>>>>       u32 max_width;
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>       }
>>>>>>>>>         tmp_vram_lost_counter = 
>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>> +
>>>>>>>>> +    /* Drop all pending resets since we will reset now anyway */
>>>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, struct 
>>>>>>>>> amdgpu_device,
>>>>>>>>> +                        reset_list);
>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>> +
>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>       /* Host driver will handle XGMI hive reset for SRIOV */
>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>   }
>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>> -    struct work_struct base;
>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>       int ret;
>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>>>>>     static void amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>> work_struct *work)
>>>>>>>>>   {
>>>>>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base);
>>>>>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>>> base.base.work);
>>>>>>>>>         recover_work->ret = 
>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>> recover_work->job);
>>>>>>>>>   }
>>>>>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct 
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>   {
>>>>>>>>>       struct amdgpu_recover_work_struct work = {.adev = adev, 
>>>>>>>>> .job = job};
>>>>>>>>>   -    INIT_WORK(&work.base, 
>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>> +    INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>> +    INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>         if (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>> &work.base))
>>>>>>>>>           return -EAGAIN;
>>>>>>>>>   -    flush_work(&work.base);
>>>>>>>>> +    flush_delayed_work(&work.base.base);
>>>>>>>>> +
>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>> &work.base);
>>>>>>>>>         return work.ret;
>>>>>>>>>   }
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>       atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>       init_rwsem(&reset_domain->sem);
>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>> +    mutex_init(&reset_domain->reset_lock);
>>>>>>>>> +
>>>>>>>>>       return reset_domain;
>>>>>>>>>   }
>>>>>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>> +
>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>> +#include <linux/list.h>
>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>> +
>>>>>>>>> +struct amdgpu_device;
>>>>>>>>> +struct amdgpu_job;
>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>> +
>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>   };
>>>>>>>>>   +
>>>>>>>>> +enum amd_reset_method {
>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>       unsigned long flags;
>>>>>>>>>   };
>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>> +
>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>       struct list_head handler_list;
>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>       XGMI_HIVE
>>>>>>>>>   };
>>>>>>>>>   +
>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>> +    struct delayed_work base;
>>>>>>>>> +    struct list_head node;
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>       struct kref refcount;
>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>> +
>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>   };
>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>>>>>>>>>   }
>>>>>>>>>     static inline bool amdgpu_reset_domain_schedule(struct 
>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>> -                        struct work_struct *work)
>>>>>>>>> +                        struct amdgpu_reset_work_struct *work)
>>>>>>>>>   {
>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>> +
>>>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>>>>>>>> +        mutex_unlock(&domain->reset_lock);
>>>>>>>>> +        return false;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>>> +
>>>>>>>>> +    return true;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static inline void 
>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>>>>>> +{
>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>> +    list_del_init(&work->node);
>>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>> +{
>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>> +
>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>> +    list_for_each_entry_safe(entry, tmp, 
>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>> +
>>>>>>>>> +        list_del_init(&entry->node);
>>>>>>>>> +
>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>> +        cancel_delayed_work(&entry->base);
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>>>   }
>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS  (1 << 0) /* vBIOS is 
>>>>>>>>> sr-iov ready */
>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov is 
>>>>>>>>> enabled on this GPU */
>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>       uint32_t            reg_val_offs;
>>>>>>>>>       struct amdgpu_irq_src        ack_irq;
>>>>>>>>>       struct amdgpu_irq_src        rcv_irq;
>>>>>>>>> -    struct work_struct        flr_work;
>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>       struct amdgpu_mm_table        mm_table;
>>>>>>>>>       const struct amdgpu_virt_ops    *ops;
>>>>>>>>>       struct amdgpu_vf_error_buffer    vf_errors;
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>>>>>>>>>   {
>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>           return r;
>>>>>>>>>       }
>>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>         return 0;
>>>>>>>>>   }
>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>   {
>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>> +
>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>   }
>>>>>>>>>     static int xgpu_ai_request_init_data(struct amdgpu_device 
>>>>>>>>> *adev)
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>>>>>>>>>   {
>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>           return r;
>>>>>>>>>       }
>>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>         return 0;
>>>>>>>>>   }
>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>   {
>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>> +
>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>   }
>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>>>>>>>>>   {
>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>           return r;
>>>>>>>>>       }
>>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>         return 0;
>>>>>>>>>   }
>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>   {
>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>> +
>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>   }
>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>
>>>>>>
>>>>

Regards,
-- 
Luben

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-05 21:47               ` Andrey Grodzovsky
  2022-05-06  5:41                 ` Luben Tuikov
@ 2022-05-06  6:02                 ` Lazar, Lijo
  2022-05-06  8:56                   ` Christian König
  1 sibling, 1 reply; 40+ messages in thread
From: Lazar, Lijo @ 2022-05-06  6:02 UTC (permalink / raw)
  To: Andrey Grodzovsky, Felix Kuehling, Christian König, amd-gfx; +Cc: Bai Zoy



On 5/6/2022 3:17 AM, Andrey Grodzovsky wrote:
> 
> On 2022-05-05 15:49, Felix Kuehling wrote:
>>
>> Am 2022-05-05 um 14:57 schrieb Andrey Grodzovsky:
>>>
>>> On 2022-05-05 11:06, Christian König wrote:
>>>> Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky:
>>>>>
>>>>> On 2022-05-05 09:23, Christian König wrote:
>>>>>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky:
>>>>>>> On 2022-05-05 06:09, Christian König wrote:
>>>>>>>
>>>>>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky:
>>>>>>>>> Problem:
>>>>>>>>> During hive reset caused by a command timing out on a ring,
>>>>>>>>> extra resets are triggered by KFD, which is
>>>>>>>>> unable to access registers on the resetting ASIC.
>>>>>>>>>
>>>>>>>>> Fix: Rework GPU reset to use a list of pending reset jobs
>>>>>>>>> such that the first reset job that actually resets the entire
>>>>>>>>> reset domain will cancel all those pending redundant resets.
>>>>>>>>>
>>>>>>>>> This is in line with what we already do for redundant TDRs
>>>>>>>>> in scheduler code.
>>>>>>>>
>>>>>>>> Mhm, why exactly do you need the extra linked list then?
>>>>>>>>
>>>>>>>> Let's talk about that on our call today.
>>>>>>>
>>>>>>>
>>>>>>> Going to miss it as you know, and also this is the place to 
>>>>>>> discuss technical questions anyway so -
>>>>>>
>>>>>> Good point.
>>>>>>
>>>>>>> It's needed because those other resets are not time out handlers 
>>>>>>> that are governed by the scheduler
>>>>>>> but rather external resets that are triggered by such clients as 
>>>>>>> KFD, RAS and sysfs. Scheduler has no
>>>>>>> knowledge of them (and should not have) but they are serialized 
>>>>>>> into same wq as the TO handlers
>>>>>>> from the scheduler. It just happens that TO triggered reset 
>>>>>>> causes in turn another reset (from KFD in
>>>>>>> this case) and we want to prevent this second reset from taking 
>>>>>>> place just as we want to avoid multiple
>>>>>>> TO resets to take place in scheduler code.
>>>>>>
>>>>>> Yeah, but why do you need multiple workers?
>>>>>>
>>>>>> You have a single worker for the GPU reset not triggered by the 
>>>>>> scheduler in you adev and cancel that at the end of the reset 
>>>>>> procedure.
>>>>>>
>>>>>> If anybody things it needs to trigger another reset while in reset 
>>>>>> (which is actually a small design bug separately) the reset will 
>>>>>> just be canceled in the same way we cancel the scheduler resets.
>>>>>>
>>>>>> Christian.
>>>>>
>>>>>
>>>>> Had this in mind at first but then I realized that each client 
>>>>> (RAS, KFD and sysfs) will want to fill his own data for the work 
>>>>> (see amdgpu_device_gpu_recover) - for XGMI hive each will want to 
>>>>> set his own adev (which is fine if you set a work per adev as you 
>>>>> suggest) but also each client might want (they all put NULL there 
>>>>> but in theory in the future) also set his own bad job value and 
>>>>> here you might have a collision.
>>>>
>>>> Yeah, but that is intentional. See when we have a job that needs to 
>>>> be consumed by the reset handler and not overwritten or something.
>>>
>>>
>>> I am not sure why this is a requirement, multiple clients can decide 
>>> concurrently to trigger a reset for some reason (possibly independent 
>>> reasons) hence they cannot share same work struct to pass to it their 
>>> data.
>>
>> If those concurrent clients could detect that a reset was already in 
>> progress, you wouldn't need the complexity of multiple work structs 
>> being scheduled. You could simply return without triggering another 
>> reset.
> 
> 
> In my view main problem here with single work struct either at reset 
> domain level or even adev level is that in some cases we optimize resets 
> and don't really perform ASIC HW reset (see amdgpu_job_timedout with 
> soft recovery and skip_hw_reset in amdgpu_device_gpu_recover_imp for the 
> case the bad job does get signaled just before we start HW reset and we 
> just skip this). You can see that if many different reset sources share 
> same work struct what can happen is that the first to obtain the lock 
> you describe below might opt out from full HW reset because his bad job 
> did signal for example or because his hung IP block was able to 
> recover through SW reset but in the meantime another reset source who 
> needed an actual HW reset just silently returned and we end up with 
> unhandled reset request. True that today this happens only to job 
> timeout reset sources that are handled form within the scheduler and 
> won't use this single work struct but no one prevents a future case for 
> this to happen and also, if we actually want to unify scheduler time out 
> handlers within reset domain (which seems to me the right design 
> approach) we won't be able to use just one work struct for this reason 
> anyway.
> 

Just to add to this point - a reset domain is co-operative domain. In 
addition to sharing a set of clients from various reset sources for one 
device, it also will have a set of devices like in XGMI hive. The job 
timeout on one device may not eventually result in a reset, but a RAS 
error happening on another device at the same time would need a reset. 
The second device's RAS error cannot return seeing that a reset work 
already started, or ignore the reset work given that another device has 
filled the reset data.

When there is a reset domain, it should take care of the work scheduled 
and keeping it in device or any other level doesn't sound good.

Thanks,
Lijo

> Andrey
> 
> 
>>
>> I'd put the reset work struct into the reset_domain struct. That way 
>> you'd have exactly one worker for the reset domain. You could 
>> implement a lock-less scheme to decide whether you need to schedule a 
>> reset, e.g. using an atomic counter in the shared work struct that 
>> gets incremented when a client wants to trigger a reset 
>> (atomic_add_return). If that counter is exactly 1 after incrementing, 
>> you need to fill in the rest of the work struct and schedule the work. 
>> Otherwise, it's already scheduled (or another client is in the process 
>> of scheduling it) and you just return. When the worker finishes (after 
>> confirming a successful reset), it resets the counter to 0, so the 
>> next client requesting a reset will schedule the worker again.
>>
>> Regards,
>>   Felix
>>
>>
>>>
>>>
>>>>
>>>>
>>>> Additional to that keep in mind that you can't allocate any memory 
>>>> before or during the GPU reset nor wait for the reset to complete 
>>>> (so you can't allocate anything on the stack either).
>>>
>>>
>>> There is no dynamic allocation here, regarding stack allocations - we 
>>> do it all the time when we call functions, even during GPU resets, 
>>> how on stack allocation of work struct in amdgpu_device_gpu_recover 
>>> is different from any other local variable we allocate in any 
>>> function we call ?
>>>
>>> I am also not sure why it's not allowed to wait for reset to complete 
>>> ? Also, see in amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - 
>>> the caller expects the reset to complete before he returns. I can 
>>> probably work around it in RAS code by calling 
>>> atomic_set(&ras->in_recovery, 0)  from some callback within actual 
>>> reset function but regarding sysfs it actually expects a result 
>>> returned indicating whether the call was successful or not.
>>>
>>> Andrey
>>>
>>>
>>>>
>>>> I don't think that concept you try here will work.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Also in general seems to me it's cleaner approach where this logic 
>>>>> (the work items) are held and handled in reset_domain and are not 
>>>>> split in each adev or any other entity. We might want in the future 
>>>>> to even move the scheduler handling into reset domain since reset 
>>>>> domain is supposed to be a generic thing and not only for AMD.
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>> ---
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 11 +---
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  3 +
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 73 
>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  3 +-
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  7 ++-
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  7 ++-
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  7 ++-
>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>     #define MAX_GPU_INSTANCE        16
>>>>>>>>>   @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>   };
>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>> -};
>>>>>>>>> -
>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>       u32 codec_type;
>>>>>>>>>       u32 max_width;
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>       }
>>>>>>>>>         tmp_vram_lost_counter = 
>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>> +
>>>>>>>>> +    /* Drop all pending resets since we will reset now anyway */
>>>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, struct 
>>>>>>>>> amdgpu_device,
>>>>>>>>> +                        reset_list);
>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>> +
>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>       /* Host driver will handle XGMI hive reset for SRIOV */
>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>   }
>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>> -    struct work_struct base;
>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>       int ret;
>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>>>>>     static void amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>> work_struct *work)
>>>>>>>>>   {
>>>>>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base);
>>>>>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>>> base.base.work);
>>>>>>>>>         recover_work->ret = 
>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>> recover_work->job);
>>>>>>>>>   }
>>>>>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct 
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>   {
>>>>>>>>>       struct amdgpu_recover_work_struct work = {.adev = adev, 
>>>>>>>>> .job = job};
>>>>>>>>>   -    INIT_WORK(&work.base, 
>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>> +    INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>> +    INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>         if (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>> &work.base))
>>>>>>>>>           return -EAGAIN;
>>>>>>>>>   -    flush_work(&work.base);
>>>>>>>>> +    flush_delayed_work(&work.base.base);
>>>>>>>>> +
>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>> &work.base);
>>>>>>>>>         return work.ret;
>>>>>>>>>   }
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>       atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>       init_rwsem(&reset_domain->sem);
>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>> +    mutex_init(&reset_domain->reset_lock);
>>>>>>>>> +
>>>>>>>>>       return reset_domain;
>>>>>>>>>   }
>>>>>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>> +
>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>> +#include <linux/list.h>
>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>> +
>>>>>>>>> +struct amdgpu_device;
>>>>>>>>> +struct amdgpu_job;
>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>> +
>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>   };
>>>>>>>>>   +
>>>>>>>>> +enum amd_reset_method {
>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>       unsigned long flags;
>>>>>>>>>   };
>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>> +
>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>       struct list_head handler_list;
>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>       XGMI_HIVE
>>>>>>>>>   };
>>>>>>>>>   +
>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>> +    struct delayed_work base;
>>>>>>>>> +    struct list_head node;
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>       struct kref refcount;
>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>> +
>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>   };
>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>>>>>>>>>   }
>>>>>>>>>     static inline bool amdgpu_reset_domain_schedule(struct 
>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>> -                        struct work_struct *work)
>>>>>>>>> +                        struct amdgpu_reset_work_struct *work)
>>>>>>>>>   {
>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>> +
>>>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>>>>>>>> +        mutex_unlock(&domain->reset_lock);
>>>>>>>>> +        return false;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>>> +
>>>>>>>>> +    return true;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static inline void 
>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>>>>>> +{
>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>> +    list_del_init(&work->node);
>>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>> +{
>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>> +
>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>> +    list_for_each_entry_safe(entry, tmp, 
>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>> +
>>>>>>>>> +        list_del_init(&entry->node);
>>>>>>>>> +
>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>> +        cancel_delayed_work(&entry->base);
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>>>   }
>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS  (1 << 0) /* vBIOS is 
>>>>>>>>> sr-iov ready */
>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov is 
>>>>>>>>> enabled on this GPU */
>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>       uint32_t            reg_val_offs;
>>>>>>>>>       struct amdgpu_irq_src        ack_irq;
>>>>>>>>>       struct amdgpu_irq_src        rcv_irq;
>>>>>>>>> -    struct work_struct        flr_work;
>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>       struct amdgpu_mm_table        mm_table;
>>>>>>>>>       const struct amdgpu_virt_ops    *ops;
>>>>>>>>>       struct amdgpu_vf_error_buffer    vf_errors;
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>>>>>>>>>   {
>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>           return r;
>>>>>>>>>       }
>>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>         return 0;
>>>>>>>>>   }
>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>   {
>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>> +
>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>   }
>>>>>>>>>     static int xgpu_ai_request_init_data(struct amdgpu_device 
>>>>>>>>> *adev)
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>>>>>>>>>   {
>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>           return r;
>>>>>>>>>       }
>>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>         return 0;
>>>>>>>>>   }
>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>   {
>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>> +
>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>   }
>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>>>>>>>>>   {
>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>           return r;
>>>>>>>>>       }
>>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>         return 0;
>>>>>>>>>   }
>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>   {
>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>> +
>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>   }
>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>
>>>>>>
>>>>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-06  6:02                 ` Lazar, Lijo
@ 2022-05-06  8:56                   ` Christian König
  2022-05-10 16:00                     ` Andrey Grodzovsky
  0 siblings, 1 reply; 40+ messages in thread
From: Christian König @ 2022-05-06  8:56 UTC (permalink / raw)
  To: Lazar, Lijo, Andrey Grodzovsky, Felix Kuehling,
	Christian König, amd-gfx
  Cc: Bai Zoy

Am 06.05.22 um 08:02 schrieb Lazar, Lijo:
> On 5/6/2022 3:17 AM, Andrey Grodzovsky wrote:
>>
>> On 2022-05-05 15:49, Felix Kuehling wrote:
>>>
>>> Am 2022-05-05 um 14:57 schrieb Andrey Grodzovsky:
>>>>
>>>> On 2022-05-05 11:06, Christian König wrote:
>>>>> Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky:
>>>>>>
>>>>>> On 2022-05-05 09:23, Christian König wrote:
>>>>>>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky:
>>>>>>>> On 2022-05-05 06:09, Christian König wrote:
>>>>>>>>
>>>>>>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky:
>>>>>>>>>> Problem:
>>>>>>>>>> During hive reset caused by a command timing out on a ring,
>>>>>>>>>> extra resets are triggered by KFD, which is
>>>>>>>>>> unable to access registers on the resetting ASIC.
>>>>>>>>>>
>>>>>>>>>> Fix: Rework GPU reset to use a list of pending reset jobs
>>>>>>>>>> such that the first reset job that actually resets the entire
>>>>>>>>>> reset domain will cancel all those pending redundant resets.
>>>>>>>>>>
>>>>>>>>>> This is in line with what we already do for redundant TDRs
>>>>>>>>>> in scheduler code.
>>>>>>>>>
>>>>>>>>> Mhm, why exactly do you need the extra linked list then?
>>>>>>>>>
>>>>>>>>> Let's talk about that on our call today.
>>>>>>>>
>>>>>>>>
>>>>>>>> Going to miss it as you know, and also this is the place to 
>>>>>>>> discuss technical questions anyway so -
>>>>>>>
>>>>>>> Good point.
>>>>>>>
>>>>>>>> It's needed because those other resets are not time out 
>>>>>>>> handlers that are governed by the scheduler
>>>>>>>> but rather external resets that are triggered by such clients 
>>>>>>>> as KFD, RAS and sysfs. Scheduler has no
>>>>>>>> knowledge of them (and should not have) but they are serialized 
>>>>>>>> into same wq as the TO handlers
>>>>>>>> from the scheduler. It just happens that TO triggered reset 
>>>>>>>> causes in turn another reset (from KFD in
>>>>>>>> this case) and we want to prevent this second reset from taking 
>>>>>>>> place just as we want to avoid multiple
>>>>>>>> TO resets to take place in scheduler code.
>>>>>>>
>>>>>>> Yeah, but why do you need multiple workers?
>>>>>>>
>>>>>>> You have a single worker for the GPU reset not triggered by the 
>>>>>>> scheduler in you adev and cancel that at the end of the reset 
>>>>>>> procedure.
>>>>>>>
>>>>>>> If anybody things it needs to trigger another reset while in 
>>>>>>> reset (which is actually a small design bug separately) the 
>>>>>>> reset will just be canceled in the same way we cancel the 
>>>>>>> scheduler resets.
>>>>>>>
>>>>>>> Christian.
>>>>>>
>>>>>>
>>>>>> Had this in mind at first but then I realized that each client 
>>>>>> (RAS, KFD and sysfs) will want to fill his own data for the work 
>>>>>> (see amdgpu_device_gpu_recover) - for XGMI hive each will want to 
>>>>>> set his own adev (which is fine if you set a work per adev as you 
>>>>>> suggest) but also each client might want (they all put NULL there 
>>>>>> but in theory in the future) also set his own bad job value and 
>>>>>> here you might have a collision.
>>>>>
>>>>> Yeah, but that is intentional. See when we have a job that needs 
>>>>> to be consumed by the reset handler and not overwritten or something.
>>>>
>>>>
>>>> I am not sure why this is a requirement, multiple clients can 
>>>> decide concurrently to trigger a reset for some reason (possibly 
>>>> independent reasons) hence they cannot share same work struct to 
>>>> pass to it their data.
>>>
>>> If those concurrent clients could detect that a reset was already in 
>>> progress, you wouldn't need the complexity of multiple work structs 
>>> being scheduled. You could simply return without triggering another 
>>> reset.
>>
>>
>> In my view main problem here with single work struct either at reset 
>> domain level or even adev level is that in some cases we optimize 
>> resets and don't really perform ASIC HW reset (see 
>> amdgpu_job_timedout with soft recovery and skip_hw_reset in 
>> amdgpu_device_gpu_recover_imp for the case the bad job does get 
>> signaled just before we start HW reset and we just skip this).

That's one of the reasons why we should have multiple work items for job 
based reset and other reset sources.

See the whole idea is the following:
1. We have one single queued work queue for each reset domain which 
makes sure that all reset requests execute in order.
2. We have one delayed work item for each scheduler which fires when a 
timeout on a scheduler occurs and eventually calls the reset procedure 
with the last running job.
3. We have one work item for each necessary hard reset.

The delayed work item from the scheduler first tries a soft recovery and 
checks if a hard reset is really necessary. If it's not necessary and we 
can cancel the offending job we skip the hard reset.

The hard reset work item doesn't do any of those checks and just does a 
reset no matter what.

When we really do a reset, independent if its triggered by a job or 
other source we cancel all sources at the end of the reset procedure.

This makes sure that a) We only do one reset even when multiple sources 
fire at the same time and b) when any source bails out and only does a 
soft recovery we do a full reset anyway when necessary.

That design was outlined multiple times now on the mailing list and 
looks totally clear to me. We should probably document that somewhere.

Regards,
Christian.

>> You can see that if many different reset sources share same work 
>> struct what can happen is that the first to obtain the lock you 
>> describe below might opt out from full HW reset because his bad job 
>> did signal for example or because his hung IP block was able to 
>> recover through SW reset but in the meantime another reset source who 
>> needed an actual HW reset just silently returned and we end up with 
>> unhandled reset request. True that today this happens only to job 
>> timeout reset sources that are handled form within the scheduler and 
>> won't use this single work struct but no one prevents a future case 
>> for this to happen and also, if we actually want to unify scheduler 
>> time out handlers within reset domain (which seems to me the right 
>> design approach) we won't be able to use just one work struct for 
>> this reason anyway.
>>
>
> Just to add to this point - a reset domain is co-operative domain. In 
> addition to sharing a set of clients from various reset sources for 
> one device, it also will have a set of devices like in XGMI hive. The 
> job timeout on one device may not eventually result in a reset, but a 
> RAS error happening on another device at the same time would need a 
> reset. The second device's RAS error cannot return seeing that a 
> reset work already started, or ignore the reset work given that 
> another device has filled the reset data.
>
> When there is a reset domain, it should take care of the work 
> scheduled and keeping it in device or any other level doesn't sound good.
>
> Thanks,
> Lijo
>
>> Andrey
>>
>>
>>>
>>> I'd put the reset work struct into the reset_domain struct. That way 
>>> you'd have exactly one worker for the reset domain. You could 
>>> implement a lock-less scheme to decide whether you need to schedule 
>>> a reset, e.g. using an atomic counter in the shared work struct that 
>>> gets incremented when a client wants to trigger a reset 
>>> (atomic_add_return). If that counter is exactly 1 after 
>>> incrementing, you need to fill in the rest of the work struct and 
>>> schedule the work. Otherwise, it's already scheduled (or another 
>>> client is in the process of scheduling it) and you just return. When 
>>> the worker finishes (after confirming a successful reset), it resets 
>>> the counter to 0, so the next client requesting a reset will 
>>> schedule the worker again.
>>>
>>> Regards,
>>>   Felix
>>>
>>>
>>>>
>>>>
>>>>>
>>>>>
>>>>> Additional to that keep in mind that you can't allocate any memory 
>>>>> before or during the GPU reset nor wait for the reset to complete 
>>>>> (so you can't allocate anything on the stack either).
>>>>
>>>>
>>>> There is no dynamic allocation here, regarding stack allocations - 
>>>> we do it all the time when we call functions, even during GPU 
>>>> resets, how on stack allocation of work struct in 
>>>> amdgpu_device_gpu_recover is different from any other local 
>>>> variable we allocate in any function we call ?
>>>>
>>>> I am also not sure why it's not allowed to wait for reset to 
>>>> complete ? Also, see in amdgpu_ras_do_recovery and gpu_recover_get 
>>>> (debugfs) - the caller expects the reset to complete before he 
>>>> returns. I can probably work around it in RAS code by calling 
>>>> atomic_set(&ras->in_recovery, 0) from some callback within actual 
>>>> reset function but regarding sysfs it actually expects a result 
>>>> returned indicating whether the call was successful or not.
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>> I don't think that concept you try here will work.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> Also in general seems to me it's cleaner approach where this 
>>>>>> logic (the work items) are held and handled in reset_domain and 
>>>>>> are not split in each adev or any other entity. We might want in 
>>>>>> the future to even move the scheduler handling into reset domain 
>>>>>> since reset domain is supposed to be a generic things and not 
>>>>>> only or AMD.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>> ---
>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 11 +---
>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 3 +
>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 73 
>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   | 3 +-
>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      | 7 ++-
>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      | 7 ++-
>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      | 7 ++-
>>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>     #define MAX_GPU_INSTANCE        16
>>>>>>>>>>   @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>   };
>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>>> -};
>>>>>>>>>> -
>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>       u32 max_width;
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>       }
>>>>>>>>>>         tmp_vram_lost_counter = 
>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>> +
>>>>>>>>>> +    /* Drop all pending resets since we will reset now 
>>>>>>>>>> anyway */
>>>>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, struct 
>>>>>>>>>> amdgpu_device,
>>>>>>>>>> +                        reset_list);
>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>> +
>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>       /* Host driver will handle XGMI hive reset for SRIOV */
>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>   }
>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>       int ret;
>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>>>>>>     static void amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>> work_struct *work)
>>>>>>>>>>   {
>>>>>>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base);
>>>>>>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>>>> base.base.work);
>>>>>>>>>>         recover_work->ret = 
>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>> recover_work->job);
>>>>>>>>>>   }
>>>>>>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct 
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>   {
>>>>>>>>>>       struct amdgpu_recover_work_struct work = {.adev = adev, 
>>>>>>>>>> .job = job};
>>>>>>>>>>   -    INIT_WORK(&work.base, 
>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>> +    INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>> +    INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>         if (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>> &work.base))
>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>   -    flush_work(&work.base);
>>>>>>>>>> +    flush_delayed_work(&work.base.base);
>>>>>>>>>> +
>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>> &work.base);
>>>>>>>>>>         return work.ret;
>>>>>>>>>>   }
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>       init_rwsem(&reset_domain->sem);
>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>> +
>>>>>>>>>>       return reset_domain;
>>>>>>>>>>   }
>>>>>>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>> +
>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>> +
>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>> +
>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>   };
>>>>>>>>>>   +
>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>>> +};
>>>>>>>>>> +
>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>   };
>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>> +
>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>   };
>>>>>>>>>>   +
>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>> +    struct list_head node;
>>>>>>>>>> +};
>>>>>>>>>> +
>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>> +
>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>   };
>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>>>>>>>>>>   }
>>>>>>>>>>     static inline bool amdgpu_reset_domain_schedule(struct 
>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>> -                        struct work_struct *work)
>>>>>>>>>> +                        struct amdgpu_reset_work_struct *work)
>>>>>>>>>>   {
>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>>> +
>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>> +        return false;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>>>> +
>>>>>>>>>> +    return true;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static inline void 
>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>>>>>>> +{
>>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>>> +    list_del_init(&work->node);
>>>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>> +{
>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>> +
>>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>>> +    list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>> +
>>>>>>>>>> +        list_del_init(&entry->node);
>>>>>>>>>> +
>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>>>>   }
>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS  (1 << 0) /* vBIOS 
>>>>>>>>>> is sr-iov ready */
>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov 
>>>>>>>>>> is enabled on this GPU */
>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>       uint32_t            reg_val_offs;
>>>>>>>>>>       struct amdgpu_irq_src        ack_irq;
>>>>>>>>>>       struct amdgpu_irq_src        rcv_irq;
>>>>>>>>>> -    struct work_struct        flr_work;
>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>       struct amdgpu_mm_table        mm_table;
>>>>>>>>>>       const struct amdgpu_virt_ops    *ops;
>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct work_struct 
>>>>>>>>>> *work)
>>>>>>>>>>   {
>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>           return r;
>>>>>>>>>>       }
>>>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>         return 0;
>>>>>>>>>>   }
>>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>   {
>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>> +
>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>   }
>>>>>>>>>>     static int xgpu_ai_request_init_data(struct amdgpu_device 
>>>>>>>>>> *adev)
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct work_struct 
>>>>>>>>>> *work)
>>>>>>>>>>   {
>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>           return r;
>>>>>>>>>>       }
>>>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>         return 0;
>>>>>>>>>>   }
>>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>   {
>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>> +
>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>   }
>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct work_struct 
>>>>>>>>>> *work)
>>>>>>>>>>   {
>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>           return r;
>>>>>>>>>>       }
>>>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>         return 0;
>>>>>>>>>>   }
>>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>   {
>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>> +
>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>   }
>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>>
>>>>>>>
>>>>>


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-06  8:56                   ` Christian König
@ 2022-05-10 16:00                     ` Andrey Grodzovsky
  2022-05-10 16:17                       ` Christian König
  0 siblings, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-10 16:00 UTC (permalink / raw)
  To: Christian König, Lazar, Lijo, Felix Kuehling,
	Christian König, amd-gfx
  Cc: Bai Zoy


On 2022-05-06 04:56, Christian König wrote:
> Am 06.05.22 um 08:02 schrieb Lazar, Lijo:
>> On 5/6/2022 3:17 AM, Andrey Grodzovsky wrote:
>>>
>>> On 2022-05-05 15:49, Felix Kuehling wrote:
>>>>
>>>> Am 2022-05-05 um 14:57 schrieb Andrey Grodzovsky:
>>>>>
>>>>> On 2022-05-05 11:06, Christian König wrote:
>>>>>> Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky:
>>>>>>>
>>>>>>> On 2022-05-05 09:23, Christian König wrote:
>>>>>>>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky:
>>>>>>>>> On 2022-05-05 06:09, Christian König wrote:
>>>>>>>>>
>>>>>>>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky:
>>>>>>>>>>> Problem:
>>>>>>>>>>> During a hive reset caused by a command timing out on a ring,
>>>>>>>>>>> extra resets are triggered by KFD, which is
>>>>>>>>>>> unable to access registers on the resetting ASIC.
>>>>>>>>>>>
>>>>>>>>>>> Fix: Rework GPU reset to use a list of pending reset jobs
>>>>>>>>>>> such that the first reset job that actually resets the entire
>>>>>>>>>>> reset domain will cancel all those pending redundant resets.
>>>>>>>>>>>
>>>>>>>>>>> This is in line with what we already do for redundant TDRs
>>>>>>>>>>> in scheduler code.
>>>>>>>>>>
>>>>>>>>>> Mhm, why exactly do you need the extra linked list then?
>>>>>>>>>>
>>>>>>>>>> Let's talk about that on our call today.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Going to miss it as you know, and also this is the place to 
>>>>>>>>> discuss technical questions anyway so -
>>>>>>>>
>>>>>>>> Good point.
>>>>>>>>
>>>>>>>>> It's needed because those other resets are not time out 
>>>>>>>>> handlers that are governed by the scheduler
>>>>>>>>> but rather external resets that are triggered by such clients 
>>>>>>>>> as KFD, RAS and sysfs. Scheduler has no
>>>>>>>>> knowledge of them (and should not have) but they are 
>>>>>>>>> serialized into same wq as the TO handlers
>>>>>>>>> from the scheduler. It just happens that TO triggered reset 
>>>>>>>>> causes in turn another reset (from KFD in
>>>>>>>>> this case) and we want to prevent this second reset from 
>>>>>>>>> taking place just as we want to avoid multiple
>>>>>>>>> TO resets to take place in scheduler code.
>>>>>>>>
>>>>>>>> Yeah, but why do you need multiple workers?
>>>>>>>>
>>>>>>>> You have a single worker for the GPU reset not triggered by the 
>>>>>>>> scheduler in you adev and cancel that at the end of the reset 
>>>>>>>> procedure.
>>>>>>>>
>>>>>>>> If anybody things it needs to trigger another reset while in 
>>>>>>>> reset (which is actually a small design bug separately) the 
>>>>>>>> reset will just be canceled in the same way we cancel the 
>>>>>>>> scheduler resets.
>>>>>>>>
>>>>>>>> Christian.
>>>>>>>
>>>>>>>
>>>>>>> Had this in mind at first but then I realized that each client 
>>>>>>> (RAS, KFD and sysfs) will want to fill his own data for the work 
>>>>>>> (see amdgpu_device_gpu_recover) - for XGMI hive each will want 
>>>>>>> to set his own adev (which is fine if you set a work per adev as 
>>>>>>> you suggest) but also each client might want (they all put NULL 
>>>>>>> there but in theory in the future) also set his own bad job 
>>>>>>> value and here you might have a collision.
>>>>>>
>>>>>> Yeah, but that is intentional. See when we have a job that needs 
>>>>>> to be consumed by the reset handler and not overwritten or 
>>>>>> something.
>>>>>
>>>>>
>>>>> I am not sure why this is a requirement, multiple clients can 
>>>>> decide concurrently to trigger a reset for some reason (possibly 
>>>>> independent reasons) hence they cannot share same work struct to 
>>>>> pass to it their data.
>>>>
>>>> If those concurrent clients could detect that a reset was already 
>>>> in progress, you wouldn't need the complexity of multiple work 
>>>> structs being scheduled. You could simply return without triggering 
>>>> another reset.
>>>
>>>
>>> In my view main problem here with single work struct either at reset 
>>> domain level or even adev level is that in some cases we optimize 
>>> resets and don't really perform ASIC HW reset (see 
>>> amdgpu_job_timedout with soft recovery and skip_hw_reset in 
>>> amdgpu_device_gpu_recover_imp for the case the bad job does get 
>>> signaled just before we start HW reset and we just skip this).
>
> That's one of the reasons why we should have multiple work items for 
> job based reset and other reset sources.
>
> See the whole idea is the following:
> 1. We have one single queued work queue for each reset domain which 
> makes sure that all reset requests execute in order.
> 2. We have one delayed work item for each scheduler which fires when a 
> timeout on a scheduler occurs and eventually calls the reset procedure 
> with the last running job.
> 3. We have one work item for each necessary hard reset.
>
> The delayed work item from the scheduler first tries a soft recovery 
> and checks if a hard reset is really necessary. If it's not necessary 
> and we can cancel the offending job we skip the hard reset.
>
> The hard reset work item doesn't do any of those checks and just does 
> a reset no matter what.
>
> When we really do a reset, independent if its triggered by a job or 
> other source we cancel all sources at the end of the reset procedure.
>
> This makes sure that a) We only do one reset even when multiple 
> sources fire at the same time and b) when any source bails out and 
> only does a soft recovery we do a full reset anyway when necessary.
>
> That design was outlined multiple times now on the mailing list and 
> looks totally clear to me. We should probably document that somewhere.


If you look at the patch, what you described above is exactly what is 
happening - since the scheduler's delayed work is different from any 
non-scheduler delayed work, the SW reset which might take place from 
the scheduler's reset 
will not have any impact on any non-scheduler delayed work and will not 
cancel it. In case the scheduler actually reaches the point of HW 
reset, it will cancel out all pending reset works from any other sources 
on the same 
reset domain. A non-scheduler reset will always proceed to do a full HW 
reset and will cancel any other pending resets.

The only difference is that I chose to do the canceling right BEFORE the HW 
reset and not AFTER. I did this because I see a possible race where a 
new reset request is generated exactly after we finished the HW 
reset but before we canceled out all pending resets - in such a case you 
would not want to cancel this 'borderline new' reset request.

Andrey


>
> Regards,
> Christian.
>
>>> You can see that if many different reset sources share same work 
>>> struct what can happen is that the first to obtain the lock you 
>>> describe below might opt out from full HW reset because his bad job 
>>> did signal for example or because his hung IP block was able to 
>>> recover through SW reset but in the meantime another reset source 
>>> who needed an actual HW reset just silently returned and we end up 
>>> with unhandled reset request. True that today this happens only to 
>>> job timeout reset sources that are handled form within the scheduler 
>>> and won't use this single work struct but no one prevents a future 
>>> case for this to happen and also, if we actually want to unify 
>>> scheduler time out handlers within reset domain (which seems to me 
>>> the right design approach) we won't be able to use just one work 
>>> struct for this reason anyway.
>>>
>>
>> Just to add to this point - a reset domain is co-operative domain. In 
>> addition to sharing a set of clients from various reset sources for 
>> one device, it also will have a set of devices like in XGMI hive. The 
>> job timeout on one device may not eventually result in a reset, but a 
>> RAS error happening on another device at the same time would need a 
>> reset. The second device's RAS error cannot return seeing that a 
>> reset work already started, or ignore the reset work given that 
>> another device has filled the reset data.
>>
>> When there is a reset domain, it should take care of the work 
>> scheduled and keeping it in device or any other level doesn't sound 
>> good.
>>
>> Thanks,
>> Lijo
>>
>>> Andrey
>>>
>>>
>>>>
>>>> I'd put the reset work struct into the reset_domain struct. That 
>>>> way you'd have exactly one worker for the reset domain. You could 
>>>> implement a lock-less scheme to decide whether you need to schedule 
>>>> a reset, e.g. using an atomic counter in the shared work struct 
>>>> that gets incremented when a client wants to trigger a reset 
>>>> (atomic_add_return). If that counter is exactly 1 after 
>>>> incrementing, you need to fill in the rest of the work struct and 
>>>> schedule the work. Otherwise, it's already scheduled (or another 
>>>> client is in the process of scheduling it) and you just return. 
>>>> When the worker finishes (after confirming a successful reset), it 
>>>> resets the counter to 0, so the next client requesting a reset will 
>>>> schedule the worker again.
>>>>
>>>> Regards,
>>>>   Felix
>>>>
>>>>
>>>>>
>>>>>
>>>>>>
>>>>>>
>>>>>> Additional to that keep in mind that you can't allocate any 
>>>>>> memory before or during the GPU reset nor wait for the reset to 
>>>>>> complete (so you can't allocate anything on the stack either).
>>>>>
>>>>>
>>>>> There is no dynamic allocation here, regarding stack allocations - 
>>>>> we do it all the time when we call functions, even during GPU 
>>>>> resets, how on stack allocation of work struct in 
>>>>> amdgpu_device_gpu_recover is different from any other local 
>>>>> variable we allocate in any function we call ?
>>>>>
>>>>> I am also not sure why it's not allowed to wait for reset to 
>>>>> complete ? Also, see in amdgpu_ras_do_recovery and gpu_recover_get 
>>>>> (debugfs) - the caller expects the reset to complete before he 
>>>>> returns. I can probably work around it in RAS code by calling 
>>>>> atomic_set(&ras->in_recovery, 0) from some callback within actual 
>>>>> reset function but regarding sysfs it actually expects a result 
>>>>> returned indicating whether the call was successful or not.
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>> I don't think that concept you try here will work.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>> Also in general seems to me it's cleaner approach where this 
>>>>>>> logic (the work items) are held and handled in reset_domain and 
>>>>>>> are not split in each adev or any other entity. We might want in 
>>>>>>> the future to even move the scheduler handling into reset domain 
>>>>>>> since reset domain is supposed to be a generic things and not 
>>>>>>> only or AMD.
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>> ---
>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 11 +---
>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 3 +
>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 73 
>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   | 3 +-
>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      | 7 ++-
>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      | 7 ++-
>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      | 7 ++-
>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>     #define MAX_GPU_INSTANCE        16
>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>   };
>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>>>> -};
>>>>>>>>>>> -
>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>       }
>>>>>>>>>>>         tmp_vram_lost_counter = 
>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>> +
>>>>>>>>>>> +    /* Drop all pending resets since we will reset now 
>>>>>>>>>>> anyway */
>>>>>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, struct 
>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>> +                        reset_list);
>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>> +
>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>       /* Host driver will handle XGMI hive reset for SRIOV */
>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>   }
>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>       int ret;
>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>>>>>>>     static void amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>   {
>>>>>>>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base);
>>>>>>>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>>>>> base.base.work);
>>>>>>>>>>>         recover_work->ret = 
>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>   }
>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct 
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>   {
>>>>>>>>>>>       struct amdgpu_recover_work_struct work = {.adev = 
>>>>>>>>>>> adev, .job = job};
>>>>>>>>>>>   -    INIT_WORK(&work.base, 
>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>> +    INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>> +    INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>         if 
>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>   -    flush_work(&work.base);
>>>>>>>>>>> +    flush_delayed_work(&work.base.base);
>>>>>>>>>>> +
>>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>> &work.base);
>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>   }
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>       init_rwsem(&reset_domain->sem);
>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>> +
>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>   }
>>>>>>>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>> +
>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>> +
>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>> +
>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>   };
>>>>>>>>>>>   +
>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>>>> +};
>>>>>>>>>>> +
>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>   };
>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>> +
>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>   };
>>>>>>>>>>>   +
>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>> +};
>>>>>>>>>>> +
>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>> +
>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>   };
>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>>>>>>>>>>>   }
>>>>>>>>>>>     static inline bool amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>> -                        struct work_struct *work)
>>>>>>>>>>> +                        struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>   {
>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>>>> +
>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>> +        return false;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>>>>> +
>>>>>>>>>>> +    return true;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static inline void 
>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>>>>>>>> +{
>>>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>>>> +    list_del_init(&work->node);
>>>>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>> +{
>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>> +
>>>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>>>> +    list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>> +
>>>>>>>>>>> +        list_del_init(&entry->node);
>>>>>>>>>>> +
>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>   }
>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS  (1 << 0) /* vBIOS 
>>>>>>>>>>> is sr-iov ready */
>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov 
>>>>>>>>>>> is enabled on this GPU */
>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>       uint32_t            reg_val_offs;
>>>>>>>>>>>       struct amdgpu_irq_src        ack_irq;
>>>>>>>>>>>       struct amdgpu_irq_src        rcv_irq;
>>>>>>>>>>> -    struct work_struct        flr_work;
>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>       struct amdgpu_mm_table        mm_table;
>>>>>>>>>>>       const struct amdgpu_virt_ops    *ops;
>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct work_struct 
>>>>>>>>>>> *work)
>>>>>>>>>>>   {
>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>           return r;
>>>>>>>>>>>       }
>>>>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>         return 0;
>>>>>>>>>>>   }
>>>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>   {
>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>> +
>>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>   }
>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct work_struct 
>>>>>>>>>>> *work)
>>>>>>>>>>>   {
>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>           return r;
>>>>>>>>>>>       }
>>>>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>         return 0;
>>>>>>>>>>>   }
>>>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>   {
>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>> +
>>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>   }
>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct work_struct 
>>>>>>>>>>> *work)
>>>>>>>>>>>   {
>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, struct 
>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>           return r;
>>>>>>>>>>>       }
>>>>>>>>>>>   -    INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>         return 0;
>>>>>>>>>>>   }
>>>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>   {
>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>> +
>>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>   }
>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>>>
>>>>>>>>
>>>>>>
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-10 16:00                     ` Andrey Grodzovsky
@ 2022-05-10 16:17                       ` Christian König
  2022-05-10 17:01                         ` Andrey Grodzovsky
  0 siblings, 1 reply; 40+ messages in thread
From: Christian König @ 2022-05-10 16:17 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, Lazar, Lijo,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy

Am 10.05.22 um 18:00 schrieb Andrey Grodzovsky:
> [SNIP]
>> That's one of the reasons why we should have multiple work items for 
>> job based reset and other reset sources.
>>
>> See the whole idea is the following:
>> 1. We have one single queued work queue for each reset domain which 
>> makes sure that all reset requests execute in order.
>> 2. We have one delayed work item for each scheduler which fires when 
>> a timeout on a scheduler occurs and eventually calls the reset 
>> procedure with the last running job.
>> 3. We have one work item for each necessary hard reset.
>>
>> The delayed work item from the scheduler first tries a soft recovery 
>> and checks if a hard reset is really necessary. If it's not necessary 
>> and we can cancel the offending job we skip the hard reset.
>>
>> The hard reset work item doesn't do any of those checks and just does 
>> a reset no matter what.
>>
>> When we really do a reset, independent if its triggered by a job or 
>> other source we cancel all sources at the end of the reset procedure.
>>
>> This makes sure that a) We only do one reset even when multiple 
>> sources fire at the same time and b) when any source bails out and 
>> only does a soft recovery we do a full reset anyway when necessary.
>>
>> That design was outlined multiple times now on the mailing list and 
>> looks totally clear to me. We should probably document that somewhere.
>
>
> If you look at the patch what you described above is exactly what is 
> happening - since scheduler's delayed work is different from any non 
> scheduler delayed work the SW reset which might take place from 
> scheduler's reset
> will not have any impact on any non scheduler delayed work and will 
> not cancel them. In case the scheduler actually reaches the point of 
> HW reset it will cancel out all pending reset works from any other 
> sources on the same
> reset domain. Non scheduler reset will always proceed to do full HW 
> reset and will cancel any other pending resets.

Ok, but why do you then need that linked list? The number of reset sources 
should be static and not in any way dynamic.

See using the linked list sounds like you only wanted to cancel the 
reset sources raised so far which would not be correct as far as I can see.

>
> The only difference is I chose to do the canceling right BEFORE the HW 
> reset and not AFTER. I did this because I see a possible race where a 
> new reset request is being generated exactly after we finished the HW 
> reset but before we canceled out all pending resets - in such case you 
> would not want to cancel this 'border line new' reset request.

Why not? Any new reset request directly after a hardware reset is most 
likely just falsely generated by the reset itself.

Ideally I would cancel all sources after the reset, but before starting 
any new work.

Regards,
Christian.

>
>
> Andrey
>
>
>>
>> Regards,
>> Christian.
>>
>>>> You can see that if many different reset sources share same work 
>>>> struct what can happen is that the first to obtain the lock you 
>>>> describe below might opt out from full HW reset because his bad 
>>>> job did signal for example or because his hung IP block was able 
>>>> to recover through SW reset but in the meantime another reset 
>>>> source who needed an actual HW reset just silently returned and we 
>>>> end up with unhandled reset request. True that today this happens 
>>>> only to job timeout reset sources that are handled form within the 
>>>> scheduler and won't use this single work struct but no one prevents 
>>>> a future case for this to happen and also, if we actually want to 
>>>> unify scheduler time out handlers within reset domain (which seems 
>>>> to me the right design approach) we won't be able to use just one 
>>>> work struct for this reason anyway.
>>>>
>>>
>>> Just to add to this point - a reset domain is co-operative domain. 
>>> In addition to sharing a set of clients from various reset sources 
>>> for one device, it also will have a set of devices like in XGMI 
>>> hive. The job timeout on one device may not eventually result in a 
>>> reset, but a RAS error happening on another device at the same time 
>>> would need a reset. The second device's RAS error cannot return 
>>> seeing  that a reset work already started, or ignore the reset work 
>>> given that another device has filled the reset data.
>>>
>>> When there is a reset domain, it should take care of the work 
>>> scheduled and keeping it in device or any other level doesn't sound 
>>> good.
>>>
>>> Thanks,
>>> Lijo
>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>> I'd put the reset work struct into the reset_domain struct. That 
>>>>> way you'd have exactly one worker for the reset domain. You could 
>>>>> implement a lock-less scheme to decide whether you need to 
>>>>> schedule a reset, e.g. using an atomic counter in the shared work 
>>>>> struct that gets incremented when a client wants to trigger a 
>>>>> reset (atomic_add_return). If that counter is exactly 1 after 
>>>>> incrementing, you need to fill in the rest of the work struct and 
>>>>> schedule the work. Otherwise, it's already scheduled (or another 
>>>>> client is in the process of scheduling it) and you just return. 
>>>>> When the worker finishes (after confirming a successful reset), it 
>>>>> resets the counter to 0, so the next client requesting a reset 
>>>>> will schedule the worker again.
>>>>>
>>>>> Regards,
>>>>>   Felix
>>>>>
>>>>>
>>>>>>
>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> Additional to that keep in mind that you can't allocate any 
>>>>>>> memory before or during the GPU reset nor wait for the reset to 
>>>>>>> complete (so you can't allocate anything on the stack either).
>>>>>>
>>>>>>
>>>>>> There is no dynamic allocation here, regarding stack allocations 
>>>>>> - we do it all the time when we call functions, even during GPU 
>>>>>> resets, how on stack allocation of work struct in 
>>>>>> amdgpu_device_gpu_recover is different from any other local 
>>>>>> variable we allocate in any function we call ?
>>>>>>
>>>>>> I am also not sure why it's not allowed to wait for reset to 
>>>>>> complete ? Also, see in amdgpu_ras_do_recovery and 
>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to 
>>>>>> complete before he returns. I can probably work around it in RAS 
>>>>>> code by calling atomic_set(&ras->in_recovery, 0) from some 
>>>>>> callback within actual reset function but regarding sysfs it 
>>>>>> actually expects a result returned indicating whether the call 
>>>>>> was successful or not.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> I don't think that concept you try here will work.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>> Also in general seems to me it's cleaner approach where this 
>>>>>>>> logic (the work items) are held and handled in reset_domain and 
>>>>>>>> are not split in each adev or any other entity. We might want 
>>>>>>>> in the future to even move the scheduler handling into reset 
>>>>>>>> domain since reset domain is supposed to be a generic things 
>>>>>>>> and not only or AMD.
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>     #define MAX_GPU_INSTANCE        16
>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>   };
>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>> -};
>>>>>>>>>>>> -
>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>>       }
>>>>>>>>>>>>         tmp_vram_lost_counter = 
>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* Drop all pending resets since we will reset now 
>>>>>>>>>>>> anyway */
>>>>>>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, struct 
>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>> +                        reset_list);
>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>> +
>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset for SRIOV */
>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>>   }
>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>       int ret;
>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>>>>>>>>     static void amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>   {
>>>>>>>>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base);
>>>>>>>>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>>>>>> base.base.work);
>>>>>>>>>>>>         recover_work->ret = 
>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>   }
>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>   {
>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = {.adev = 
>>>>>>>>>>>> adev, .job = job};
>>>>>>>>>>>>   -    INIT_WORK(&work.base, 
>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>> +    INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>> +    INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>         if 
>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>   -    flush_work(&work.base);
>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>> +
>>>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>   }
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>       init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>> +
>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>   }
>>>>>>>>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>> +
>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>> +
>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>> +
>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>   };
>>>>>>>>>>>>   +
>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>> +};
>>>>>>>>>>>> +
>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>   };
>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>> +
>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>   };
>>>>>>>>>>>>   +
>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>> +};
>>>>>>>>>>>> +
>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>> +
>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>   };
>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>>>>>>>>>>>>   }
>>>>>>>>>>>>     static inline bool amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>> -                        struct work_struct *work)
>>>>>>>>>>>> +                        struct amdgpu_reset_work_struct 
>>>>>>>>>>>> *work)
>>>>>>>>>>>>   {
>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>>>>> +
>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>> +        return false;
>>>>>>>>>>>> +    }
>>>>>>>>>>>> +
>>>>>>>>>>>> +    list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>> +
>>>>>>>>>>>> +    return true;
>>>>>>>>>>>> +}
>>>>>>>>>>>> +
>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>>>>> +    list_del_init(&work->node);
>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>> +}
>>>>>>>>>>>> +
>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>> +
>>>>>>>>>>>> +    mutex_lock(&domain->reset_lock);
>>>>>>>>>>>> +    list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>> +
>>>>>>>>>>>> +        list_del_init(&entry->node);
>>>>>>>>>>>> +
>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>> +    }
>>>>>>>>>>>> +
>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>   }
>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS 
>>>>>>>>>>>> is sr-iov ready */
>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV   (1 << 1) /* sr-iov 
>>>>>>>>>>>> is enabled on this GPU */
>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>       uint32_t            reg_val_offs;
>>>>>>>>>>>>       struct amdgpu_irq_src        ack_irq;
>>>>>>>>>>>>       struct amdgpu_irq_src        rcv_irq;
>>>>>>>>>>>> -    struct work_struct        flr_work;
>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>       const struct amdgpu_virt_ops    *ops;
>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct work_struct 
>>>>>>>>>>>> *work)
>>>>>>>>>>>>   {
>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>           return r;
>>>>>>>>>>>>       }
>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>   }
>>>>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>   {
>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>> +
>>>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>   }
>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct work_struct 
>>>>>>>>>>>> *work)
>>>>>>>>>>>>   {
>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>           return r;
>>>>>>>>>>>>       }
>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>   }
>>>>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>   {
>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>> +
>>>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>   }
>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct work_struct 
>>>>>>>>>>>> *work)
>>>>>>>>>>>>   {
>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>           return r;
>>>>>>>>>>>>       }
>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>   }
>>>>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>   {
>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>> +
>>>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>   }
>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>>>>
>>>>>>>>>
>>>>>>>
>>


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-10 16:17                       ` Christian König
@ 2022-05-10 17:01                         ` Andrey Grodzovsky
  2022-05-10 17:19                           ` Christian König
  0 siblings, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-10 17:01 UTC (permalink / raw)
  To: Christian König, Christian König, Lazar, Lijo,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy


On 2022-05-10 12:17, Christian König wrote:
> Am 10.05.22 um 18:00 schrieb Andrey Grodzovsky:
>> [SNIP]
>>> That's one of the reasons why we should have multiple work items for 
>>> job based reset and other reset sources.
>>>
>>> See the whole idea is the following:
>>> 1. We have one single queued work queue for each reset domain which 
>>> makes sure that all reset requests execute in order.
>>> 2. We have one delayed work item for each scheduler which fires when 
>>> a timeout on a scheduler occurs and eventually calls the reset 
>>> procedure with the last running job.
>>> 3. We have one work item for each necessary hard reset.
>>>
>>> The delayed work item from the scheduler first tries a soft recovery 
>>> and checks if a hard reset is really necessary. If it's not 
>>> necessary and we can cancel the offending job we skip the hard reset.
>>>
>>> The hard reset work item doesn't do any of those checks and just 
>>> does a reset no matter what.
>>>
>>> When we really do a reset, independent if its triggered by a job or 
>>> other source we cancel all sources at the end of the reset procedure.
>>>
>>> This makes sure that a) We only do one reset even when multiple 
>>> sources fire at the same time and b) when any source bails out and 
>>> only does a soft recovery we do a full reset anyway when necessary.
>>>
>>> That design was outlined multiple times now on the mailing list and 
>>> looks totally clear to me. We should probably document that somewhere.
>>
>>
>> If you look at the patch what you described above is exactly what is 
>> happening - since scheduler's delayed work is different from any non 
>> scheduler delayed work the SW reset which might take place from 
>> scheduler's reset
>> will not have any impact on any non scheduler delayed work and will 
>> not cancel them. In case the scheduler actually reaches the point of 
>> HW reset it will cancel out all pending reset works from any other 
>> sources on the same
>> reset domain. Non scheduler reset will always proceed to do full HW 
>> reset and will cancel any other pending resets.
>
> Ok, but why you then need that linked list? The number of reset 
> sources should be static and not in any way dynamic.


So array reset_src[i] holds a pointer to the pending delayed work from 
source i, or NULL if there is no pending work?
What if the same source triggers multiple reset requests, such as multiple 
RAS errors at once — don't set the delayed work pointer in 
arr[RAS_index] if it's already non-NULL?

>
> See using the linked list sounds like you only wanted to cancel the 
> reset sources raised so far which would not be correct as far as I can 
> see.


Not clear about this one? We do want to cancel those reset sources that 
were raised so far, because we just did a HW reset which should fix them 
anyway. Those which have not raised a reset request so far will have a 
NULL ptr in their respective array index.

Andrey


>
>>
>> The only difference is I chose to do the canceling right BEFORE the 
>> HW reset and not AFTER. I did this because I see a possible race 
>> where a new reset request is being generated exactly after we 
>> finished the HW reset but before we canceled out all pending resets - 
>> in such case you wold not want to cancel this 'border line new' reset 
>> request.
>
> Why not? Any new reset request directly after a hardware reset is most 
> likely just falsely generated by the reset itself.
>
> Ideally I would cancel all sources after the reset, but before 
> starting any new work.
>
> Regards,
> Christian.
>
>>
>>
>> Andrey
>>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>> You can see that if many different reset sources share same work 
>>>>> struct what can happen is that the first to obtain the lock you 
>>>>> describe below might opt out from full HW reset because his bad 
>>>>> job did signal for example or because his hung IP block was able 
>>>>> to recover through SW reset but in the meantime another reset 
>>>>> source who needed an actual HW reset just silently returned and we 
>>>>> end up with unhandled reset request. True that today this happens 
>>>>> only to job timeout reset sources that are handled form within the 
>>>>> scheduler and won't use this single work struct but no one 
>>>>> prevents a future case for this to happen and also, if we actually 
>>>>> want to unify scheduler time out handlers within reset domain 
>>>>> (which seems to me the right design approach) we won't be able to 
>>>>> use just one work struct for this reason anyway.
>>>>>
>>>>
>>>> Just to add to this point - a reset domain is co-operative domain. 
>>>> In addition to sharing a set of clients from various reset sources 
>>>> for one device, it also will have a set of devices like in XGMI 
>>>> hive. The job timeout on one device may not eventually result in 
>>>> a reset, but a RAS error happening on another device at the same 
>>>> time would need a reset. The second device's RAS error cannot 
>>>> return seeing  that a reset work already started, or ignore the 
>>>> reset work given that another device has filled the reset data.
>>>>
>>>> When there is a reset domain, it should take care of the work 
>>>> scheduled and keeping it in device or any other level doesn't sound 
>>>> good.
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>> I'd put the reset work struct into the reset_domain struct. That 
>>>>>> way you'd have exactly one worker for the reset domain. You could 
>>>>>> implement a lock-less scheme to decide whether you need to 
>>>>>> schedule a reset, e.g. using an atomic counter in the shared work 
>>>>>> struct that gets incremented when a client wants to trigger a 
>>>>>> reset (atomic_add_return). If that counter is exactly 1 after 
>>>>>> incrementing, you need to fill in the rest of the work struct and 
>>>>>> schedule the work. Otherwise, it's already scheduled (or another 
>>>>>> client is in the process of scheduling it) and you just return. 
>>>>>> When the worker finishes (after confirming a successful reset), 
>>>>>> it resets the counter to 0, so the next client requesting a reset 
>>>>>> will schedule the worker again.
>>>>>>
>>>>>> Regards,
>>>>>>   Felix
>>>>>>
>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> Additional to that keep in mind that you can't allocate any 
>>>>>>>> memory before or during the GPU reset nor wait for the reset to 
>>>>>>>> complete (so you can't allocate anything on the stack either).
>>>>>>>
>>>>>>>
>>>>>>> There is no dynamic allocation here, regarding stack allocations 
>>>>>>> - we do it all the time when we call functions, even during GPU 
>>>>>>> resets, how on stack allocation of work struct in 
>>>>>>> amdgpu_device_gpu_recover is different from any other local 
>>>>>>> variable we allocate in any function we call ?
>>>>>>>
>>>>>>> I am also not sure why it's not allowed to wait for reset to 
>>>>>>> complete ? Also, see in amdgpu_ras_do_recovery and 
>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to 
>>>>>>> complete before he returns. I can probably work around it in RAS 
>>>>>>> code by calling atomic_set(&ras->in_recovery, 0) from some 
>>>>>>> callback within actual reset function but regarding sysfs it 
>>>>>>> actually expects a result returned indicating whether the call 
>>>>>>> was successful or not.
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>> Also in general seems to me it's cleaner approach where this 
>>>>>>>>> logic (the work items) are held and handled in reset_domain 
>>>>>>>>> and are not split in each adev or any other entity. We might 
>>>>>>>>> want in the future to even move the scheduler handling into 
>>>>>>>>> reset domain since reset domain is supposed to be a generic 
>>>>>>>>> things and not only or AMD.
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE        16
>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>   };
>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>> -};
>>>>>>>>>>>>> -
>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>>>       }
>>>>>>>>>>>>>         tmp_vram_lost_counter = 
>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +    /* Drop all pending resets since we will reset now 
>>>>>>>>>>>>> anyway */
>>>>>>>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, 
>>>>>>>>>>>>> struct amdgpu_device,
>>>>>>>>>>>>> +                        reset_list);
>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>> +
>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset for SRIOV */
>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>>>   }
>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct work_struct 
>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>   {
>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base);
>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>>>>>>> base.base.work);
>>>>>>>>>>>>>         recover_work->ret = 
>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>   }
>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>   {
>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = {.adev = 
>>>>>>>>>>>>> adev, .job = job};
>>>>>>>>>>>>>   -    INIT_WORK(&work.base, 
>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>> +    INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>         if 
>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>   -    flush_work(&work.base);
>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>> +
>>>>>>>>>>>>> + 
>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>   }
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>> +
>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>   }
>>>>>>>>>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>> +
>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>   };
>>>>>>>>>>>>>   +
>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>> +};
>>>>>>>>>>>>> +
>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>   };
>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>> +
>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>   };
>>>>>>>>>>>>>   +
>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>> +};
>>>>>>>>>>>>> +
>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>   };
>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
>>>>>>>>>>>>>   }
>>>>>>>>>>>>>     static inline bool amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>> -                        struct work_struct *work)
>>>>>>>>>>>>> +                        struct amdgpu_reset_work_struct 
>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>   {
>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>> +    }
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +    list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>> +}
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>> +{
>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>> +    list_del_init(&work->node);
>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>> +}
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>> +{
>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>> +
>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>> +    list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>> +
>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>> +    }
>>>>>>>>>>>>> +
>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>   }
>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* 
>>>>>>>>>>>>> vBIOS is sr-iov ready */
>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov 
>>>>>>>>>>>>> is enabled on this GPU */
>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>       uint32_t            reg_val_offs;
>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>> -    struct work_struct        flr_work;
>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>   {
>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>       }
>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>   }
>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>   {
>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>> +
>>>>>>>>>>>>> + 
>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>   }
>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>   {
>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>       }
>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>   }
>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>   {
>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>> +
>>>>>>>>>>>>> + 
>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>   }
>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>   {
>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>       }
>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>   }
>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>   {
>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>> +
>>>>>>>>>>>>> + 
>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>   }
>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>
>>>
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-10 17:01                         ` Andrey Grodzovsky
@ 2022-05-10 17:19                           ` Christian König
  2022-05-10 18:53                             ` Andrey Grodzovsky
  0 siblings, 1 reply; 40+ messages in thread
From: Christian König @ 2022-05-10 17:19 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, Lazar, Lijo,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy

Am 10.05.22 um 19:01 schrieb Andrey Grodzovsky:
>
> On 2022-05-10 12:17, Christian König wrote:
>> Am 10.05.22 um 18:00 schrieb Andrey Grodzovsky:
>>> [SNIP]
>>>> That's one of the reasons why we should have multiple work items 
>>>> for job based reset and other reset sources.
>>>>
>>>> See the whole idea is the following:
>>>> 1. We have one single queued work queue for each reset domain which 
>>>> makes sure that all reset requests execute in order.
>>>> 2. We have one delayed work item for each scheduler which fires 
>>>> when a timeout on a scheduler occurs and eventually calls the reset 
>>>> procedure with the last running job.
>>>> 3. We have one work item for each necessary hard reset.
>>>>
>>>> The delayed work item from the scheduler first tries a soft 
>>>> recovery and checks if a hard reset is really necessary. If it's 
>>>> not necessary and we can cancel the offending job we skip the hard 
>>>> reset.
>>>>
>>>> The hard reset work item doesn't do any of those checks and just 
>>>> does a reset no matter what.
>>>>
>>>> When we really do a reset, independent if its triggered by a job or 
>>>> other source we cancel all sources at the end of the reset procedure.
>>>>
>>>> This makes sure that a) We only do one reset even when multiple 
>>>> sources fire at the same time and b) when any source bails out and 
>>>> only does a soft recovery we do a full reset anyway when necessary.
>>>>
>>>> That design was outlined multiple times now on the mailing list and 
>>>> looks totally clear to me. We should probably document that somewhere.
>>>
>>>
>>> If you look at the patch what you described above is exactly what is 
>>> happening - since scheduler's delayed work is different from any non 
>>> scheduler delayed work the SW reset which might take place from 
>>> scheduler's reset
>>> will not have any impact on any non scheduler delayed work and will 
>>> not cancel them. In case the scheduler actually reaches the point of 
>>> HW reset it will cancel out all pending reset works from any other 
>>> sources on the same
>>> reset domain. Non scheduler reset will always proceed to do full HW 
>>> reset and will cancel any other pending resets.
>>
>> Ok, but why you then need that linked list? The number of reset 
>> sources should be static and not in any way dynamic.
>
>
> So array reset_src[i] holds a pointer to pending delayed work from 
> source i or NULL if no pending work ?
> What if same source triggers multiple reset requests such as multiple 
> RAS errors at once , don't set the delayed work pointer in the 
> arr[RAS_index] if it's not already NULL ?
>
>>
>> See using the linked list sounds like you only wanted to cancel the 
>> reset sources raised so far which would not be correct as far as I 
>> can see.
>
>
> Not clear about this one ? We do want to cancel those reset sources 
> that were raised so far because we just did a HW reset which should 
> fix them anyway ? Those that have not raised a reset request so far 
> will have a NULL ptr at their respective array index.

And exactly that's what I want to prevent. See you don't care if a reset 
source has fired once, twice, ten times or never. You just cancel all of 
them!

That's why I want to come to a static list of reset sources.

E.g. in the reset code (either before or after the reset, that's 
debatable) you do something like this:

for (i = 0; i < num_ring; ++i)
     cancel_delayed_work(ring[i]->scheduler....)
cancel_work(adev->ras_work);
cancel_work(adev->iofault_work);
cancel_work(adev->debugfs_work);
...

You don't really need to track which reset source has fired and which 
hasn't, because that would be racy again. Instead just bluntly reset all 
possible sources.

Christian.

>
> Andrey
>
>
>>
>>>
>>> The only difference is I chose to do the canceling right BEFORE the 
>>> HW reset and not AFTER. I did this because I see a possible race 
>>> where a new reset request is being generated exactly after we 
>>> finished the HW reset but before we canceled out all pending resets 
>>> - in such case you would not want to cancel this 'border line new' 
>>> reset request.
>>
>> Why not? Any new reset request directly after a hardware reset is 
>> most likely just falsely generated by the reset itself.
>>
>> Ideally I would cancel all sources after the reset, but before 
>> starting any new work.
>>
>> Regards,
>> Christian.
>>
>>>
>>>
>>> Andrey
>>>
>>>
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>> You can see that if many different reset sources share same work 
>>>>>> struct what can happen is that the first to obtain the lock you 
>>>>>> describe bellow might opt out from full HW reset because his bad 
>>>>>> job did signal for example or because his hung IP block was 
>>>>>> able to recover through SW reset but in the meantime another 
>>>>>> reset source who needed an actual HW reset just silently returned 
>>>>>> and we end up with unhandled reset request. True that today this 
>>>>>> happens only to job timeout reset sources that are handled form 
>>>>>> within the scheduler and won't use this single work struct but no 
>>>>>> one prevents a future case for this to happen and also, if we 
>>>>>> actually want to unify scheduler time out handlers within reset 
>>>>>> domain (which seems to me the right design approach) we won't be 
>>>>>> able to use just one work struct for this reason anyway.
>>>>>>
>>>>>
>>>>> Just to add to this point - a reset domain is co-operative domain. 
>>>>> In addition to sharing a set of clients from various reset sources 
>>>>> for one device, it also will have a set of devices like in XGMI 
>>>>> hive. The job timeout on one device may not eventually result in 
>>>>> a reset, but a RAS error happening on another device at the same 
>>>>> time would need a reset. The second device's RAS error cannot 
>>>>> return seeing  that a reset work already started, or ignore the 
>>>>> reset work given that another device has filled the reset data.
>>>>>
>>>>> When there is a reset domain, it should take care of the work 
>>>>> scheduled and keeping it in device or any other level doesn't 
>>>>> sound good.
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> I'd put the reset work struct into the reset_domain struct. That 
>>>>>>> way you'd have exactly one worker for the reset domain. You 
>>>>>>> could implement a lock-less scheme to decide whether you need to 
>>>>>>> schedule a reset, e.g. using an atomic counter in the shared 
>>>>>>> work struct that gets incremented when a client wants to trigger 
>>>>>>> a reset (atomic_add_return). If that counter is exactly 1 after 
>>>>>>> incrementing, you need to fill in the rest of the work struct 
>>>>>>> and schedule the work. Otherwise, it's already scheduled (or 
>>>>>>> another client is in the process of scheduling it) and you just 
>>>>>>> return. When the worker finishes (after confirming a successful 
>>>>>>> reset), it resets the counter to 0, so the next client 
>>>>>>> requesting a reset will schedule the worker again.
>>>>>>>
>>>>>>> Regards,
>>>>>>>   Felix
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Additional to that keep in mind that you can't allocate any 
>>>>>>>>> memory before or during the GPU reset nor wait for the reset 
>>>>>>>>> to complete (so you can't allocate anything on the stack either).
>>>>>>>>
>>>>>>>>
>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>> allocations - we do it all the time when we call functions, 
>>>>>>>> even during GPU resets, how on stack allocation of work struct 
>>>>>>>> in amdgpu_device_gpu_recover is different from any other local 
>>>>>>>> variable we allocate in any function we call ?
>>>>>>>>
>>>>>>>> I am also not sure why it's not allowed to wait for reset to 
>>>>>>>> complete ? Also, see in amdgpu_ras_do_recovery and 
>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to 
>>>>>>>> complete before he returns. I can probably work around it in 
>>>>>>>> RAS code by calling atomic_set(&ras->in_recovery, 0) from some 
>>>>>>>> callback within actual reset function but regarding sysfs it 
>>>>>>>> actually expects a result returned indicating whether the call 
>>>>>>>> was successful or not.
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>> Also in general seems to me it's cleaner approach where this 
>>>>>>>>>> logic (the work items) are held and handled in reset_domain 
>>>>>>>>>> and are not split in each adev or any other entity. We might 
>>>>>>>>>> want in the future to even move the scheduler handling into 
>>>>>>>>>> reset domain since reset domain is supposed to be a generic 
>>>>>>>>>> thing and not only for AMD.
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE        16
>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>         tmp_vram_lost_counter = 
>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +    /* Drop all pending resets since we will reset now 
>>>>>>>>>>>>>> anyway */
>>>>>>>>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, 
>>>>>>>>>>>>>> struct amdgpu_device,
>>>>>>>>>>>>>> +                        reset_list);
>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset for 
>>>>>>>>>>>>>> SRIOV */
>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct work_struct 
>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>>>>>>>> base.base.work);
>>>>>>>>>>>>>>         recover_work->ret = 
>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = {.adev = 
>>>>>>>>>>>>>> adev, .job = job};
>>>>>>>>>>>>>>   -    INIT_WORK(&work.base, 
>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>   -    flush_work(&work.base);
>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain 
>>>>>>>>>>>>>> *dom
>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct amdgpu_reset_domain 
>>>>>>>>>>>>>> *domain,
>>>>>>>>>>>>>> -                        struct work_struct *work)
>>>>>>>>>>>>>> +                        struct amdgpu_reset_work_struct 
>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +    list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>> +    list_del_init(&work->node);
>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>> +    list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* 
>>>>>>>>>>>>>> vBIOS is sr-iov ready */
>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov 
>>>>>>>>>>>>>> is enabled on this GPU */
>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>       uint32_t            reg_val_offs;
>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>> -    struct work_struct        flr_work;
>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>>>> amdgpu_virt, flr_work);
>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, struct 
>>>>>>>>>>>>>> amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>
>>>>
>>


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-10 17:19                           ` Christian König
@ 2022-05-10 18:53                             ` Andrey Grodzovsky
  2022-05-11  7:38                               ` Christian König
  0 siblings, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-10 18:53 UTC (permalink / raw)
  To: Christian König, Christian König, Lazar, Lijo,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy


On 2022-05-10 13:19, Christian König wrote:
> Am 10.05.22 um 19:01 schrieb Andrey Grodzovsky:
>>
>> On 2022-05-10 12:17, Christian König wrote:
>>> Am 10.05.22 um 18:00 schrieb Andrey Grodzovsky:
>>>> [SNIP]
>>>>> That's one of the reasons why we should have multiple work items 
>>>>> for job based reset and other reset sources.
>>>>>
>>>>> See the whole idea is the following:
>>>>> 1. We have one single queued work queue for each reset domain 
>>>>> which makes sure that all reset requests execute in order.
>>>>> 2. We have one delayed work item for each scheduler which fires 
>>>>> when a timeout on a scheduler occurs and eventually calls the 
>>>>> reset procedure with the last running job.
>>>>> 3. We have one work item for each necessary hard reset.
>>>>>
>>>>> The delayed work item from the scheduler first tries a soft 
>>>>> recovery and checks if a hard reset is really necessary. If it's 
>>>>> not necessary and we can cancel the offending job we skip the hard 
>>>>> reset.
>>>>>
>>>>> The hard reset work item doesn't do any of those checks and just 
>>>>> does a reset no matter what.
>>>>>
>>>>> When we really do a reset, independent if its triggered by a job 
>>>>> or other source we cancel all sources at the end of the reset 
>>>>> procedure.
>>>>>
>>>>> This makes sure that a) We only do one reset even when multiple 
>>>>> sources fire at the same time and b) when any source bails out and 
>>>>> only does a soft recovery we do a full reset anyway when necessary.
>>>>>
>>>>> That design was outlined multiple times now on the mailing list 
>>>>> and looks totally clear to me. We should probably document that 
>>>>> somewhere.
>>>>
>>>>
>>>> If you look at the patch what you described above is exactly what 
>>>> is happening - since scheduler's delayed work is different from any 
>>>> non scheduler delayed work the SW reset which might take place from 
>>>> scheduler's reset
>>>> will not have any impact on any non scheduler delayed work and will 
>>>> not cancel them. In case the scheduler actually reaches the point 
>>>> of HW reset it will cancel out all pending reset works from any 
>>>> other sources on the same
>>>> reset domain. Non scheduler reset will always proceed to do full HW 
>>>> reset and will cancel any other pending resets.
>>>
>>> Ok, but why you then need that linked list? The number of reset 
>>> sources should be static and not in any way dynamic.
>>
>>
>> So array reset_src[i] holds a pointer to pending delayed work from 
>> source i or NULL if no pedning work ?
>> What if same source triggers multiple reset requests such as multiple 
>> RAS errors at once , don't set the delayed work pointer in the 
>> arr[RAS_index] if it's already not NULL ?
>>
>>>
>>> See using the linked list sounds like you only wanted to cancel the 
>>> reset sources raised so far which would not be correct as far as I 
>>> can see.
>>
>>
>> Not clear about this one ? We do want to cancel those reset sources 
>> that were raised so far because we just did a HW reset which should 
>> fix them anyway ? Those who not raised reset request so far their 
>> respective array index will have a NULL ptr.
>
> And exactly that's what I want to prevent. See you don't care if a 
> reset source has fired once, twice, ten times or never. You just 
> cancel all of them!
>
> That's why I want to come to a static list of reset sources.
>
> E.g. in the reset code (either before or after the reset, that's 
> debatable) you do something like this:
>
> for (i = 0; i < num_ring; ++i)
>     cancel_delayed_work(ring[i]->scheduler....)
> cancel_work(adev->ras_work);
> cancel_work(adev->iofault_work);
> cancel_work(adev->debugfs_work);
> ...
>
> You don't really need to track which reset source has fired and which 
> hasn't, because that would be racy again. Instead just bluntly reset 
> all possible sources.
>
> Christian.


I don't say we care if it fired once or twice (I need to add a fix to 
only insert reset work into the pending reset list if it's not already 
there); the point of using a list (or array) to which you add and from 
which you remove is that the logic of this is encapsulated within the 
reset domain. In your way we need to be aware of who exactly schedules 
reset work and explicitly cancel them; this also means that for any new 
source added in the future you will need to remember to add it
to the cancellation list which you showed above. In the current way all 
this is done automatically within the reset_domain code and it's agnostic 
to the specific driver and its specific list of reset sources. Also, in 
case we would want to generalize reset_domain to other GPU drivers (which 
was a plan as far as I remember), this explicit mention of each reset 
work item for cancellation is again less suitable in my opinion.

Andrey


>
>>
>> Andrey
>>
>>
>>>
>>>>
>>>> The only difference is I chose to do the canceling right BEFORE the 
>>>> HW reset and not AFTER. I did this because I see a possible race 
>>>> where a new reset request is being generated exactly after we 
>>>> finished the HW reset but before we canceled out all pending resets 
>>>> - in such case you wold not want to cancel this 'border line new' 
>>>> reset request.
>>>
>>> Why not? Any new reset request directly after a hardware reset is 
>>> most likely just falsely generated by the reset itself.
>>>
>>> Ideally I would cancel all sources after the reset, but before 
>>> starting any new work.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>> You can see that if many different reset sources share same work 
>>>>>>> struct what can happen is that the first to obtain the lock you 
>>>>>>> describe bellow might opt out from full HW reset because his bad 
>>>>>>> job did signal for example or because his hunged IP block was 
>>>>>>> able to recover through SW reset but in the meantime another 
>>>>>>> reset source who needed an actual HW reset just silently 
>>>>>>> returned and we end up with unhandled reset request. True that 
>>>>>>> today this happens only to job timeout reset sources that are 
>>>>>>> handled form within the scheduler and won't use this single work 
>>>>>>> struct but no one prevents a future case for this to happen and 
>>>>>>> also, if we actually want to unify scheduler time out handlers 
>>>>>>> within reset domain (which seems to me the right design 
>>>>>>> approach) we won't be able to use just one work struct for this 
>>>>>>> reason anyway.
>>>>>>>
>>>>>>
>>>>>> Just to add to this point - a reset domain is co-operative 
>>>>>> domain. In addition to sharing a set of clients from various 
>>>>>> reset sources for one device, it also will have a set of devices 
>>>>>> like in XGMI hive. The job timeout on one device may not 
>>>>>> eventually result in result, but a RAS error happening on another 
>>>>>> device at the same time would need a reset. The second device's 
>>>>>> RAS error cannot return seeing  that a reset work already 
>>>>>> started, or ignore the reset work given that another device has 
>>>>>> filled the reset data.
>>>>>>
>>>>>> When there is a reset domain, it should take care of the work 
>>>>>> scheduled and keeping it in device or any other level doesn't 
>>>>>> sound good.
>>>>>>
>>>>>> Thanks,
>>>>>> Lijo
>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> I'd put the reset work struct into the reset_domain struct. 
>>>>>>>> That way you'd have exactly one worker for the reset domain. 
>>>>>>>> You could implement a lock-less scheme to decide whether you 
>>>>>>>> need to schedule a reset, e.g. using an atomic counter in the 
>>>>>>>> shared work struct that gets incremented when a client wants to 
>>>>>>>> trigger a reset (atomic_add_return). If that counter is exactly 
>>>>>>>> 1 after incrementing, you need to fill in the rest of the work 
>>>>>>>> struct and schedule the work. Otherwise, it's already scheduled 
>>>>>>>> (or another client is in the process of scheduling it) and you 
>>>>>>>> just return. When the worker finishes (after confirming a 
>>>>>>>> successful reset), it resets the counter to 0, so the next 
>>>>>>>> client requesting a reset will schedule the worker again.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>>   Felix
>>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Additional to that keep in mind that you can't allocate any 
>>>>>>>>>> memory before or during the GPU reset nor wait for the reset 
>>>>>>>>>> to complete (so you can't allocate anything on the stack 
>>>>>>>>>> either).
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>> allocations - we do it all the time when we call functions, 
>>>>>>>>> even during GPU resets, how on stack allocation of work struct 
>>>>>>>>> in amdgpu_device_gpu_recover is different from any other local 
>>>>>>>>> variable we allocate in any function we call ?
>>>>>>>>>
>>>>>>>>> I am also not sure why it's not allowed to wait for reset to 
>>>>>>>>> complete ? Also, see in amdgpu_ras_do_recovery and 
>>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to 
>>>>>>>>> complete before he returns. I can probably work around it in 
>>>>>>>>> RAS code by calling atomic_set(&ras->in_recovery, 0) from some 
>>>>>>>>> callback within actual reset function but regarding sysfs it 
>>>>>>>>> actually expects a result returned indicating whether the call 
>>>>>>>>> was successful or not.
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>> Also in general seems to me it's cleaner approach where this 
>>>>>>>>>>> logic (the work items) are held and handled in reset_domain 
>>>>>>>>>>> and are not split in each adev or any other entity. We might 
>>>>>>>>>>> want in the future to even move the scheduler handling into 
>>>>>>>>>>> reset domain since reset domain is supposed to be a generic 
>>>>>>>>>>> things and not only or AMD.
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE        16
>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>         tmp_vram_lost_counter = 
>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will reset now 
>>>>>>>>>>>>>>> anyway */
>>>>>>>>>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, 
>>>>>>>>>>>>>>> struct amdgpu_device,
>>>>>>>>>>>>>>> +                        reset_list);
>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset for 
>>>>>>>>>>>>>>> SRIOV */
>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct work_struct 
>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>>>>>>>>> base);
>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>>>>>>>>> base.base.work);
>>>>>>>>>>>>>>>         recover_work->ret = 
>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = {.adev = 
>>>>>>>>>>>>>>> adev, .job = job};
>>>>>>>>>>>>>>>   -    INIT_WORK(&work.base, 
>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>   -    flush_work(&work.base);
>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain 
>>>>>>>>>>>>>>> *dom
>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct amdgpu_reset_domain 
>>>>>>>>>>>>>>> *domain,
>>>>>>>>>>>>>>> -                        struct work_struct *work)
>>>>>>>>>>>>>>> +                        struct amdgpu_reset_work_struct 
>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 0)) {
>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +    list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>> +    list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* 
>>>>>>>>>>>>>>> vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* 
>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct 
>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct 
>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>
>>>>>
>>>
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-10 18:53                             ` Andrey Grodzovsky
@ 2022-05-11  7:38                               ` Christian König
  2022-05-11 13:43                                 ` Andrey Grodzovsky
  0 siblings, 1 reply; 40+ messages in thread
From: Christian König @ 2022-05-11  7:38 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, Lazar, Lijo,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy

Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>
> On 2022-05-10 13:19, Christian König wrote:
>> Am 10.05.22 um 19:01 schrieb Andrey Grodzovsky:
>>>
>>> On 2022-05-10 12:17, Christian König wrote:
>>>> Am 10.05.22 um 18:00 schrieb Andrey Grodzovsky:
>>>>> [SNIP]
>>>>>> That's one of the reasons why we should have multiple work items 
>>>>>> for job based reset and other reset sources.
>>>>>>
>>>>>> See the whole idea is the following:
>>>>>> 1. We have one single queued work queue for each reset domain 
>>>>>> which makes sure that all reset requests execute in order.
>>>>>> 2. We have one delayed work item for each scheduler which fires 
>>>>>> when a timeout on a scheduler occurs and eventually calls the 
>>>>>> reset procedure with the last running job.
>>>>>> 3. We have one work item for each necessary hard reset.
>>>>>>
>>>>>> The delayed work item from the scheduler first tries a soft 
>>>>>> recovery and checks if a hard reset is really necessary. If it's 
>>>>>> not necessary and we can cancel the offending job we skip the 
>>>>>> hard reset.
>>>>>>
>>>>>> The hard reset work item doesn't do any of those checks and just 
>>>>>> does a reset no matter what.
>>>>>>
>>>>>> When we really do a reset, independent if its triggered by a job 
>>>>>> or other source we cancel all sources at the end of the reset 
>>>>>> procedure.
>>>>>>
>>>>>> This makes sure that a) We only do one reset even when multiple 
>>>>>> sources fire at the same time and b) when any source bails out 
>>>>>> and only does a soft recovery we do a full reset anyway when 
>>>>>> necessary.
>>>>>>
>>>>>> That design was outlined multiple times now on the mailing list 
>>>>>> and looks totally clear to me. We should probably document that 
>>>>>> somewhere.
>>>>>
>>>>>
>>>>> If you look at the patch what you described above is exactly what 
>>>>> is happening - since scheduler's delayed work is different from 
>>>>> any non scheduler delayed work the SW reset which might take place 
>>>>> from scheduler's reset
>>>>> will not have any impact on any non scheduler delayed work and 
>>>>> will not cancel them. In case the scheduler actually reaches the 
>>>>> point of HW reset it will cancel out all pending reset works from 
>>>>> any other sources on the same
>>>>> reset domain. Non scheduler reset will always proceed to do full 
>>>>> HW reset and will cancel any other pending resets.
>>>>
>>>> Ok, but why you then need that linked list? The number of reset 
>>>> sources should be static and not in any way dynamic.
>>>
>>>
>>> So array reset_src[i] holds a pointer to pending delayed work from 
>>> source i or NULL if no pending work ?
>>> What if same source triggers multiple reset requests such as 
>>> multiple RAS errors at once , don't set the delayed work pointer in 
>>> the arr[RAS_index] if it's already not NULL ?
>>>
>>>>
>>>> See using the linked list sounds like you only wanted to cancel the 
>>>> reset sources raised so far which would not be correct as far as I 
>>>> can see.
>>>
>>>
>>> Not clear about this one ? We do want to cancel those reset sources 
>>> that were raised so far because we just did a HW reset which should 
>>> fix them anyway ? Those who have not raised a reset request so far will 
>>> have their respective array index set to a NULL ptr.
>>
>> And exactly that's what I want to prevent. See you don't care if a 
>> reset source has fired once, twice, ten times or never. You just 
>> cancel all of them!
>>
>> That's why I want to come to a static list of reset sources.
>>
>> E.g. in the reset code (either before or after the reset, that's 
>> debatable) you do something like this:
>>
>> for (i = 0; i < num_ring; ++i)
>>     cancel_delayed_work(ring[i]->scheduler....)
>> cancel_work(adev->ras_work);
>> cancel_work(adev->iofault_work);
>> cancel_work(adev->debugfs_work);
>> ...
>>
>> You don't really need to track which reset source has fired and which 
>> hasn't, because that would be racy again. Instead just bluntly reset 
>> all possible sources.
>>
>> Christian.
>
>
> I don't say we care if it fired once or twice (I need to add a fix to 
> only insert reset work to pending reset list if it's not already 
> there), the point of using list (or array) to which you add and from 
> which you remove is that the logic of this is encapsulated within 
> reset domain. In your way we need to be aware who exactly schedules 
> reset work and explicitly cancel them, this also means that for any 
> new source added in the future you will need to remember to add it

I don't think that this is a valid argument. Additionally to the 
schedulers we probably just need less than a handful of reset sources, 
most likely even just one or two is enough.

The only justification I can see of having additional separate reset 
sources would be if somebody wants to know if a specific source has been 
handled or not (e.g. call flush_work() or work_pending()). Like in the 
case of a reset triggered through debugfs.

> to the cancellation list which you showed above. In current way all 
> this done automatically within reset_domain code and it's agnostic to 
> specific driver and it's specific list of reset sources. Also in case 
> we would want to generalize reset_domain to other GPU drivers (which was
> a plan as far as i remember) this explicit mention of each reset works 
> for cancellation is again less suitable in my opinion.

Well we could put the work item for the scheduler independent reset 
source into the reset domain as well. But I'm not sure those additional 
reset sources should be part of any common handling, that is largely 
amdgpu specific.

Christian.

>
> Andrey
>
>
>>
>>>
>>> Andrey
>>>
>>>
>>>>
>>>>>
>>>>> The only difference is I chose to do the canceling right BEFORE 
>>>>> the HW reset and not AFTER. I did this because I see a possible 
>>>>> race where a new reset request is being generated exactly after we 
>>>>> finished the HW reset but before we canceled out all pending 
>>>>> resets - in such case you would not want to cancel this 'border 
>>>>> line new' reset request.
>>>>
>>>> Why not? Any new reset request directly after a hardware reset is 
>>>> most likely just falsely generated by the reset itself.
>>>>
>>>> Ideally I would cancel all sources after the reset, but before 
>>>> starting any new work.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>> You can see that if many different reset sources share same 
>>>>>>>> work struct what can happen is that the first to obtain the 
>>>>>>>> lock you describe bellow might opt out from full HW reset 
>>>>>>>> because his bad job did signal for example or because his 
>>>>>>>> hung IP block was able to recover through SW reset but in the 
>>>>>>>> meantime another reset source who needed an actual HW reset 
>>>>>>>> just silently returned and we end up with unhandled reset 
>>>>>>>> request. True that today this happens only to job timeout reset 
>>>>>>>> sources that are handled form within the scheduler and won't 
>>>>>>>> use this single work struct but no one prevents a future case 
>>>>>>>> for this to happen and also, if we actually want to unify 
>>>>>>>> scheduler time out handlers within reset domain (which seems to 
>>>>>>>> me the right design approach) we won't be able to use just one 
>>>>>>>> work struct for this reason anyway.
>>>>>>>>
>>>>>>>
>>>>>>> Just to add to this point - a reset domain is co-operative 
>>>>>>> domain. In addition to sharing a set of clients from various 
>>>>>>> reset sources for one device, it also will have a set of devices 
>>>>>>> like in XGMI hive. The job timeout on one device may not 
>>>>>>> eventually result in a reset, but a RAS error happening on 
>>>>>>> another device at the same time would need a reset. The second 
>>>>>>> device's RAS error cannot return seeing  that a reset work 
>>>>>>> already started, or ignore the reset work given that another 
>>>>>>> device has filled the reset data.
>>>>>>>
>>>>>>> When there is a reset domain, it should take care of the work 
>>>>>>> scheduled and keeping it in device or any other level doesn't 
>>>>>>> sound good.
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Lijo
>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>> I'd put the reset work struct into the reset_domain struct. 
>>>>>>>>> That way you'd have exactly one worker for the reset domain. 
>>>>>>>>> You could implement a lock-less scheme to decide whether you 
>>>>>>>>> need to schedule a reset, e.g. using an atomic counter in the 
>>>>>>>>> shared work struct that gets incremented when a client wants 
>>>>>>>>> to trigger a reset (atomic_add_return). If that counter is 
>>>>>>>>> exactly 1 after incrementing, you need to fill in the rest of 
>>>>>>>>> the work struct and schedule the work. Otherwise, it's already 
>>>>>>>>> scheduled (or another client is in the process of scheduling 
>>>>>>>>> it) and you just return. When the worker finishes (after 
>>>>>>>>> confirming a successful reset), it resets the counter to 0, so 
>>>>>>>>> the next client requesting a reset will schedule the worker 
>>>>>>>>> again.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>>   Felix
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Additional to that keep in mind that you can't allocate any 
>>>>>>>>>>> memory before or during the GPU reset nor wait for the reset 
>>>>>>>>>>> to complete (so you can't allocate anything on the stack 
>>>>>>>>>>> either).
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>> allocations - we do it all the time when we call functions, 
>>>>>>>>>> even during GPU resets, how on stack allocation of work 
>>>>>>>>>> struct in amdgpu_device_gpu_recover is different from any 
>>>>>>>>>> other local variable we allocate in any function we call ?
>>>>>>>>>>
>>>>>>>>>> I am also not sure why it's not allowed to wait for reset to 
>>>>>>>>>> complete ? Also, see in amdgpu_ras_do_recovery and 
>>>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to 
>>>>>>>>>> complete before he returns. I can probably work around it in 
>>>>>>>>>> RAS code by calling atomic_set(&ras->in_recovery, 0) from 
>>>>>>>>>> some callback within actual reset function but regarding 
>>>>>>>>>> sysfs it actually expects a result returned indicating 
>>>>>>>>>> whether the call was successful or not.
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>> Also in general seems to me it's cleaner approach where 
>>>>>>>>>>>> this logic (the work items) are held and handled in 
>>>>>>>>>>>> reset_domain and are not split in each adev or any other 
>>>>>>>>>>>> entity. We might want in the future to even move the 
>>>>>>>>>>>> scheduler handling into reset domain since reset domain is 
>>>>>>>>>>>> supposed to be a generic things and not only or AMD.
>>>>>>>>>>>>
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>         tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will reset now 
>>>>>>>>>>>>>>>> anyway */
>>>>>>>>>>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, 
>>>>>>>>>>>>>>>> struct amdgpu_device,
>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset for 
>>>>>>>>>>>>>>>> SRIOV */
>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct work_struct 
>>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>>>>>>>>>> base);
>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>>>>>>>>>> base.base.work);
>>>>>>>>>>>>>>>>         recover_work->ret = 
>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = {.adev = 
>>>>>>>>>>>>>>>> adev, .job = job};
>>>>>>>>>>>>>>>>   -    INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>   -    flush_work(&work.base);
>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>> *domain,
>>>>>>>>>>>>>>>> -                        struct work_struct *work)
>>>>>>>>>>>>>>>> +                        struct 
>>>>>>>>>>>>>>>> amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 
>>>>>>>>>>>>>>>> 0)) {
>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>> +    list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* 
>>>>>>>>>>>>>>>> vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* 
>>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct 
>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>
>>>>
>>


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-11  7:38                               ` Christian König
@ 2022-05-11 13:43                                 ` Andrey Grodzovsky
  2022-05-11 13:58                                   ` Christian König
  0 siblings, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-11 13:43 UTC (permalink / raw)
  To: Christian König, Christian König, Lazar, Lijo,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy


On 2022-05-11 03:38, Christian König wrote:
> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>
>> On 2022-05-10 13:19, Christian König wrote:
>>> Am 10.05.22 um 19:01 schrieb Andrey Grodzovsky:
>>>>
>>>> On 2022-05-10 12:17, Christian König wrote:
>>>>> Am 10.05.22 um 18:00 schrieb Andrey Grodzovsky:
>>>>>> [SNIP]
>>>>>>> That's one of the reasons why we should have multiple work items 
>>>>>>> for job based reset and other reset sources.
>>>>>>>
>>>>>>> See the whole idea is the following:
>>>>>>> 1. We have one single queued work queue for each reset domain 
>>>>>>> which makes sure that all reset requests execute in order.
>>>>>>> 2. We have one delayed work item for each scheduler which fires 
>>>>>>> when a timeout on a scheduler occurs and eventually calls the 
>>>>>>> reset procedure with the last running job.
>>>>>>> 3. We have one work item for each necessary hard reset.
>>>>>>>
>>>>>>> The delayed work item from the scheduler first tries a soft 
>>>>>>> recovery and checks if a hard reset is really necessary. If it's 
>>>>>>> not necessary and we can cancel the offending job we skip the 
>>>>>>> hard reset.
>>>>>>>
>>>>>>> The hard reset work item doesn't do any of those checks and just 
>>>>>>> does a reset no matter what.
>>>>>>>
>>>>>>> When we really do a reset, independent if its triggered by a job 
>>>>>>> or other source we cancel all sources at the end of the reset 
>>>>>>> procedure.
>>>>>>>
>>>>>>> This makes sure that a) We only do one reset even when multiple 
>>>>>>> sources fire at the same time and b) when any source bails out 
>>>>>>> and only does a soft recovery we do a full reset anyway when 
>>>>>>> necessary.
>>>>>>>
>>>>>>> That design was outlined multiple times now on the mailing list 
>>>>>>> and looks totally clear to me. We should probably document that 
>>>>>>> somewhere.
>>>>>>
>>>>>>
>>>>>> If you look at the patch what you described above is exactly what 
>>>>>> is happening - since scheduler's delayed work is different from 
>>>>>> any non scheduler delayed work the SW reset which might take 
>>>>>> place from scheduler's reset
>>>>>> will not have any impact on any non scheduler delayed work and 
>>>>>> will not cancel them. In case the scheduler actually reaches the 
>>>>>> point of HW reset it will cancel out all pending reset works from 
>>>>>> any other sources on the same
>>>>>> reset domain. Non scheduler reset will always proceed to do full 
>>>>>> HW reset and will cancel any other pending resets.
>>>>>
>>>>> Ok, but why you then need that linked list? The number of reset 
>>>>> sources should be static and not in any way dynamic.
>>>>
>>>>
>>>> So array reset_src[i] holds a pointer to pending delayed work from 
>>>> source i or NULL if no pedning work ?
>>>> What if same source triggers multiple reset requests such as 
>>>> multiple RAS errors at once , don't set the delayed work pointer in 
>>>> the arr[RAS_index] if it's already not NULL ?
>>>>
>>>>>
>>>>> See using the linked list sounds like you only wanted to cancel 
>>>>> the reset sources raised so far which would not be correct as far 
>>>>> as I can see.
>>>>
>>>>
>>>> Not clear about this one ? We do want to cancel those reset sources 
>>>> that were raised so far because we just did a HW reset which should 
>>>> fix them anyway ? Those who not raised reset request so far their 
>>>> respective array index will have a NULL ptr.
>>>
>>> And exactly that's what I want to prevent. See you don't care if a 
>>> reset source has fired once, twice, ten times or never. You just 
>>> cancel all of them!
>>>
>>> That's why I want to come to a static list of reset sources.
>>>
>>> E.g. in the reset code (either before or after the reset, that's 
>>> debatable) you do something like this:
>>>
>>> for (i = 0; i < num_ring; ++i)
>>>     cancel_delayed_work(ring[i]->scheduler....)
>>> cancel_work(adev->ras_work);
>>> cancel_work(adev->iofault_work);
>>> cancel_work(adev->debugfs_work);
>>> ...
>>>
>>> You don't really need to track which reset source has fired and 
>>> which hasn't, because that would be racy again. Instead just bluntly 
>>> reset all possible sources.
>>>
>>> Christian.
>>
>>
>> I don't say we care if it fired once or twice (I need to add a fix to 
>> only insert reset work to pending reset list if it's not already 
>> there), the point of using list (or array) to which you add and from 
>> which you remove is that the logic of this is encapsulated within 
>> reset domain. In your way we need to be aware who exactly schedules 
>> reset work and explicitly cancel them, this also means that for any 
>> new source added in the future you will need to remember to add him
>
> I don't think that this is a valid argument. Additionally to the 
> schedulers we probably just need less than a handful of reset sources, 
> most likely even just one or two is enough.
>
> The only justification I can see of having additional separate reset 
> sources would be if somebody wants to know if a specific source has 
> been handled or not (e.g. call flush_work() or work_pending()). Like 
> in the case of a reset triggered through debugfs.


This is indeed one reason; another is, as we said before, that if you 
share a 'reset source' (meaning a delayed work) with another client (i.e. 
RAS and KFD), you make an assumption that the other client always 
proceeds with the
reset exactly the same way as you expect. So today we have this only in 
scheduler vs. non-scheduler resets - non-scheduler reset clients 
assume the reset is always fully executed in HW, while scheduler-based 
reset takes shortcuts and does not always do a HW reset, hence they cannot 
share a 'reset source' (delayed work). Yes, we can always add this in the 
future if and when such a problem arises, but no one will remember this 
by then, and a new bug will be introduced that will take time to find and 
resolve.

>
>> to the cancellation list which you showed above. In current way all 
>> this done automatically within reset_domain code and it's agnostic to 
>> specific driver and it's specific list of reset sources. Also in case 
>> we would want to generalize reset_domain to other GPU drivers (which was
>> a plan as far as i remember) this explicit mention of each reset 
>> works for cancellation is again less suitable in my opinion.
>
> Well we could put the work item for the scheduler independent reset 
> source into the reset domain as well. But I'm not sure those 
> additional reset sources should be part of any common handling, that 
> is largely amdgpu specific.


So it's for sure more than one source for the reasons described above; 
also note that for the scheduler we already cancel the delayed work in 
drm_sched_stop, so cancelling it again in amdgpu code is kind of superfluous.

Andrey


>
> Christian.
>
>>
>> Andrey
>>
>>
>>>
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>>>
>>>>>> The only difference is I chose to do the canceling right BEFORE 
>>>>>> the HW reset and not AFTER. I did this because I see a possible 
>>>>>> race where a new reset request is being generated exactly after 
>>>>>> we finished the HW reset but before we canceled out all pending 
>>>>>> resets - in such case you wold not want to cancel this 'border 
>>>>>> line new' reset request.
>>>>>
>>>>> Why not? Any new reset request directly after a hardware reset is 
>>>>> most likely just falsely generated by the reset itself.
>>>>>
>>>>> Ideally I would cancel all sources after the reset, but before 
>>>>> starting any new work.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>>> You can see that if many different reset sources share same 
>>>>>>>>> work struct what can happen is that the first to obtain the 
>>>>>>>>> lock you describe bellow might opt out from full HW reset 
>>>>>>>>> because his bad job did signal for example or because his 
>>>>>>>>> hunged IP block was able to recover through SW reset but in 
>>>>>>>>> the meantime another reset source who needed an actual HW 
>>>>>>>>> reset just silently returned and we end up with unhandled 
>>>>>>>>> reset request. True that today this happens only to job 
>>>>>>>>> timeout reset sources that are handled form within the 
>>>>>>>>> scheduler and won't use this single work struct but no one 
>>>>>>>>> prevents a future case for this to happen and also, if we 
>>>>>>>>> actually want to unify scheduler time out handlers within 
>>>>>>>>> reset domain (which seems to me the right design approach) we 
>>>>>>>>> won't be able to use just one work struct for this reason anyway.
>>>>>>>>>
>>>>>>>>
>>>>>>>> Just to add to this point - a reset domain is co-operative 
>>>>>>>> domain. In addition to sharing a set of clients from various 
>>>>>>>> reset sources for one device, it also will have a set of 
>>>>>>>> devices like in XGMI hive. The job timeout on one device may 
>>>>>>>> not eventually result in result, but a RAS error happening on 
>>>>>>>> another device at the same time would need a reset. The second 
>>>>>>>> device's RAS error cannot return seeing  that a reset work 
>>>>>>>> already started, or ignore the reset work given that another 
>>>>>>>> device has filled the reset data.
>>>>>>>>
>>>>>>>> When there is a reset domain, it should take care of the work 
>>>>>>>> scheduled and keeping it in device or any other level doesn't 
>>>>>>>> sound good.
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>> Lijo
>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> I'd put the reset work struct into the reset_domain struct. 
>>>>>>>>>> That way you'd have exactly one worker for the reset domain. 
>>>>>>>>>> You could implement a lock-less scheme to decide whether you 
>>>>>>>>>> need to schedule a reset, e.g. using an atomic counter in the 
>>>>>>>>>> shared work struct that gets incremented when a client wants 
>>>>>>>>>> to trigger a reset (atomic_add_return). If that counter is 
>>>>>>>>>> exactly 1 after incrementing, you need to fill in the rest of 
>>>>>>>>>> the work struct and schedule the work. Otherwise, it's 
>>>>>>>>>> already scheduled (or another client is in the process of 
>>>>>>>>>> scheduling it) and you just return. When the worker finishes 
>>>>>>>>>> (after confirming a successful reset), it resets the counter 
>>>>>>>>>> to 0, so the next client requesting a reset will schedule the 
>>>>>>>>>> worker again.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>>   Felix
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Additional to that keep in mind that you can't allocate any 
>>>>>>>>>>>> memory before or during the GPU reset nor wait for the 
>>>>>>>>>>>> reset to complete (so you can't allocate anything on the 
>>>>>>>>>>>> stack either).
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>> allocations - we do it all the time when we call functions, 
>>>>>>>>>>> even during GPU resets, how on stack allocation of work 
>>>>>>>>>>> struct in amdgpu_device_gpu_recover is different from any 
>>>>>>>>>>> other local variable we allocate in any function we call ?
>>>>>>>>>>>
>>>>>>>>>>> I am also not sure why it's not allowed to wait for reset to 
>>>>>>>>>>> complete ? Also, see in amdgpu_ras_do_recovery and 
>>>>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to 
>>>>>>>>>>> complete before he returns. I can probably work around it in 
>>>>>>>>>>> RAS code by calling atomic_set(&ras->in_recovery, 0) from 
>>>>>>>>>>> some callback within actual reset function but regarding 
>>>>>>>>>>> sysfs it actually expects a result returned indicating 
>>>>>>>>>>> whether the call was successful or not.
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>> Also in general seems to me it's cleaner approach where 
>>>>>>>>>>>>> this logic (the work items) are held and handled in 
>>>>>>>>>>>>> reset_domain and are not split in each adev or any other 
>>>>>>>>>>>>> entity. We might want in the future to even move the 
>>>>>>>>>>>>> scheduler handling into reset domain since reset domain is 
>>>>>>>>>>>>> supposed to be a generic things and not only or AMD.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>         tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will reset 
>>>>>>>>>>>>>>>>> now anyway */
>>>>>>>>>>>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, 
>>>>>>>>>>>>>>>>> struct amdgpu_device,
>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset for 
>>>>>>>>>>>>>>>>> SRIOV */
>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>>>>>>>>>>> base);
>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, 
>>>>>>>>>>>>>>>>> base.base.work);
>>>>>>>>>>>>>>>>>         recover_work->ret = 
>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = {.adev 
>>>>>>>>>>>>>>>>> = adev, .job = job};
>>>>>>>>>>>>>>>>>   -    INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>   -    flush_work(&work.base);
>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>> -                        struct work_struct *work)
>>>>>>>>>>>>>>>>> +                        struct 
>>>>>>>>>>>>>>>>> amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 
>>>>>>>>>>>>>>>>> 0)) {
>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>> +                  struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* 
>>>>>>>>>>>>>>>>> vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* 
>>>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct 
>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>
>>>>>
>>>
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-11 13:43                                 ` Andrey Grodzovsky
@ 2022-05-11 13:58                                   ` Christian König
  2022-05-11 15:20                                     ` Lazar, Lijo
  0 siblings, 1 reply; 40+ messages in thread
From: Christian König @ 2022-05-11 13:58 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, Lazar, Lijo,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy

Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
> On 2022-05-11 03:38, Christian König wrote:
>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>> [SNIP]
>>>> E.g. in the reset code (either before or after the reset, that's 
>>>> debatable) you do something like this:
>>>>
>>>> for (i = 0; i < num_ring; ++i)
>>>>     cancel_delayed_work(ring[i]->scheduler....)
>>>> cancel_work(adev->ras_work);
>>>> cancel_work(adev->iofault_work);
>>>> cancel_work(adev->debugfs_work);
>>>> ...
>>>>
>>>> You don't really need to track which reset source has fired and 
>>>> which hasn't, because that would be racy again. Instead just 
>>>> bluntly reset all possible sources.
>>>>
>>>> Christian.
>>>
>>>
>>> I don't say we care if it fired once or twice (I need to add a fix 
>>> to only insert reset work to pending reset list if it's not already 
>>> there), the point of using list (or array) to which you add and from 
>>> which you remove is that the logic of this is encapsulated within 
>>> reset domain. In your way we need to be aware who exactly schedules 
>>> reset work and explicitly cancel them, this also means that for any 
>>> new source added in the future you will need to remember to add him
>>
>> I don't think that this is a valid argument. Additionally to the 
>> schedulers we probably just need less than a handful of reset 
>> sources, most likely even just one or two is enough.
>>
>> The only justification I can see of having additional separate reset 
>> sources would be if somebody wants to know if a specific source has 
>> been handled or not (e.g. call flush_work() or work_pending()). Like 
>> in the case of a reset triggered through debugfs.
>
>
> This is indeed one reason, another is as we said before that if you 
> share 'reset source' (meaning a delayed work) with another client 
> (i.e. RAS and KFD) it means you make assumption that the other client 
> always proceeds with the
> reset exactly the same way as you expect. So today we have this only 
> in scheduler vs non scheduler reset happening - non scheduler reset 
> clients assume the reset is always fully executed in HW while 
> scheduler based reset makes shortcuts and not always does HW reset 
> hence they cannot share 'reset source' (delayed work). Yes, we can 
> always add this in the future if and when such problem will arise but 
> no one will remember this then and a new bug will be introduced and 
> will take time to find and resolve.

Mhm, so your main concern is that we forget to correctly handle the new 
reset sources?

How about we do it like this then:

struct amdgpu_reset_domain {
     ....
     union {
         struct {
             struct work_item debugfs;
             struct work_item ras;
             ....
         };
         struct work_item array[]
     } reset_sources;
}

Not 100% sure if that works, but something like that should do the trick.

My main concern is that I don't want to allocate the work items on the 
stack and dynamic allocation (e.g. kmalloc) is usually not possible either.

Additional to that putting/removing work items from a list, array or 
other container is a very common source for race conditions.

Regards,
Christian.

>
>>> to the cancellation list which you showed above. In current way all 
>>> this done automatically within reset_domain code and it's agnostic 
>>> to specific driver and it's specific list of reset sources. Also in 
>>> case we would want to generalize reset_domain to other GPU drivers 
>>> (which was
>>> a plan as far as i remember) this explicit mention of each reset 
>>> works for cancellation is again less suitable in my opinion.
>>
>> Well we could put the work item for the scheduler independent reset 
>> source into the reset domain as well. But I'm not sure those 
>> additional reset sources should be part of any common handling, that 
>> is largely amdgpu specific.
>
>
> So it's for sure more than one source for the reasons described above, 
> also note that for scheduler we already cancel delayed work in 
> drm_sched_stop so calling them again in amdgpu code kind of superfluous.
>
> Andrey
>
>
>>
>> Christian.
>>
>>>
>>> Andrey
>>>
>>>
>>>>
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>> The only difference is I chose to do the canceling right BEFORE 
>>>>>>> the HW reset and not AFTER. I did this because I see a possible 
>>>>>>> race where a new reset request is being generated exactly after 
>>>>>>> we finished the HW reset but before we canceled out all pending 
>>>>>>> resets - in such case you would not want to cancel this 'border 
>>>>>>> line new' reset request.
>>>>>>
>>>>>> Why not? Any new reset request directly after a hardware reset is 
>>>>>> most likely just falsely generated by the reset itself.
>>>>>>
>>>>>> Ideally I would cancel all sources after the reset, but before 
>>>>>> starting any new work.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>> You can see that if many different reset sources share same 
>>>>>>>>>> work struct what can happen is that the first to obtain the 
>>>>>>>>>> lock you describe bellow might opt out from full HW reset 
>>>>>>>>>> because his bad job did signal for example or because his 
>>>>>>>>>> hunged IP block was able to recover through SW reset but in 
>>>>>>>>>> the meantime another reset source who needed an actual HW 
>>>>>>>>>> reset just silently returned and we end up with unhandled 
>>>>>>>>>> reset request. True that today this happens only to job 
>>>>>>>>>> timeout reset sources that are handled form within the 
>>>>>>>>>> scheduler and won't use this single work struct but no one 
>>>>>>>>>> prevents a future case for this to happen and also, if we 
>>>>>>>>>> actually want to unify scheduler time out handlers within 
>>>>>>>>>> reset domain (which seems to me the right design approach) we 
>>>>>>>>>> won't be able to use just one work struct for this reason 
>>>>>>>>>> anyway.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Just to add to this point - a reset domain is co-operative 
>>>>>>>>> domain. In addition to sharing a set of clients from various 
>>>>>>>>> reset sources for one device, it also will have a set of 
>>>>>>>>> devices like in XGMI hive. The job timeout on one device may 
>>>>>>>>> not eventually result in a reset, but a RAS error happening on 
>>>>>>>>> another device at the same time would need a reset. The second 
>>>>>>>>> device's RAS error cannot return seeing that a reset work 
>>>>>>>>> already started, or ignore the reset work given that another 
>>>>>>>>> device has filled the reset data.
>>>>>>>>>
>>>>>>>>> When there is a reset domain, it should take care of the work 
>>>>>>>>> scheduled and keeping it in device or any other level doesn't 
>>>>>>>>> sound good.
>>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>> Lijo
>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> I'd put the reset work struct into the reset_domain struct. 
>>>>>>>>>>> That way you'd have exactly one worker for the reset domain. 
>>>>>>>>>>> You could implement a lock-less scheme to decide whether you 
>>>>>>>>>>> need to schedule a reset, e.g. using an atomic counter in 
>>>>>>>>>>> the shared work struct that gets incremented when a client 
>>>>>>>>>>> wants to trigger a reset (atomic_add_return). If that 
>>>>>>>>>>> counter is exactly 1 after incrementing, you need to fill in 
>>>>>>>>>>> the rest of the work struct and schedule the work. 
>>>>>>>>>>> Otherwise, it's already scheduled (or another client is in 
>>>>>>>>>>> the process of scheduling it) and you just return. When the 
>>>>>>>>>>> worker finishes (after confirming a successful reset), it 
>>>>>>>>>>> resets the counter to 0, so the next client requesting a 
>>>>>>>>>>> reset will schedule the worker again.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>>   Felix
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Additional to that keep in mind that you can't allocate 
>>>>>>>>>>>>> any memory before or during the GPU reset nor wait for the 
>>>>>>>>>>>>> reset to complete (so you can't allocate anything on the 
>>>>>>>>>>>>> stack either).
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>> allocations - we do it all the time when we call functions, 
>>>>>>>>>>>> even during GPU resets, how on stack allocation of work 
>>>>>>>>>>>> struct in amdgpu_device_gpu_recover is different from any 
>>>>>>>>>>>> other local variable we allocate in any function we call ?
>>>>>>>>>>>>
>>>>>>>>>>>> I am also not sure why it's not allowed to wait for reset 
>>>>>>>>>>>> to complete ? Also, see in amdgpu_ras_do_recovery and 
>>>>>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to 
>>>>>>>>>>>> complete before he returns. I can probably work around it 
>>>>>>>>>>>> in RAS code by calling atomic_set(&ras->in_recovery, 0) 
>>>>>>>>>>>> from some callback within actual reset function but 
>>>>>>>>>>>> regarding sysfs it actually expects a result returned 
>>>>>>>>>>>> indicating whether the call was successful or not.
>>>>>>>>>>>>
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach where 
>>>>>>>>>>>>>> this logic (the work items) are held and handled in 
>>>>>>>>>>>>>> reset_domain and are not split in each adev or any other 
>>>>>>>>>>>>>> entity. We might want in the future to even move the 
>>>>>>>>>>>>>> scheduler handling into reset domain since reset domain 
>>>>>>>>>>>>>> is supposed to be a generic things and not only or AMD.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device 
>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>         tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will reset 
>>>>>>>>>>>>>>>>>> now anyway */
>>>>>>>>>>>>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, 
>>>>>>>>>>>>>>>>>> struct amdgpu_device,
>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset for 
>>>>>>>>>>>>>>>>>> SRIOV */
>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device 
>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct *recover_work 
>>>>>>>>>>>>>>>>>> = container_of(work, struct 
>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct *recover_work 
>>>>>>>>>>>>>>>>>> = container_of(work, struct 
>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>         recover_work->ret = 
>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = {.adev 
>>>>>>>>>>>>>>>>>> = adev, .job = job};
>>>>>>>>>>>>>>>>>>   -    INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>> -                        struct work_struct *work)
>>>>>>>>>>>>>>>>>> +                        struct 
>>>>>>>>>>>>>>>>>> amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 
>>>>>>>>>>>>>>>>>> 0)) {
>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>> +                  struct amdgpu_reset_work_struct 
>>>>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* 
>>>>>>>>>>>>>>>>>> vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* 
>>>>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>
>>>>>>
>>>>
>>


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-11 13:58                                   ` Christian König
@ 2022-05-11 15:20                                     ` Lazar, Lijo
  2022-05-11 15:35                                       ` Andrey Grodzovsky
  0 siblings, 1 reply; 40+ messages in thread
From: Lazar, Lijo @ 2022-05-11 15:20 UTC (permalink / raw)
  To: Christian König, Andrey Grodzovsky, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy



On 5/11/2022 7:28 PM, Christian König wrote:
> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>> On 2022-05-11 03:38, Christian König wrote:
>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>> [SNIP]
>>>>> E.g. in the reset code (either before or after the reset, that's 
>>>>> debatable) you do something like this:
>>>>>
>>>>> for (i = 0; i < num_ring; ++i)
>>>>>     cancel_delayed_work(ring[i]->scheduler....)
>>>>> cancel_work(adev->ras_work);
>>>>> cancel_work(adev->iofault_work);
>>>>> cancel_work(adev->debugfs_work);
>>>>> ...
>>>>>
>>>>> You don't really need to track which reset source has fired and 
>>>>> which hasn't, because that would be racy again. Instead just 
>>>>> bluntly reset all possible sources.
>>>>>
>>>>> Christian.
>>>>
>>>>
>>>> I don't say we care if it fired once or twice (I need to add a fix 
>>>> to only insert reset work to pending reset list if it's not already 
>>>> there), the point of using list (or array) to which you add and from 
>>>> which you remove is that the logic of this is encapsulated within 
>>>> reset domain. In your way we need to be aware who exactly schedules 
>>>> reset work and explicitly cancel them, this also means that for any 
>>>> new source added in the future you will need to remember to add him
>>>
>>> I don't think that this is a valid argument. Additionally to the 
>>> schedulers we probably just need less than a handful of reset 
>>> sources, most likely even just one or two is enough.
>>>
>>> The only justification I can see of having additional separate reset 
>>> sources would be if somebody wants to know if a specific source has 
>>> been handled or not (e.g. call flush_work() or work_pending()). Like 
>>> in the case of a reset triggered through debugfs.
>>
>>
>> This is indeed one reason, another is as we said before that if you 
>> share 'reset source' (meaning a delayed work) with another client 
>> (i.e. RAS and KFD) it means you make assumption that the other client 
>> always proceeds with the
>> reset exactly the same way as you expect. So today we have this only 
>> in scheduler vs non scheduler reset happening - non scheduler reset 
>> clients assume the reset is always fully executed in HW while 
>> scheduler based reset makes shortcuts and not always does HW reset 
>> hence they cannot share 'reset source' (delayed work). Yes, we can 
>> always add this in the future if and when such problem will arise but 
>> no one will remember this then and a new bug will be introduced and 
>> will take time to find and resolve.
> 
> Mhm, so your main concern is that we forget to correctly handle the new 
> reset sources?
> 
> How about we do it like this then:
> 
> struct amdgpu_reset_domain {
>      ....
>      union {
>          struct {
>              struct work_item debugfs;
>              struct work_item ras;
>              ....
>          };
>          struct work_item array[]
>      } reset_sources;
> }
> 

If it's only about static array,

enum amdgpu_reset_source {

AMDGPU_RESET_SRC_RAS,
AMDGPU_RESET_SRC_ABC,
.....
AMDGPU_RESET_SRC_XYZ,
AMDGPU_RESET_SRC_MAX

};

struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for 
each work item


Thanks,
Lijo

> Not 100% sure if that works, but something like that should do the trick.
> 
> My main concern is that I don't want to allocate the work items on the 
> stack and dynamic allocation (e.g. kmalloc) is usually not possible either.
> 
> Additional to that putting/removing work items from a list, array or 
> other container is a very common source for race conditions.
> 
> Regards,
> Christian.
> 
>>
>>>> to the cancellation list which you showed above. In current way all 
>>>> this done automatically within reset_domain code and it's agnostic 
>>>> to specific driver and it's specific list of reset sources. Also in 
>>>> case we would want to generalize reset_domain to other GPU drivers 
>>>> (which was
>>>> a plan as far as i remember) this explicit mention of each reset 
>>>> works for cancellation is again less suitable in my opinion.
>>>
>>> Well we could put the work item for the scheduler independent reset 
>>> source into the reset domain as well. But I'm not sure those 
>>> additional reset sources should be part of any common handling, that 
>>> is largely amdgpu specific.
>>
>>
>> So it's for sure more than one source for the reasons described above, 
>> also note that for scheduler we already cancel delayed work in 
>> drm_sched_stop so calling them again in amdgpu code is kind of superfluous.
>>
>> Andrey
>>
>>
>>>
>>> Christian.
>>>
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> The only difference is I chose to do the canceling right BEFORE 
>>>>>>>> the HW reset and not AFTER. I did this because I see a possible 
>>>>>>>> race where a new reset request is being generated exactly after 
>>>>>>>> we finished the HW reset but before we canceled out all pending 
>>>>>>>> resets - in such case you would not want to cancel this 'border 
>>>>>>>> line new' reset request.
>>>>>>>
>>>>>>> Why not? Any new reset request directly after a hardware reset is 
>>>>>>> most likely just falsely generated by the reset itself.
>>>>>>>
>>>>>>> Ideally I would cancel all sources after the reset, but before 
>>>>>>> starting any new work.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>>> You can see that if many different reset sources share same 
>>>>>>>>>>> work struct what can happen is that the first to obtain the 
>>>>>>>>>>> lock you describe bellow might opt out from full HW reset 
>>>>>>>>>>> because his bad job did signal for example or because his 
>>>>>>>>>>> hunged IP block was able to recover through SW reset but in 
>>>>>>>>>>> the meantime another reset source who needed an actual HW 
>>>>>>>>>>> reset just silently returned and we end up with unhandled 
>>>>>>>>>>> reset request. True that today this happens only to job 
>>>>>>>>>>> timeout reset sources that are handled form within the 
>>>>>>>>>>> scheduler and won't use this single work struct but no one 
>>>>>>>>>>> prevents a future case for this to happen and also, if we 
>>>>>>>>>>> actually want to unify scheduler time out handlers within 
>>>>>>>>>>> reset domain (which seems to me the right design approach) we 
>>>>>>>>>>> won't be able to use just one work struct for this reason 
>>>>>>>>>>> anyway.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Just to add to this point - a reset domain is co-operative 
>>>>>>>>>> domain. In addition to sharing a set of clients from various 
>>>>>>>>>> reset sources for one device, it also will have a set of 
>>>>>>>>>> devices like in XGMI hive. The job timeout on one device may 
>>>>>>>>>> not eventually result in result, but a RAS error happening on 
>>>>>>>>>> another device at the same time would need a reset. The second 
>>>>>>>>>> device's RAS error cannot return seeing  that a reset work 
>>>>>>>>>> already started, or ignore the reset work given that another 
>>>>>>>>>> device has filled the reset data.
>>>>>>>>>>
>>>>>>>>>> When there is a reset domain, it should take care of the work 
>>>>>>>>>> scheduled and keeping it in device or any other level doesn't 
>>>>>>>>>> sound good.
>>>>>>>>>>
>>>>>>>>>> Thanks,
>>>>>>>>>> Lijo
>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> I'd put the reset work struct into the reset_domain struct. 
>>>>>>>>>>>> That way you'd have exactly one worker for the reset domain. 
>>>>>>>>>>>> You could implement a lock-less scheme to decide whether you 
>>>>>>>>>>>> need to schedule a reset, e.g. using an atomic counter in 
>>>>>>>>>>>> the shared work struct that gets incremented when a client 
>>>>>>>>>>>> wants to trigger a reset (atomic_add_return). If that 
>>>>>>>>>>>> counter is exactly 1 after incrementing, you need to fill in 
>>>>>>>>>>>> the rest of the work struct and schedule the work. 
>>>>>>>>>>>> Otherwise, it's already scheduled (or another client is in 
>>>>>>>>>>>> the process of scheduling it) and you just return. When the 
>>>>>>>>>>>> worker finishes (after confirming a successful reset), it 
>>>>>>>>>>>> resets the counter to 0, so the next client requesting a 
>>>>>>>>>>>> reset will schedule the worker again.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>>   Felix
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Additional to that keep in mind that you can't allocate 
>>>>>>>>>>>>>> any memory before or during the GPU reset nor wait for the 
>>>>>>>>>>>>>> reset to complete (so you can't allocate anything on the 
>>>>>>>>>>>>>> stack either).
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>> allocations - we do it all the time when we call functions, 
>>>>>>>>>>>>> even during GPU resets, how on stack allocation of work 
>>>>>>>>>>>>> struct in amdgpu_device_gpu_recover is different from any 
>>>>>>>>>>>>> other local variable we allocate in any function we call ?
>>>>>>>>>>>>>
>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for reset 
>>>>>>>>>>>>> to complete ? Also, see in amdgpu_ras_do_recovery and 
>>>>>>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to 
>>>>>>>>>>>>> complete before he returns. I can probably work around it 
>>>>>>>>>>>>> in RAS code by calling atomic_set(&ras->in_recovery, 0) 
>>>>>>>>>>>>> from some callback within actual reset function but 
>>>>>>>>>>>>> regarding sysfs it actually expects a result returned 
>>>>>>>>>>>>> indicating whether the call was successful or not.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach where 
>>>>>>>>>>>>>>> this logic (the work items) are held and handled in 
>>>>>>>>>>>>>>> reset_domain and are not split in each adev or any other 
>>>>>>>>>>>>>>> entity. We might want in the future to even move the 
>>>>>>>>>>>>>>> scheduler handling into reset domain since reset domain 
>>>>>>>>>>>>>>> is supposed to be a generic things and not only or AMD.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>         tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will reset 
>>>>>>>>>>>>>>>>>>> now anyway */
>>>>>>>>>>>>>>>>>>> +    tmp_adev = list_first_entry(device_list_handle, 
>>>>>>>>>>>>>>>>>>> struct amdgpu_device,
>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset for 
>>>>>>>>>>>>>>>>>>> SRIOV */
>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct *recover_work 
>>>>>>>>>>>>>>>>>>> = container_of(work, struct 
>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct *recover_work 
>>>>>>>>>>>>>>>>>>> = container_of(work, struct 
>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>         recover_work->ret = 
>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = {.adev 
>>>>>>>>>>>>>>>>>>> = adev, .job = job};
>>>>>>>>>>>>>>>>>>>   -    INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>> -                        struct work_struct *work)
>>>>>>>>>>>>>>>>>>> +                        struct 
>>>>>>>>>>>>>>>>>>> amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, &work->base, 
>>>>>>>>>>>>>>>>>>> 0)) {
>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>> +                  struct amdgpu_reset_work_struct 
>>>>>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* 
>>>>>>>>>>>>>>>>>>> vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* 
>>>>>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>>>>>>>>> struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>
>>>>>>>
>>>>>
>>>
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-11 15:20                                     ` Lazar, Lijo
@ 2022-05-11 15:35                                       ` Andrey Grodzovsky
  2022-05-11 15:37                                         ` Lazar, Lijo
  2022-05-11 15:39                                         ` Christian König
  0 siblings, 2 replies; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-11 15:35 UTC (permalink / raw)
  To: Lazar, Lijo, Christian König, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy


On 2022-05-11 11:20, Lazar, Lijo wrote:
>
>
> On 5/11/2022 7:28 PM, Christian König wrote:
>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>>> On 2022-05-11 03:38, Christian König wrote:
>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>>> [SNIP]
>>>>>> E.g. in the reset code (either before or after the reset, that's 
>>>>>> debatable) you do something like this:
>>>>>>
>>>>>> for (i = 0; i < num_ring; ++i)
>>>>>>     cancel_delayed_work(ring[i]->scheduler....)
>>>>>> cancel_work(adev->ras_work);
>>>>>> cancel_work(adev->iofault_work);
>>>>>> cancel_work(adev->debugfs_work);
>>>>>> ...
>>>>>>
>>>>>> You don't really need to track which reset source has fired and 
>>>>>> which hasn't, because that would be racy again. Instead just 
>>>>>> bluntly reset all possible sources.
>>>>>>
>>>>>> Christian.
>>>>>
>>>>>
>>>>> I don't say we care if it fired once or twice (I need to add a fix 
>>>>> to only insert reset work to pending reset list if it's not 
>>>>> already there), the point of using list (or array) to which you 
>>>>> add and from which you remove is that the logic of this is 
>>>>> encapsulated within reset domain. In your way we need to be aware 
>>>>> who exactly schedules reset work and explicitly cancel them, this 
>>>>> also means that for any new source added in the future you will 
>>>>> need to remember to add him
>>>>
>>>> I don't think that this is a valid argument. Additionally to the 
>>>> schedulers we probably just need less than a handful of reset 
>>>> sources, most likely even just one or two is enough.
>>>>
>>>> The only justification I can see of having additional separate 
>>>> reset sources would be if somebody wants to know if a specific 
>>>> source has been handled or not (e.g. call flush_work() or 
>>>> work_pending()). Like in the case of a reset triggered through 
>>>> debugfs.
>>>
>>>
>>> This is indeed one reason, another is as we said before that if you 
>>> share 'reset source' (meaning a delayed work) with another client 
>>> (i.e. RAS and KFD) it means you make assumption that the other 
>>> client always proceeds with the
>>> reset exactly the same way as you expect. So today we have this only 
>>> in scheduler vs non scheduler reset happening - non scheduler reset 
>>> clients assume the reset is always fully executed in HW while 
>>> scheduler based reset makes shortcuts and not always does HW reset 
>>> hence they cannot share 'reset source' (delayed work). Yes, we can 
>>> always add this in the future if and when such problem will arise 
>>> but no one will remember this then and a new bug will be introduced 
>>> and will take time to find and resolve.
>>
>> Mhm, so your main concern is that we forget to correctly handle the 
>> new reset sources?
>>
>> How about we do it like this then:
>>
>> struct amdgpu_reset_domain {
>>      ....
>>      union {
>>          struct {
>>              struct work_item debugfs;
>>              struct work_item ras;
>>              ....
>>          };
>>          struct work_item array[]
>>      } reset_sources;
>> }
>>
>
> If it's only about static array,
>
> enum amdgpu_reset_soruce {
>
> AMDGPU_RESET_SRC_RAS,
> AMDGPU_RESET_SRC_ABC,
> .....
> AMDGPU_RESET_SRC_XYZ,
> AMDGPU_RESET_SRC_MAX
>
> };
>
> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for 
> each work item
>
>
> Thanks,
> Lijo


It's possible, though it makes it harder to generalize reset_domain later 
for other drivers.
But there is still one caveat: look at amdgpu_recover_work_struct and its usage 
in amdgpu_device_gpu_recover and in gpu_recover_get.
At least for debugfs I need to return the result of the GPU reset, and 
so I cannot store actual work items in the array mentioned above
but rather pointers to work_item, because I need a way to get back the 
return value from gpu_recover like I do now in amdgpu_device_gpu_recover.

Andrey


>
>> Not 100% sure if that works, but something like that should do the 
>> trick.
>>
>> My main concern is that I don't want to allocate the work items on 
>> the stack and dynamic allocation (e.g. kmalloc) is usually not 
>> possible either.
>>
>> Additional to that putting/removing work items from a list, array or 
>> other container is a very common source for race conditions.
>>
>> Regards,
>> Christian.
>>
>>>
>>>>> to the cancellation list which you showed above. In current way 
>>>>> all this done automatically within reset_domain code and it's 
>>>>> agnostic to specific driver and it's specific list of reset 
>>>>> sources. Also in case we would want to generalize reset_domain to 
>>>>> other GPU drivers (which was
>>>>> a plan as far as i remember) this explicit mention of each reset 
>>>>> works for cancellation is again less suitable in my opinion.
>>>>
>>>> Well we could put the work item for the scheduler independent reset 
>>>> source into the reset domain as well. But I'm not sure those 
>>>> additional reset sources should be part of any common handling, 
>>>> that is largely amdgpu specific.
>>>
>>>
>>> So it's for sure more then one source for the reasons described 
>>> above, also note that for scheduler we already cancel delayed work 
>>> in drm_sched_stop so calling them again in amdgpu code kind of 
>>> superfluous.
>>>
>>> Andrey
>>>
>>>
>>>>
>>>> Christian.
>>>>
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see a 
>>>>>>>>> possible race where a new reset request is being generated 
>>>>>>>>> exactly after we finished the HW reset but before we canceled 
>>>>>>>>> out all pending resets - in such case you wold not want to 
>>>>>>>>> cancel this 'border line new' reset request.
>>>>>>>>
>>>>>>>> Why not? Any new reset request directly after a hardware reset 
>>>>>>>> is most likely just falsely generated by the reset itself.
>>>>>>>>
>>>>>>>> Ideally I would cancel all sources after the reset, but before 
>>>>>>>> starting any new work.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>>> You can see that if many different reset sources share same 
>>>>>>>>>>>> work struct what can happen is that the first to obtain the 
>>>>>>>>>>>> lock you describe bellow might opt out from full HW reset 
>>>>>>>>>>>> because his bad job did signal for example or because his 
>>>>>>>>>>>> hunged IP block was able to recover through SW reset but in 
>>>>>>>>>>>> the meantime another reset source who needed an actual HW 
>>>>>>>>>>>> reset just silently returned and we end up with unhandled 
>>>>>>>>>>>> reset request. True that today this happens only to job 
>>>>>>>>>>>> timeout reset sources that are handled form within the 
>>>>>>>>>>>> scheduler and won't use this single work struct but no one 
>>>>>>>>>>>> prevents a future case for this to happen and also, if we 
>>>>>>>>>>>> actually want to unify scheduler time out handlers within 
>>>>>>>>>>>> reset domain (which seems to me the right design approach) 
>>>>>>>>>>>> we won't be able to use just one work struct for this 
>>>>>>>>>>>> reason anyway.
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Just to add to this point - a reset domain is co-operative 
>>>>>>>>>>> domain. In addition to sharing a set of clients from various 
>>>>>>>>>>> reset sources for one device, it also will have a set of 
>>>>>>>>>>> devices like in XGMI hive. The job timeout on one device may 
>>>>>>>>>>> not eventually result in result, but a RAS error happening 
>>>>>>>>>>> on another device at the same time would need a reset. The 
>>>>>>>>>>> second device's RAS error cannot return seeing  that a reset 
>>>>>>>>>>> work already started, or ignore the reset work given that 
>>>>>>>>>>> another device has filled the reset data.
>>>>>>>>>>>
>>>>>>>>>>> When there is a reset domain, it should take care of the 
>>>>>>>>>>> work scheduled and keeping it in device or any other level 
>>>>>>>>>>> doesn't sound good.
>>>>>>>>>>>
>>>>>>>>>>> Thanks,
>>>>>>>>>>> Lijo
>>>>>>>>>>>
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the 
>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to 
>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using an 
>>>>>>>>>>>>> atomic counter in the shared work struct that gets 
>>>>>>>>>>>>> incremented when a client wants to trigger a reset 
>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after 
>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work 
>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already 
>>>>>>>>>>>>> scheduled (or another client is in the process of 
>>>>>>>>>>>>> scheduling it) and you just return. When the worker 
>>>>>>>>>>>>> finishes (after confirming a successful reset), it resets 
>>>>>>>>>>>>> the counter to 0, so the next client requesting a reset 
>>>>>>>>>>>>> will schedule the worker again.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Additional to that keep in mind that you can't allocate 
>>>>>>>>>>>>>>> any memory before or during the GPU reset nor wait for 
>>>>>>>>>>>>>>> the reset to complete (so you can't allocate anything on 
>>>>>>>>>>>>>>> the stack either).
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover is 
>>>>>>>>>>>>>> different from any other local variable we allocate in 
>>>>>>>>>>>>>> any function we call ?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for reset 
>>>>>>>>>>>>>> to complete ? Also, see in amdgpu_ras_do_recovery and 
>>>>>>>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset 
>>>>>>>>>>>>>> to complete before he returns. I can probably work around 
>>>>>>>>>>>>>> it in RAS code by calling atomic_set(&ras->in_recovery, 
>>>>>>>>>>>>>> 0) from some callback within actual reset function but 
>>>>>>>>>>>>>> regarding sysfs it actually expects a result returned 
>>>>>>>>>>>>>> indicating whether the call was successful or not.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach where 
>>>>>>>>>>>>>>>> this logic (the work items) are held and handled in 
>>>>>>>>>>>>>>>> reset_domain and are not split in each adev or any 
>>>>>>>>>>>>>>>> other entity. We might want in the future to even move 
>>>>>>>>>>>>>>>> the scheduler handling into reset domain since reset 
>>>>>>>>>>>>>>>> domain is supposed to be a generic things and not only 
>>>>>>>>>>>>>>>> or AMD.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will reset 
>>>>>>>>>>>>>>>>>>>> now anyway */
>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset 
>>>>>>>>>>>>>>>>>>>> for SRIOV */
>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>         recover_work->ret = 
>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>> +                  struct amdgpu_reset_work_struct 
>>>>>>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) 
>>>>>>>>>>>>>>>>>>>> /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* 
>>>>>>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>
>>>>>>
>>>>
>>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-11 15:35                                       ` Andrey Grodzovsky
@ 2022-05-11 15:37                                         ` Lazar, Lijo
  2022-05-11 15:43                                           ` Andrey Grodzovsky
  2022-05-11 15:39                                         ` Christian König
  1 sibling, 1 reply; 40+ messages in thread
From: Lazar, Lijo @ 2022-05-11 15:37 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy



On 5/11/2022 9:05 PM, Andrey Grodzovsky wrote:
> 
> On 2022-05-11 11:20, Lazar, Lijo wrote:
>>
>>
>> On 5/11/2022 7:28 PM, Christian König wrote:
>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>>>> On 2022-05-11 03:38, Christian König wrote:
>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>>>> [SNIP]
>>>>>>> E.g. in the reset code (either before or after the reset, that's 
>>>>>>> debatable) you do something like this:
>>>>>>>
>>>>>>> for (i = 0; i < num_ring; ++i)
>>>>>>>     cancel_delayed_work(ring[i]->scheduler....)
>>>>>>> cancel_work(adev->ras_work);
>>>>>>> cancel_work(adev->iofault_work);
>>>>>>> cancel_work(adev->debugfs_work);
>>>>>>> ...
>>>>>>>
>>>>>>> You don't really need to track which reset source has fired and 
>>>>>>> which hasn't, because that would be racy again. Instead just 
>>>>>>> bluntly reset all possible sources.
>>>>>>>
>>>>>>> Christian.
>>>>>>
>>>>>>
>>>>>> I don't say we care if it fired once or twice (I need to add a fix 
>>>>>> to only insert reset work to pending reset list if it's not 
>>>>>> already there), the point of using list (or array) to which you 
>>>>>> add and from which you remove is that the logic of this is 
>>>>>> encapsulated within reset domain. In your way we need to be aware 
>>>>>> who exactly schedules reset work and explicitly cancel them, this 
>>>>>> also means that for any new source added in the future you will 
>>>>>> need to remember to add him
>>>>>
>>>>> I don't think that this is a valid argument. Additionally to the 
>>>>> schedulers we probably just need less than a handful of reset 
>>>>> sources, most likely even just one or two is enough.
>>>>>
>>>>> The only justification I can see of having additional separate 
>>>>> reset sources would be if somebody wants to know if a specific 
>>>>> source has been handled or not (e.g. call flush_work() or 
>>>>> work_pending()). Like in the case of a reset triggered through 
>>>>> debugfs.
>>>>
>>>>
>>>> This is indeed one reason, another is as we said before that if you 
>>>> share 'reset source' (meaning a delayed work) with another client 
>>>> (i.e. RAS and KFD) it means you make assumption that the other 
>>>> client always proceeds with the
>>>> reset exactly the same way as you expect. So today we have this only 
>>>> in scheduler vs non scheduler reset happening - non scheduler reset 
>>>> clients assume the reset is always fully executed in HW while 
>>>> scheduler based reset makes shortcuts and not always does HW reset 
>>>> hence they cannot share 'reset source' (delayed work). Yes, we can 
>>>> always add this in the future if and when such problem will arise 
>>>> but no one will remember this then and a new bug will be introduced 
>>>> and will take time to find and resolve.
>>>
>>> Mhm, so your main concern is that we forget to correctly handle the 
>>> new reset sources?
>>>
>>> How about we do it like this then:
>>>
>>> struct amdgpu_reset_domain {
>>>      ....
>>>      union {
>>>          struct {
>>>              struct work_item debugfs;
>>>              struct work_item ras;
>>>              ....
>>>          };
>>>          struct work_item array[]
>>>      } reset_sources;
>>> }
>>>
>>
>> If it's only about static array,
>>
>> enum amdgpu_reset_soruce {
>>
>> AMDGPU_RESET_SRC_RAS,
>> AMDGPU_RESET_SRC_ABC,
>> .....
>> AMDGPU_RESET_SRC_XYZ,
>> AMDGPU_RESET_SRC_MAX
>>
>> };
>>
>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for 
>> each work item
>>
>>
>> Thanks,
>> Lijo
> 
> 
> It's possible though it makes harder to generalize reset_domain later 
> for other drivers.

The current reset domain queue design is not good for a hierarchical 
reset within amdgpu itself :)

Thanks,
Lijo

> But still one caveat, look at amdgpu_recover_work_struct and it's usage 
> in amdgpu_device_gpu_recover and in gpu_recover_get,
> At least for debugfs i need to return back the result of GPU reset and 
> so I cannot store actual work items in the array mentioned above
> but rather pointers to work_item because i need a way to get back the 
> return value from gpu_recover like I do now in amdgpu_device_gpu_recover.
> 
> Andrey
> 
> 
>>
>>> Not 100% sure if that works, but something like that should do the 
>>> trick.
>>>
>>> My main concern is that I don't want to allocate the work items on 
>>> the stack and dynamic allocation (e.g. kmalloc) is usually not 
>>> possible either.
>>>
>>> Additional to that putting/removing work items from a list, array or 
>>> other container is a very common source for race conditions.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>>>> to the cancellation list which you showed above. In current way 
>>>>>> all this done automatically within reset_domain code and it's 
>>>>>> agnostic to specific driver and it's specific list of reset 
>>>>>> sources. Also in case we would want to generalize reset_domain to 
>>>>>> other GPU drivers (which was
>>>>>> a plan as far as i remember) this explicit mention of each reset 
>>>>>> works for cancellation is again less suitable in my opinion.
>>>>>
>>>>> Well we could put the work item for the scheduler independent reset 
>>>>> source into the reset domain as well. But I'm not sure those 
>>>>> additional reset sources should be part of any common handling, 
>>>>> that is largely amdgpu specific.
>>>>
>>>>
>>>> So it's for sure more then one source for the reasons described 
>>>> above, also note that for scheduler we already cancel delayed work 
>>>> in drm_sched_stop so calling them again in amdgpu code kind of 
>>>> superfluous.
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>> Christian.
>>>>>
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see a 
>>>>>>>>>> possible race where a new reset request is being generated 
>>>>>>>>>> exactly after we finished the HW reset but before we canceled 
>>>>>>>>>> out all pending resets - in such case you wold not want to 
>>>>>>>>>> cancel this 'border line new' reset request.
>>>>>>>>>
>>>>>>>>> Why not? Any new reset request directly after a hardware reset 
>>>>>>>>> is most likely just falsely generated by the reset itself.
>>>>>>>>>
>>>>>>>>> Ideally I would cancel all sources after the reset, but before 
>>>>>>>>> starting any new work.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>>> You can see that if many different reset sources share same 
>>>>>>>>>>>>> work struct what can happen is that the first to obtain the 
>>>>>>>>>>>>> lock you describe bellow might opt out from full HW reset 
>>>>>>>>>>>>> because his bad job did signal for example or because his 
>>>>>>>>>>>>> hunged IP block was able to recover through SW reset but in 
>>>>>>>>>>>>> the meantime another reset source who needed an actual HW 
>>>>>>>>>>>>> reset just silently returned and we end up with unhandled 
>>>>>>>>>>>>> reset request. True that today this happens only to job 
>>>>>>>>>>>>> timeout reset sources that are handled form within the 
>>>>>>>>>>>>> scheduler and won't use this single work struct but no one 
>>>>>>>>>>>>> prevents a future case for this to happen and also, if we 
>>>>>>>>>>>>> actually want to unify scheduler time out handlers within 
>>>>>>>>>>>>> reset domain (which seems to me the right design approach) 
>>>>>>>>>>>>> we won't be able to use just one work struct for this 
>>>>>>>>>>>>> reason anyway.
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Just to add to this point - a reset domain is co-operative 
>>>>>>>>>>>> domain. In addition to sharing a set of clients from various 
>>>>>>>>>>>> reset sources for one device, it also will have a set of 
>>>>>>>>>>>> devices like in XGMI hive. The job timeout on one device may 
>>>>>>>>>>>> not eventually result in result, but a RAS error happening 
>>>>>>>>>>>> on another device at the same time would need a reset. The 
>>>>>>>>>>>> second device's RAS error cannot return seeing  that a reset 
>>>>>>>>>>>> work already started, or ignore the reset work given that 
>>>>>>>>>>>> another device has filled the reset data.
>>>>>>>>>>>>
>>>>>>>>>>>> When there is a reset domain, it should take care of the 
>>>>>>>>>>>> work scheduled and keeping it in device or any other level 
>>>>>>>>>>>> doesn't sound good.
>>>>>>>>>>>>
>>>>>>>>>>>> Thanks,
>>>>>>>>>>>> Lijo
>>>>>>>>>>>>
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the 
>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to 
>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using an 
>>>>>>>>>>>>>> atomic counter in the shared work struct that gets 
>>>>>>>>>>>>>> incremented when a client wants to trigger a reset 
>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after 
>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work 
>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already 
>>>>>>>>>>>>>> scheduled (or another client is in the process of 
>>>>>>>>>>>>>> scheduling it) and you just return. When the worker 
>>>>>>>>>>>>>> finishes (after confirming a successful reset), it resets 
>>>>>>>>>>>>>> the counter to 0, so the next client requesting a reset 
>>>>>>>>>>>>>> will schedule the worker again.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't allocate 
>>>>>>>>>>>>>>>> any memory before or during the GPU reset nor wait for 
>>>>>>>>>>>>>>>> the reset to complete (so you can't allocate anything on 
>>>>>>>>>>>>>>>> the stack either).
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover is 
>>>>>>>>>>>>>>> different from any other local variable we allocate in 
>>>>>>>>>>>>>>> any function we call ?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for reset 
>>>>>>>>>>>>>>> to complete ? Also, see in amdgpu_ras_do_recovery and 
>>>>>>>>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset 
>>>>>>>>>>>>>>> to complete before he returns. I can probably work around 
>>>>>>>>>>>>>>> it in RAS code by calling atomic_set(&ras->in_recovery, 
>>>>>>>>>>>>>>> 0) from some callback within actual reset function but 
>>>>>>>>>>>>>>> regarding sysfs it actually expects a result returned 
>>>>>>>>>>>>>>> indicating whether the call was successful or not.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach where 
>>>>>>>>>>>>>>>>> this logic (the work items) are held and handled in 
>>>>>>>>>>>>>>>>> reset_domain and are not split in each adev or any 
>>>>>>>>>>>>>>>>> other entity. We might want in the future to even move 
>>>>>>>>>>>>>>>>> the scheduler handling into reset domain since reset 
>>>>>>>>>>>>>>>>> domain is supposed to be a generic things and not only 
>>>>>>>>>>>>>>>>> or AMD.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will reset 
>>>>>>>>>>>>>>>>>>>>> now anyway */
>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset 
>>>>>>>>>>>>>>>>>>>>> for SRIOV */
>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>>         recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>> +                  struct amdgpu_reset_work_struct 
>>>>>>>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) 
>>>>>>>>>>>>>>>>>>>>> /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* 
>>>>>>>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>
>>>>>>>
>>>>>
>>>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-11 15:35                                       ` Andrey Grodzovsky
  2022-05-11 15:37                                         ` Lazar, Lijo
@ 2022-05-11 15:39                                         ` Christian König
  2022-05-11 15:57                                           ` Andrey Grodzovsky
  2022-05-11 20:27                                           ` Andrey Grodzovsky
  1 sibling, 2 replies; 40+ messages in thread
From: Christian König @ 2022-05-11 15:39 UTC (permalink / raw)
  To: Andrey Grodzovsky, Lazar, Lijo, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy

Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky:
> On 2022-05-11 11:20, Lazar, Lijo wrote:
>>
>>
>> On 5/11/2022 7:28 PM, Christian König wrote:
>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>>>> On 2022-05-11 03:38, Christian König wrote:
>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>>>> [SNIP]
>>>>>>> E.g. in the reset code (either before or after the reset, that's 
>>>>>>> debatable) you do something like this:
>>>>>>>
>>>>>>> for (i = 0; i < num_ring; ++i)
>>>>>>>     cancel_delayed_work(ring[i]->scheduler....)
>>>>>>> cancel_work(adev->ras_work);
>>>>>>> cancel_work(adev->iofault_work);
>>>>>>> cancel_work(adev->debugfs_work);
>>>>>>> ...
>>>>>>>
>>>>>>> You don't really need to track which reset source has fired and 
>>>>>>> which hasn't, because that would be racy again. Instead just 
>>>>>>> bluntly reset all possible sources.
>>>>>>>
>>>>>>> Christian.
>>>>>>
>>>>>>
>>>>>> I don't say we care if it fired once or twice (I need to add a 
>>>>>> fix to only insert reset work to pending reset list if it's not 
>>>>>> already there), the point of using list (or array) to which you 
>>>>>> add and from which you remove is that the logic of this is 
>>>>>> encapsulated within reset domain. In your way we need to be aware 
>>>>>> who exactly schedules reset work and explicitly cancel them, this 
>>>>>> also means that for any new source added in the future you will 
>>>>>> need to remember to add it
>>>>>
>>>>> I don't think that this is a valid argument. Additionally to the 
>>>>> schedulers we probably just need less than a handful of reset 
>>>>> sources, most likely even just one or two is enough.
>>>>>
>>>>> The only justification I can see of having additional separate 
>>>>> reset sources would be if somebody wants to know if a specific 
>>>>> source has been handled or not (e.g. call flush_work() or 
>>>>> work_pending()). Like in the case of a reset triggered through 
>>>>> debugfs.
>>>>
>>>>
>>>> This is indeed one reason, another is as we said before that if you 
>>>> share 'reset source' (meaning a delayed work) with another client 
>>>> (i.e. RAS and KFD) it means you make the assumption that the other 
>>>> client always proceeds with the
>>>> reset exactly the same way as you expect. So today we have this 
>>>> only in scheduler vs non scheduler reset happening - non scheduler 
>>>> reset clients assume the reset is always fully executed in HW while 
>>>> scheduler based reset makes shortcuts and not always does HW reset 
>>>> hence they cannot share 'reset source' (delayed work). Yes, we can 
>>>> always add this in the future if and when such a problem arises 
>>>> but no one will remember this then and a new bug will be introduced 
>>>> and will take time to find and resolve.
>>>
>>> Mhm, so your main concern is that we forget to correctly handle the 
>>> new reset sources?
>>>
>>> How about we do it like this then:
>>>
>>> struct amdgpu_reset_domain {
>>>      ....
>>>      union {
>>>          struct {
>>>              struct work_item debugfs;
>>>              struct work_item ras;
>>>              ....
>>>          };
>>>          struct work_item array[]
>>>      } reset_sources;
>>> }
>>>
>>
>> If it's only about static array,
>>
>> enum amdgpu_reset_source {
>>
>> AMDGPU_RESET_SRC_RAS,
>> AMDGPU_RESET_SRC_ABC,
>> .....
>> AMDGPU_RESET_SRC_XYZ,
>> AMDGPU_RESET_SRC_MAX
>>
>> };
>>
>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for 
>> each work item
>>
>>
>> Thanks,
>> Lijo
>
>
> It's possible though it makes harder to generalize reset_domain later 
> for other drivers.
> But still one caveat, look at amdgpu_recover_work_struct and it's 
> usage in amdgpu_device_gpu_recover and in gpu_recover_get,
> At least for debugfs i need to return back the result of GPU reset and 
> so I cannot store actual work items in the array mentioned above
> but rather pointers to work_item because i need a way to get back the 
> return value from gpu_recover like I do now in amdgpu_device_gpu_recover.

You should try to avoid that as well.

See when the debugfs reset is canceled because of a scheduler reset you 
won't get a useful return code either.

What we should do instead is to cache the status of the last reset in 
the reset domain.

Regards,
Christian.

>
> Andrey
>
>
>>
>>> Not 100% sure if that works, but something like that should do the 
>>> trick.
>>>
>>> My main concern is that I don't want to allocate the work items on 
>>> the stack and dynamic allocation (e.g. kmalloc) is usually not 
>>> possible either.
>>>
>>> Additional to that putting/removing work items from a list, array or 
>>> other container is a very common source for race conditions.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>>>> to the cancellation list which you showed above. In current way 
>>>>>> all this done automatically within reset_domain code and it's 
>>>>>> agnostic to specific driver and it's specific list of reset 
>>>>>> sources. Also in case we would want to generalize reset_domain to 
>>>>>> other GPU drivers (which was
>>>>>> a plan as far as i remember) this explicit mention of each reset 
>>>>>> works for cancellation is again less suitable in my opinion.
>>>>>
>>>>> Well we could put the work item for the scheduler independent 
>>>>> reset source into the reset domain as well. But I'm not sure those 
>>>>> additional reset sources should be part of any common handling, 
>>>>> that is largely amdgpu specific.
>>>>
>>>>
>>>> So it's for sure more than one source for the reasons described 
>>>> above, also note that for scheduler we already cancel delayed work 
>>>> in drm_sched_stop so calling them again in amdgpu code is kind of 
>>>> superfluous.
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>> Christian.
>>>>>
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see a 
>>>>>>>>>> possible race where a new reset request is being generated 
>>>>>>>>>> exactly after we finished the HW reset but before we canceled 
>>>>>>>>>> out all pending resets - in such case you would not want to 
>>>>>>>>>> cancel this 'borderline new' reset request.
>>>>>>>>>
>>>>>>>>> Why not? Any new reset request directly after a hardware reset 
>>>>>>>>> is most likely just falsely generated by the reset itself.
>>>>>>>>>
>>>>>>>>> Ideally I would cancel all sources after the reset, but before 
>>>>>>>>> starting any new work.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>>> You can see that if many different reset sources share 
>>>>>>>>>>>>> same work struct what can happen is that the first to 
>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from 
>>>>>>>>>>>>> full HW reset because his bad job did signal for example 
>>>>>>>>>>>>> or because his hunged IP block was able to recover through 
>>>>>>>>>>>>> SW reset but in the meantime another reset source who 
>>>>>>>>>>>>> needed an actual HW reset just silently returned and we 
>>>>>>>>>>>>> end up with unhandled reset request. True that today this 
>>>>>>>>>>>>> happens only to job timeout reset sources that are handled 
>>>>>>>>>>>>> form within the scheduler and won't use this single work 
>>>>>>>>>>>>> struct but no one prevents a future case for this to 
>>>>>>>>>>>>> happen and also, if we actually want to unify scheduler 
>>>>>>>>>>>>> time out handlers within reset domain (which seems to me 
>>>>>>>>>>>>> the right design approach) we won't be able to use just 
>>>>>>>>>>>>> one work struct for this reason anyway.
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Just to add to this point - a reset domain is co-operative 
>>>>>>>>>>>> domain. In addition to sharing a set of clients from 
>>>>>>>>>>>> various reset sources for one device, it also will have a 
>>>>>>>>>>>> set of devices like in XGMI hive. The job timeout on one 
>>>>>>>>>>>> device may not eventually result in a reset, but a RAS error 
>>>>>>>>>>>> happening on another device at the same time would need a 
>>>>>>>>>>>> reset. The second device's RAS error cannot return seeing 
>>>>>>>>>>>> that a reset work already started, or ignore the reset work 
>>>>>>>>>>>> given that another device has filled the reset data.
>>>>>>>>>>>>
>>>>>>>>>>>> When there is a reset domain, it should take care of the 
>>>>>>>>>>>> work scheduled and keeping it in device or any other level 
>>>>>>>>>>>> doesn't sound good.
>>>>>>>>>>>>
>>>>>>>>>>>> Thanks,
>>>>>>>>>>>> Lijo
>>>>>>>>>>>>
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the 
>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to 
>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using 
>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets 
>>>>>>>>>>>>>> incremented when a client wants to trigger a reset 
>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after 
>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work 
>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already 
>>>>>>>>>>>>>> scheduled (or another client is in the process of 
>>>>>>>>>>>>>> scheduling it) and you just return. When the worker 
>>>>>>>>>>>>>> finishes (after confirming a successful reset), it resets 
>>>>>>>>>>>>>> the counter to 0, so the next client requesting a reset 
>>>>>>>>>>>>>> will schedule the worker again.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't allocate 
>>>>>>>>>>>>>>>> any memory before or during the GPU reset nor wait for 
>>>>>>>>>>>>>>>> the reset to complete (so you can't allocate anything 
>>>>>>>>>>>>>>>> on the stack either).
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover 
>>>>>>>>>>>>>>> is different from any other local variable we allocate 
>>>>>>>>>>>>>>> in any function we call ?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for 
>>>>>>>>>>>>>>> reset to complete ? Also, see in amdgpu_ras_do_recovery 
>>>>>>>>>>>>>>> and gpu_recover_get (debugfs) - the caller expects the 
>>>>>>>>>>>>>>> reset to complete before he returns. I can probably work 
>>>>>>>>>>>>>>> around it in RAS code by calling 
>>>>>>>>>>>>>>> atomic_set(&ras->in_recovery, 0) from some callback 
>>>>>>>>>>>>>>> within actual reset function but regarding sysfs it 
>>>>>>>>>>>>>>> actually expects a result returned indicating whether 
>>>>>>>>>>>>>>> the call was successful or not.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach 
>>>>>>>>>>>>>>>>> where this logic (the work items) are held and handled 
>>>>>>>>>>>>>>>>> in reset_domain and are not split in each adev or any 
>>>>>>>>>>>>>>>>> other entity. We might want in the future to even move 
>>>>>>>>>>>>>>>>> the scheduler handling into reset domain since reset 
>>>>>>>>>>>>>>>>> domain is supposed to be a generic thing and not only 
>>>>>>>>>>>>>>>>> for AMD.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 deletions(-)
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>> -    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>> reset now anyway */
>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset 
>>>>>>>>>>>>>>>>>>>>> for SRIOV */
>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>> recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>       AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>> +    AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>> +                  struct amdgpu_reset_work_struct 
>>>>>>>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) 
>>>>>>>>>>>>>>>>>>>>> /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* 
>>>>>>>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = container_of(work, 
>>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>       amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>
>>>>>>>
>>>>>
>>>


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-11 15:37                                         ` Lazar, Lijo
@ 2022-05-11 15:43                                           ` Andrey Grodzovsky
  2022-05-11 15:46                                             ` Lazar, Lijo
  0 siblings, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-11 15:43 UTC (permalink / raw)
  To: Lazar, Lijo, Christian König, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy


On 2022-05-11 11:37, Lazar, Lijo wrote:
>
>
> On 5/11/2022 9:05 PM, Andrey Grodzovsky wrote:
>>
>> On 2022-05-11 11:20, Lazar, Lijo wrote:
>>>
>>>
>>> On 5/11/2022 7:28 PM, Christian König wrote:
>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>>>>> On 2022-05-11 03:38, Christian König wrote:
>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>>>>> [SNIP]
>>>>>>>> E.g. in the reset code (either before or after the reset, 
>>>>>>>> that's debatable) you do something like this:
>>>>>>>>
>>>>>>>> for (i = 0; i < num_ring; ++i)
>>>>>>>>     cancel_delayed_work(ring[i]->scheduler....)
>>>>>>>> cancel_work(adev->ras_work);
>>>>>>>> cancel_work(adev->iofault_work);
>>>>>>>> cancel_work(adev->debugfs_work);
>>>>>>>> ...
>>>>>>>>
>>>>>>>> You don't really need to track which reset source has fired and 
>>>>>>>> which hasn't, because that would be racy again. Instead just 
>>>>>>>> bluntly reset all possible sources.
>>>>>>>>
>>>>>>>> Christian.
>>>>>>>
>>>>>>>
>>>>>>> I don't say we care if it fired once or twice (I need to add a 
>>>>>>> fix to only insert reset work to pending reset list if it's not 
>>>>>>> already there), the point of using list (or array) to which you 
>>>>>>> add and from which you remove is that the logic of this is 
>>>>>>> encapsulated within reset domain. In your way we need to be 
>>>>>>> aware who exactly schedules reset work and explicitly cancel 
>>>>>>> them, this also means that for any new source added in the 
>>>>>>> future you will need to remember to add him
>>>>>>
>>>>>> I don't think that this is a valid argument. Additionally to the 
>>>>>> schedulers we probably just need less than a handful of reset 
>>>>>> sources, most likely even just one or two is enough.
>>>>>>
>>>>>> The only justification I can see of having additional separate 
>>>>>> reset sources would be if somebody wants to know if a specific 
>>>>>> source has been handled or not (e.g. call flush_work() or 
>>>>>> work_pending()). Like in the case of a reset triggered through 
>>>>>> debugfs.
>>>>>
>>>>>
>>>>> This is indeed one reason, another is as we said before that if 
>>>>> you share 'reset source' (meaning a delayed work) with another 
>>>>> client (i.e. RAS and KFD) it means you make assumption that the 
>>>>> other client always proceeds with the
>>>>> reset exactly the same way as you expect. So today we have this 
>>>>> only in scheduler vs non scheduler reset happening - non scheduler 
>>>>> reset clients assume the reset is always fully executed in HW 
>>>>> while scheduler based reset makes shortcuts and not always does HW 
>>>>> reset hence they cannot share 'reset source' (delayed work). Yes, 
>>>>> we can always add this in the future if and when such problem will 
>>>>> arise but no one will remember this then and a new bug will be 
>>>>> introduced and will take time to find and resolve.
>>>>
>>>> Mhm, so your main concern is that we forget to correctly handle the 
>>>> new reset sources?
>>>>
>>>> How about we do it like this then:
>>>>
>>>> struct amdgpu_reset_domain {
>>>>      ....
>>>>      union {
>>>>          struct {
>>>>              struct work_item debugfs;
>>>>              struct work_item ras;
>>>>              ....
>>>>          };
>>>>          struct work_item array[]
>>>>      } reset_sources;
>>>> }
>>>>
>>>
>>> If it's only about static array,
>>>
>>> enum amdgpu_reset_soruce {
>>>
>>> AMDGPU_RESET_SRC_RAS,
>>> AMDGPU_RESET_SRC_ABC,
>>> .....
>>> AMDGPU_RESET_SRC_XYZ,
>>> AMDGPU_RESET_SRC_MAX
>>>
>>> };
>>>
>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for 
>>> each work item
>>>
>>>
>>> Thanks,
>>> Lijo
>>
>>
>> It's possible though it makes harder to generalize reset_domain later 
>> for other drivers.
>
> The current reset domain queue design is not good for a hierarchical 
> reset within amdgpu itself :)
>
> Thanks,
> Lijo


Not sure what you mean?

Andrey


>
>> But still one caveat, look at amdgpu_recover_work_struct and its 
>> usage in amdgpu_device_gpu_recover and in gpu_recover_get,
>> At least for debugfs I need to return back the result of GPU reset 
>> and so I cannot store actual work items in the array mentioned above
>> but rather pointers to work_item because i need a way to get back the 
>> return value from gpu_recover like I do now in 
>> amdgpu_device_gpu_recover.
>>
>> Andrey
>>
>>
>>>
>>>> Not 100% sure if that works, but something like that should do the 
>>>> trick.
>>>>
>>>> My main concern is that I don't want to allocate the work items on 
>>>> the stack and dynamic allocation (e.g. kmalloc) is usually not 
>>>> possible either.
>>>>
>>>> Additional to that putting/removing work items from a list, array 
>>>> or other container is a very common source for race conditions.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>>>> to the cancellation list which you showed above. In current way 
>>>>>>> all this done automatically within reset_domain code and it's 
>>>>>>> agnostic to specific driver and it's specific list of reset 
>>>>>>> sources. Also in case we would want to generalize reset_domain 
>>>>>>> to other GPU drivers (which was
>>>>>>> a plan as far as i remember) this explicit mention of each reset 
>>>>>>> works for cancellation is again less suitable in my opinion.
>>>>>>
>>>>>> Well we could put the work item for the scheduler independent 
>>>>>> reset source into the reset domain as well. But I'm not sure 
>>>>>> those additional reset sources should be part of any common 
>>>>>> handling, that is largely amdgpu specific.
>>>>>
>>>>>
>>>>> So it's for sure more than one source for the reasons described 
>>>>> above, also note that for scheduler we already cancel delayed work 
>>>>> in drm_sched_stop so calling them again in amdgpu code kind of 
>>>>> superfluous.
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see 
>>>>>>>>>>> a possible race where a new reset request is being generated 
>>>>>>>>>>> exactly after we finished the HW reset but before we 
>>>>>>>>>>> canceled out all pending resets - in such case you wold not 
>>>>>>>>>>> want to cancel this 'border line new' reset request.
>>>>>>>>>>
>>>>>>>>>> Why not? Any new reset request directly after a hardware 
>>>>>>>>>> reset is most likely just falsely generated by the reset itself.
>>>>>>>>>>
>>>>>>>>>> Ideally I would cancel all sources after the reset, but 
>>>>>>>>>> before starting any new work.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>>> You can see that if many different reset sources share 
>>>>>>>>>>>>>> same work struct what can happen is that the first to 
>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from 
>>>>>>>>>>>>>> full HW reset because his bad job did signal for example 
>>>>>>>>>>>>>> or because his hunged IP block was able to recover 
>>>>>>>>>>>>>> through SW reset but in the meantime another reset source 
>>>>>>>>>>>>>> who needed an actual HW reset just silently returned and 
>>>>>>>>>>>>>> we end up with unhandled reset request. True that today 
>>>>>>>>>>>>>> this happens only to job timeout reset sources that are 
>>>>>>>>>>>>>> handled form within the scheduler and won't use this 
>>>>>>>>>>>>>> single work struct but no one prevents a future case for 
>>>>>>>>>>>>>> this to happen and also, if we actually want to unify 
>>>>>>>>>>>>>> scheduler time out handlers within reset domain (which 
>>>>>>>>>>>>>> seems to me the right design approach) we won't be able 
>>>>>>>>>>>>>> to use just one work struct for this reason anyway.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Just to add to this point - a reset domain is co-operative 
>>>>>>>>>>>>> domain. In addition to sharing a set of clients from 
>>>>>>>>>>>>> various reset sources for one device, it also will have a 
>>>>>>>>>>>>> set of devices like in XGMI hive. The job timeout on one 
>>>>>>>>>>>>> device may not eventually result in a reset, but a RAS 
>>>>>>>>>>>>> error happening on another device at the same time would 
>>>>>>>>>>>>> need a reset. The second device's RAS error cannot return 
>>>>>>>>>>>>> seeing that a reset work already started, or ignore the 
>>>>>>>>>>>>> reset work given that another device has filled the reset 
>>>>>>>>>>>>> data.
>>>>>>>>>>>>>
>>>>>>>>>>>>> When there is a reset domain, it should take care of the 
>>>>>>>>>>>>> work scheduled and keeping it in device or any other level 
>>>>>>>>>>>>> doesn't sound good.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>> Lijo
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the 
>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to 
>>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using 
>>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets 
>>>>>>>>>>>>>>> incremented when a client wants to trigger a reset 
>>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after 
>>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work 
>>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already 
>>>>>>>>>>>>>>> scheduled (or another client is in the process of 
>>>>>>>>>>>>>>> scheduling it) and you just return. When the worker 
>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it 
>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting a 
>>>>>>>>>>>>>>> reset will schedule the worker again.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't 
>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset nor 
>>>>>>>>>>>>>>>>> wait for the reset to complete (so you can't allocate 
>>>>>>>>>>>>>>>>> anything on the stack either).
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover 
>>>>>>>>>>>>>>>> is different from any other local variable we allocate 
>>>>>>>>>>>>>>>> in any function we call ?
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for 
>>>>>>>>>>>>>>>> reset to complete ? Also, see in amdgpu_ras_do_recovery 
>>>>>>>>>>>>>>>> and gpu_recover_get (debugfs) - the caller expects the 
>>>>>>>>>>>>>>>> reset to complete before he returns. I can probably 
>>>>>>>>>>>>>>>> work around it in RAS code by calling 
>>>>>>>>>>>>>>>> atomic_set(&ras->in_recovery, 0) from some callback 
>>>>>>>>>>>>>>>> within actual reset function but regarding sysfs it 
>>>>>>>>>>>>>>>> actually expects a result returned indicating whether 
>>>>>>>>>>>>>>>> the call was successful or not.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach 
>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and 
>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each 
>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the future 
>>>>>>>>>>>>>>>>>> to even move the scheduler handling into reset domain 
>>>>>>>>>>>>>>>>>> since reset domain is supposed to be a generic things 
>>>>>>>>>>>>>>>>>> and not only or AMD.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 
>>>>>>>>>>>>>>>>>>>>>> +++--
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 
>>>>>>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>>> reset now anyway */
>>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset 
>>>>>>>>>>>>>>>>>>>>>> for SRIOV */
>>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, 
>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 
>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) 
>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops 
>>>>>>>>>>>>>>>>>>>>>> = {
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops 
>>>>>>>>>>>>>>>>>>>>>> = {
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>
>>>>>>
>>>>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-11 15:43                                           ` Andrey Grodzovsky
@ 2022-05-11 15:46                                             ` Lazar, Lijo
  2022-05-11 15:53                                               ` Andrey Grodzovsky
  0 siblings, 1 reply; 40+ messages in thread
From: Lazar, Lijo @ 2022-05-11 15:46 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy



On 5/11/2022 9:13 PM, Andrey Grodzovsky wrote:
> 
> On 2022-05-11 11:37, Lazar, Lijo wrote:
>>
>>
>> On 5/11/2022 9:05 PM, Andrey Grodzovsky wrote:
>>>
>>> On 2022-05-11 11:20, Lazar, Lijo wrote:
>>>>
>>>>
>>>> On 5/11/2022 7:28 PM, Christian König wrote:
>>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>>>>>> On 2022-05-11 03:38, Christian König wrote:
>>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>>>>>> [SNIP]
>>>>>>>>> E.g. in the reset code (either before or after the reset, 
>>>>>>>>> that's debatable) you do something like this:
>>>>>>>>>
>>>>>>>>> for (i = 0; i < num_ring; ++i)
>>>>>>>>>     cancel_delayed_work(ring[i]->scheduler....)
>>>>>>>>> cancel_work(adev->ras_work);
>>>>>>>>> cancel_work(adev->iofault_work);
>>>>>>>>> cancel_work(adev->debugfs_work);
>>>>>>>>> ...
>>>>>>>>>
>>>>>>>>> You don't really need to track which reset source has fired and 
>>>>>>>>> which hasn't, because that would be racy again. Instead just 
>>>>>>>>> bluntly reset all possible sources.
>>>>>>>>>
>>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>
>>>>>>>> I don't say we care if it fired once or twice (I need to add a 
>>>>>>>> fix to only insert reset work to pending reset list if it's not 
>>>>>>>> already there), the point of using list (or array) to which you 
>>>>>>>> add and from which you remove is that the logic of this is 
>>>>>>>> encapsulated within reset domain. In your way we need to be 
>>>>>>>> aware who exactly schedules reset work and explicitly cancel 
>>>>>>>> them, this also means that for any new source added in the 
>>>>>>>> future you will need to remember to add him
>>>>>>>
>>>>>>> I don't think that this is a valid argument. Additionally to the 
>>>>>>> schedulers we probably just need less than a handful of reset 
>>>>>>> sources, most likely even just one or two is enough.
>>>>>>>
>>>>>>> The only justification I can see of having additional separate 
>>>>>>> reset sources would be if somebody wants to know if a specific 
>>>>>>> source has been handled or not (e.g. call flush_work() or 
>>>>>>> work_pending()). Like in the case of a reset triggered through 
>>>>>>> debugfs.
>>>>>>
>>>>>>
>>>>>> This is indeed one reason, another is as we said before that if 
>>>>>> you share 'reset source' (meaning a delayed work) with another 
>>>>>> client (i.e. RAS and KFD) it means you make assumption that the 
>>>>>> other client always proceeds with the
>>>>>> reset exactly the same way as you expect. So today we have this 
>>>>>> only in scheduler vs non scheduler reset happening - non scheduler 
>>>>>> reset clients assume the reset is always fully executed in HW 
>>>>>> while scheduler based reset makes shortcuts and not always does HW 
>>>>>> reset hence they cannot share 'reset source' (delayed work). Yes, 
>>>>>> we can always add this in the future if and when such problem will 
>>>>>> arise but no one will remember this then and a new bug will be 
>>>>>> introduced and will take time to find and resolve.
>>>>>
>>>>> Mhm, so your main concern is that we forget to correctly handle the 
>>>>> new reset sources?
>>>>>
>>>>> How about we do it like this then:
>>>>>
>>>>> struct amdgpu_reset_domain {
>>>>>      ....
>>>>>      union {
>>>>>          struct {
>>>>>              struct work_item debugfs;
>>>>>              struct work_item ras;
>>>>>              ....
>>>>>          };
>>>>>          struct work_item array[]
>>>>>      } reset_sources;
>>>>> }
>>>>>
>>>>
>>>> If it's only about static array,
>>>>
>>>> enum amdgpu_reset_soruce {
>>>>
>>>> AMDGPU_RESET_SRC_RAS,
>>>> AMDGPU_RESET_SRC_ABC,
>>>> .....
>>>> AMDGPU_RESET_SRC_XYZ,
>>>> AMDGPU_RESET_SRC_MAX
>>>>
>>>> };
>>>>
>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for 
>>>> each work item
>>>>
>>>>
>>>> Thanks,
>>>> Lijo
>>>
>>>
>>> It's possible though it makes harder to generalize reset_domain later 
>>> for other drivers.
>>
>> The current reset domain queue design is not good for a hierarchichal 
>> reset within amdgpu itself :)
>>
>> Thanks,
>> Lijo
> 
> 
> Not sure what do you mean ?
> 

It's tied to the TDR queue in scheduler.

Hierarchical model - start from a reset of the lowest level nodes and on 
failure try with a higher level reset. This model doesn't suit that.

Thanks,
Lijo

> Andrey
> 
> 
>>
>>> But still one caveat, look at amdgpu_recover_work_struct and it's 
>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get,
>>> At least for debugfs i need to return back the result of GPU reset 
>>> and so I cannot store actual work items in the array mentioned above
>>> but rather pointers to work_item because i need a way to get back the 
>>> return value from gpu_recover like I do now in 
>>> amdgpu_device_gpu_recover.
>>>
>>> Andrey
>>>
>>>
>>>>
>>>>> Not 100% sure if that works, but something like that should do the 
>>>>> trick.
>>>>>
>>>>> My main concern is that I don't want to allocate the work items on 
>>>>> the stack and dynamic allocation (e.g. kmalloc) is usually not 
>>>>> possible either.
>>>>>
>>>>> Additional to that putting/removing work items from a list, array 
>>>>> or other container is a very common source for race conditions.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>
>>>>>>>> to the cancellation list which you showed above. In current way 
>>>>>>>> all this done automatically within reset_domain code and it's 
>>>>>>>> agnostic to specific driver and it's specific list of reset 
>>>>>>>> sources. Also in case we would want to generalize reset_domain 
>>>>>>>> to other GPU drivers (which was
>>>>>>>> a plan as far as i remember) this explicit mention of each reset 
>>>>>>>> works for cancellation is again less suitable in my opinion.
>>>>>>>
>>>>>>> Well we could put the work item for the scheduler independent 
>>>>>>> reset source into the reset domain as well. But I'm not sure 
>>>>>>> those additional reset sources should be part of any common 
>>>>>>> handling, that is largely amdgpu specific.
>>>>>>
>>>>>>
>>>>>> So it's for sure more then one source for the reasons described 
>>>>>> above, also note that for scheduler we already cancel delayed work 
>>>>>> in drm_sched_stop so calling them again in amdgpu code kind of 
>>>>>> superfluous.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> Christian.
>>>>>>>
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see 
>>>>>>>>>>>> a possible race where a new reset request is being generated 
>>>>>>>>>>>> exactly after we finished the HW reset but before we 
>>>>>>>>>>>> canceled out all pending resets - in such case you wold not 
>>>>>>>>>>>> want to cancel this 'border line new' reset request.
>>>>>>>>>>>
>>>>>>>>>>> Why not? Any new reset request directly after a hardware 
>>>>>>>>>>> reset is most likely just falsely generated by the reset itself.
>>>>>>>>>>>
>>>>>>>>>>> Ideally I would cancel all sources after the reset, but 
>>>>>>>>>>> before starting any new work.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>>>> You can see that if many different reset sources share 
>>>>>>>>>>>>>>> same work struct what can happen is that the first to 
>>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from 
>>>>>>>>>>>>>>> full HW reset because his bad job did signal for example 
>>>>>>>>>>>>>>> or because his hunged IP block was able to recover 
>>>>>>>>>>>>>>> through SW reset but in the meantime another reset source 
>>>>>>>>>>>>>>> who needed an actual HW reset just silently returned and 
>>>>>>>>>>>>>>> we end up with unhandled reset request. True that today 
>>>>>>>>>>>>>>> this happens only to job timeout reset sources that are 
>>>>>>>>>>>>>>> handled form within the scheduler and won't use this 
>>>>>>>>>>>>>>> single work struct but no one prevents a future case for 
>>>>>>>>>>>>>>> this to happen and also, if we actually want to unify 
>>>>>>>>>>>>>>> scheduler time out handlers within reset domain (which 
>>>>>>>>>>>>>>> seems to me the right design approach) we won't be able 
>>>>>>>>>>>>>>> to use just one work struct for this reason anyway.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Just to add to this point - a reset domain is co-operative 
>>>>>>>>>>>>>> domain. In addition to sharing a set of clients from 
>>>>>>>>>>>>>> various reset sources for one device, it also will have a 
>>>>>>>>>>>>>> set of devices like in XGMI hive. The job timeout on one 
>>>>>>>>>>>>>> device may not eventually result in result, but a RAS 
>>>>>>>>>>>>>> error happening on another device at the same time would 
>>>>>>>>>>>>>> need a reset. The second device's RAS error cannot return 
>>>>>>>>>>>>>> seeing that a reset work already started, or ignore the 
>>>>>>>>>>>>>> reset work given that another device has filled the reset 
>>>>>>>>>>>>>> data.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> When there is a reset domain, it should take care of the 
>>>>>>>>>>>>>> work scheduled and keeping it in device or any other level 
>>>>>>>>>>>>>> doesn't sound good.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>>> Lijo
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the 
>>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to 
>>>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using 
>>>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets 
>>>>>>>>>>>>>>>> incremented when a client wants to trigger a reset 
>>>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after 
>>>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work 
>>>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already 
>>>>>>>>>>>>>>>> scheduled (or another client is in the process of 
>>>>>>>>>>>>>>>> scheduling it) and you just return. When the worker 
>>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it 
>>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting a 
>>>>>>>>>>>>>>>> reset will schedule the worker again.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't 
>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset nor 
>>>>>>>>>>>>>>>>>> wait for the reset to complete (so you can't allocate 
>>>>>>>>>>>>>>>>>> anything on the stack either).
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover 
>>>>>>>>>>>>>>>>> is different from any other local variable we allocate 
>>>>>>>>>>>>>>>>> in any function we call ?
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for 
>>>>>>>>>>>>>>>>> reset to complete ? Also, see in amdgpu_ras_do_recovery 
>>>>>>>>>>>>>>>>> and gpu_recover_get (debugfs) - the caller expects the 
>>>>>>>>>>>>>>>>> reset to complete before he returns. I can probably 
>>>>>>>>>>>>>>>>> work around it in RAS code by calling 
>>>>>>>>>>>>>>>>> atomic_set(&ras->in_recovery, 0) from some callback 
>>>>>>>>>>>>>>>>> within actual reset function but regarding sysfs it 
>>>>>>>>>>>>>>>>> actually expects a result returned indicating whether 
>>>>>>>>>>>>>>>>> the call was successful or not.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach 
>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and 
>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each 
>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the future 
>>>>>>>>>>>>>>>>>>> to even move the scheduler handling into reset domain 
>>>>>>>>>>>>>>>>>>> since reset domain is supposed to be a generic things 
>>>>>>>>>>>>>>>>>>> and not only or AMD.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 
>>>>>>>>>>>>>>>>>>>>>>> +++--
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 
>>>>>>>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */
>>>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset 
>>>>>>>>>>>>>>>>>>>>>>> for SRIOV */
>>>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base)) 
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, 
>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 
>>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) 
>>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>> = {
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>> = {
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>
>>>>>>>
>>>>>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-11 15:46                                             ` Lazar, Lijo
@ 2022-05-11 15:53                                               ` Andrey Grodzovsky
  0 siblings, 0 replies; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-11 15:53 UTC (permalink / raw)
  To: Lazar, Lijo, Christian König, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy


On 2022-05-11 11:46, Lazar, Lijo wrote:
>
>
> On 5/11/2022 9:13 PM, Andrey Grodzovsky wrote:
>>
>> On 2022-05-11 11:37, Lazar, Lijo wrote:
>>>
>>>
>>> On 5/11/2022 9:05 PM, Andrey Grodzovsky wrote:
>>>>
>>>> On 2022-05-11 11:20, Lazar, Lijo wrote:
>>>>>
>>>>>
>>>>> On 5/11/2022 7:28 PM, Christian König wrote:
>>>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>>>>>>> On 2022-05-11 03:38, Christian König wrote:
>>>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>>>>>>> [SNIP]
>>>>>>>>>> E.g. in the reset code (either before or after the reset, 
>>>>>>>>>> that's debatable) you do something like this:
>>>>>>>>>>
>>>>>>>>>> for (i = 0; i < num_ring; ++i)
>>>>>>>>>> cancel_delayed_work(ring[i]->scheduler....)
>>>>>>>>>> cancel_work(adev->ras_work);
>>>>>>>>>> cancel_work(adev->iofault_work);
>>>>>>>>>> cancel_work(adev->debugfs_work);
>>>>>>>>>> ...
>>>>>>>>>>
>>>>>>>>>> You don't really need to track which reset source has fired 
>>>>>>>>>> and which hasn't, because that would be racy again. Instead 
>>>>>>>>>> just bluntly reset all possible sources.
>>>>>>>>>>
>>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I don't say we care if it fired once or twice (I need to add a 
>>>>>>>>> fix to only insert reset work to pending reset list if it's 
>>>>>>>>> not already there), the point of using list (or array) to 
>>>>>>>>> which you add and from which you remove is that the logic of 
>>>>>>>>> this is encapsulated within reset domain. In your way we need 
>>>>>>>>> to be aware who exactly schedules reset work and explicitly 
>>>>>>>>> cancel them, this also means that for any new source added in 
>>>>>>>>> the future you will need to remember to add it
>>>>>>>>
>>>>>>>> I don't think that this is a valid argument. Additionally to 
>>>>>>>> the schedulers we probably just need less than a handful of 
>>>>>>>> reset sources, most likely even just one or two is enough.
>>>>>>>>
>>>>>>>> The only justification I can see of having additional separate 
>>>>>>>> reset sources would be if somebody wants to know if a specific 
>>>>>>>> source has been handled or not (e.g. call flush_work() or 
>>>>>>>> work_pending()). Like in the case of a reset triggered through 
>>>>>>>> debugfs.
>>>>>>>
>>>>>>>
>>>>>>> This is indeed one reason, another is as we said before that if 
>>>>>>> you share 'reset source' (meaning a delayed work) with another 
>>>>>>> client (i.e. RAS and KFD) it means you make assumption that the 
>>>>>>> other client always proceeds with the
>>>>>>> reset exactly the same way as you expect. So today we have this 
>>>>>>> only in scheduler vs non scheduler reset happening - non 
>>>>>>> scheduler reset clients assume the reset is always fully 
>>>>>>> executed in HW while scheduler based reset makes shortcuts and 
>>>>>>> not always does HW reset hence they cannot share 'reset source' 
>>>>>>> (delayed work). Yes, we can always add this in the future if and 
>>>>>>> when such problem will arise but no one will remember this then 
>>>>>>> and a new bug will be introduced and will take time to find and 
>>>>>>> resolve.
>>>>>>
>>>>>> Mhm, so your main concern is that we forget to correctly handle 
>>>>>> the new reset sources?
>>>>>>
>>>>>> How about we do it like this then:
>>>>>>
>>>>>> struct amdgpu_reset_domain {
>>>>>>      ....
>>>>>>      union {
>>>>>>          struct {
>>>>>>              struct work_item debugfs;
>>>>>>              struct work_item ras;
>>>>>>              ....
>>>>>>          };
>>>>>>          struct work_item array[]
>>>>>>      } reset_sources;
>>>>>> }
>>>>>>
>>>>>
>>>>> If it's only about static array,
>>>>>
>>>>> enum amdgpu_reset_soruce {
>>>>>
>>>>> AMDGPU_RESET_SRC_RAS,
>>>>> AMDGPU_RESET_SRC_ABC,
>>>>> .....
>>>>> AMDGPU_RESET_SRC_XYZ,
>>>>> AMDGPU_RESET_SRC_MAX
>>>>>
>>>>> };
>>>>>
>>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index 
>>>>> for each work item
>>>>>
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>
>>>>
>>>> It's possible though it makes harder to generalize reset_domain 
>>>> later for other drivers.
>>>
>>> The current reset domain queue design is not good for a 
>>> hierarchical reset within amdgpu itself :)
>>>
>>> Thanks,
>>> Lijo
>>
>>
>> Not sure what do you mean ?
>>
>
> It's tied to the TDR queue in scheduler.
>
> Hierarchical model - start from reset of lowest level nodes and on 
> failure try with a higher level reset. This model doesn't suit that.
>
> Thanks,
> Lijo


The TDR queue just provides a single threaded context to execute all 
resets. It has no restrictions on what exactly you reset within each 
work item on the queue
so I still don't see a problem. I also don't understand what is lower 
level vs higher level nodes in our case. Is it single node vs hive ?

Andrey


>
>> Andrey
>>
>>
>>>
>>>> But still one caveat, look at amdgpu_recover_work_struct and it's 
>>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get,
>>>> At least for debugfs i need to return back the result of GPU reset 
>>>> and so I cannot store actual work items in the array mentioned above
>>>> but rather pointers to work_item because i need a way to get back 
>>>> the return value from gpu_recover like I do now in 
>>>> amdgpu_device_gpu_recover.
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>>> Not 100% sure if that works, but something like that should do 
>>>>>> the trick.
>>>>>>
>>>>>> My main concern is that I don't want to allocate the work items 
>>>>>> on the stack and dynamic allocation (e.g. kmalloc) is usually not 
>>>>>> possible either.
>>>>>>
>>>>>> Additional to that putting/removing work items from a list, array 
>>>>>> or other container is a very common source for race conditions.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>>>> to the cancellation list which you showed above. In current 
>>>>>>>>> way all this done automatically within reset_domain code and 
>>>>>>>>> it's agnostic to specific driver and it's specific list of 
>>>>>>>>> reset sources. Also in case we would want to generalize 
>>>>>>>>> reset_domain to other GPU drivers (which was
>>>>>>>>> a plan as far as i remember) this explicit mention of each 
>>>>>>>>> reset works for cancellation is again less suitable in my 
>>>>>>>>> opinion.
>>>>>>>>
>>>>>>>> Well we could put the work item for the scheduler independent 
>>>>>>>> reset source into the reset domain as well. But I'm not sure 
>>>>>>>> those additional reset sources should be part of any common 
>>>>>>>> handling, that is largely amdgpu specific.
>>>>>>>
>>>>>>>
>>>>>>> So it's for sure more then one source for the reasons described 
>>>>>>> above, also note that for scheduler we already cancel delayed 
>>>>>>> work in drm_sched_stop so calling them again in amdgpu code kind 
>>>>>>> of superfluous.
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I 
>>>>>>>>>>>>> see a possible race where a new reset request is being 
>>>>>>>>>>>>> generated exactly after we finished the HW reset but 
>>>>>>>>>>>>> before we canceled out all pending resets - in such case 
>>>>>>>>>>>>> you would not want to cancel this 'border line new' reset 
>>>>>>>>>>>>> request.
>>>>>>>>>>>>
>>>>>>>>>>>> Why not? Any new reset request directly after a hardware 
>>>>>>>>>>>> reset is most likely just falsely generated by the reset 
>>>>>>>>>>>> itself.
>>>>>>>>>>>>
>>>>>>>>>>>> Ideally I would cancel all sources after the reset, but 
>>>>>>>>>>>> before starting any new work.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> You can see that if many different reset sources share 
>>>>>>>>>>>>>>>> same work struct what can happen is that the first to 
>>>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from 
>>>>>>>>>>>>>>>> full HW reset because his bad job did signal for 
>>>>>>>>>>>>>>>> example or because his hung IP block was able to 
>>>>>>>>>>>>>>>> recover through SW reset but in the meantime another 
>>>>>>>>>>>>>>>> reset source who needed an actual HW reset just 
>>>>>>>>>>>>>>>> silently returned and we end up with unhandled reset 
>>>>>>>>>>>>>>>> request. True that today this happens only to job 
>>>>>>>>>>>>>>>> timeout reset sources that are handled form within the 
>>>>>>>>>>>>>>>> scheduler and won't use this single work struct but no 
>>>>>>>>>>>>>>>> one prevents a future case for this to happen and also, 
>>>>>>>>>>>>>>>> if we actually want to unify scheduler time out 
>>>>>>>>>>>>>>>> handlers within reset domain (which seems to me the 
>>>>>>>>>>>>>>>> right design approach) we won't be able to use just one 
>>>>>>>>>>>>>>>> work struct for this reason anyway.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Just to add to this point - a reset domain is 
>>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of 
>>>>>>>>>>>>>>> clients from various reset sources for one device, it 
>>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The 
>>>>>>>>>>>>>>> job timeout on one device may not eventually result in 
>>>>>>>>>>>>>>> a reset, but a RAS error happening on another device at 
>>>>>>>>>>>>>>> the same time would need a reset. The second device's 
>>>>>>>>>>>>>>> RAS error cannot return seeing that a reset work already 
>>>>>>>>>>>>>>> started, or ignore the reset work given that another 
>>>>>>>>>>>>>>> device has filled the reset data.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> When there is a reset domain, it should take care of the 
>>>>>>>>>>>>>>> work scheduled and keeping it in device or any other 
>>>>>>>>>>>>>>> level doesn't sound good.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>>>> Lijo
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the 
>>>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme 
>>>>>>>>>>>>>>>>> to decide whether you need to schedule a reset, e.g. 
>>>>>>>>>>>>>>>>> using an atomic counter in the shared work struct that 
>>>>>>>>>>>>>>>>> gets incremented when a client wants to trigger a 
>>>>>>>>>>>>>>>>> reset (atomic_add_return). If that counter is exactly 
>>>>>>>>>>>>>>>>> 1 after incrementing, you need to fill in the rest of 
>>>>>>>>>>>>>>>>> the work struct and schedule the work. Otherwise, it's 
>>>>>>>>>>>>>>>>> already scheduled (or another client is in the process 
>>>>>>>>>>>>>>>>> of scheduling it) and you just return. When the worker 
>>>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it 
>>>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting 
>>>>>>>>>>>>>>>>> a reset will schedule the worker again.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't 
>>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset 
>>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't 
>>>>>>>>>>>>>>>>>>> allocate anything on the stack either).
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>>>>>> allocation of work struct in 
>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover is different from any other 
>>>>>>>>>>>>>>>>>> local variable we allocate in any function we call ?
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for 
>>>>>>>>>>>>>>>>>> reset to complete ? Also, see in 
>>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) 
>>>>>>>>>>>>>>>>>> - the caller expects the reset to complete before he 
>>>>>>>>>>>>>>>>>> returns. I can probably work around it in RAS code by 
>>>>>>>>>>>>>>>>>> calling atomic_set(&ras->in_recovery, 0) from some 
>>>>>>>>>>>>>>>>>> callback within actual reset function but regarding 
>>>>>>>>>>>>>>>>>> sysfs it actually expects a result returned 
>>>>>>>>>>>>>>>>>> indicating whether the call was successful or not.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach 
>>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and 
>>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each 
>>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the 
>>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into 
>>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be a 
>>>>>>>>>>>>>>>>>>>> generic things and not only or AMD.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 
>>>>>>>>>>>>>>>>>>>>>>>> +++--
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 
>>>>>>>>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */
>>>>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive 
>>>>>>>>>>>>>>>>>>>>>>>> reset for SRIOV */
>>>>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, 
>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending 
>>>>>>>>>>>>>>>>>>>>>>>> resets */
>>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 
>>>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) 
>>>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>
>>>>>>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-11 15:39                                         ` Christian König
@ 2022-05-11 15:57                                           ` Andrey Grodzovsky
  2022-05-12  6:03                                             ` Christian König
  2022-05-11 20:27                                           ` Andrey Grodzovsky
  1 sibling, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-11 15:57 UTC (permalink / raw)
  To: Christian König, Lazar, Lijo, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy


On 2022-05-11 11:39, Christian König wrote:
> Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky:
>> On 2022-05-11 11:20, Lazar, Lijo wrote:
>>>
>>>
>>> On 5/11/2022 7:28 PM, Christian König wrote:
>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>>>>> On 2022-05-11 03:38, Christian König wrote:
>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>>>>> [SNIP]
>>>>>>>> E.g. in the reset code (either before or after the reset, 
>>>>>>>> that's debatable) you do something like this:
>>>>>>>>
>>>>>>>> for (i = 0; i < num_ring; ++i)
>>>>>>>>     cancel_delayed_work(ring[i]->scheduler....)
>>>>>>>> cancel_work(adev->ras_work);
>>>>>>>> cancel_work(adev->iofault_work);
>>>>>>>> cancel_work(adev->debugfs_work);
>>>>>>>> ...
>>>>>>>>
>>>>>>>> You don't really need to track which reset source has fired and 
>>>>>>>> which hasn't, because that would be racy again. Instead just 
>>>>>>>> bluntly reset all possible sources.
>>>>>>>>
>>>>>>>> Christian.
>>>>>>>
>>>>>>>
>>>>>>> I don't say we care if it fired once or twice (I need to add a 
>>>>>>> fix to only insert reset work to pending reset list if it's not 
>>>>>>> already there), the point of using list (or array) to which you 
>>>>>>> add and from which you remove is that the logic of this is 
>>>>>>> encapsulated within reset domain. In your way we need to be 
>>>>>>> aware who exactly schedules reset work and explicitly cancel 
>>>>>>> them, this also means that for any new source added in the 
>>>>>>> future you will need to remember to add him
>>>>>>
>>>>>> I don't think that this is a valid argument. Additionally to the 
>>>>>> schedulers we probably just need less than a handful of reset 
>>>>>> sources, most likely even just one or two is enough.
>>>>>>
>>>>>> The only justification I can see of having additional separate 
>>>>>> reset sources would be if somebody wants to know if a specific 
>>>>>> source has been handled or not (e.g. call flush_work() or 
>>>>>> work_pending()). Like in the case of a reset triggered through 
>>>>>> debugfs.
>>>>>
>>>>>
>>>>> This is indeed one reason, another is as we said before that if 
>>>>> you share 'reset source' (meaning a delayed work) with another 
>>>>> client (i.e. RAS and KFD) it means you make assumption that the 
>>>>> other client always proceeds with the
>>>>> reset exactly the same way as you expect. So today we have this 
>>>>> only in scheduler vs non scheduler reset happening - non scheduler 
>>>>> reset clients assume the reset is always fully executed in HW 
>>>>> while scheduler based reset makes shortcuts and not always does HW 
>>>>> reset hence they cannot share 'reset source' (delayed work). Yes, 
>>>>> we can always add this in the future if and when such problem will 
>>>>> arise but no one will remember this then and a new bug will be 
>>>>> introduced and will take time to find and resolve.
>>>>
>>>> Mhm, so your main concern is that we forget to correctly handle the 
>>>> new reset sources?
>>>>
>>>> How about we do it like this then:
>>>>
>>>> struct amdgpu_reset_domain {
>>>>      ....
>>>>      union {
>>>>          struct {
>>>>              struct work_item debugfs;
>>>>              struct work_item ras;
>>>>              ....
>>>>          };
>>>>          struct work_item array[]
>>>>      } reset_sources;
>>>> }
>>>>
>>>
>>> If it's only about static array,
>>>
>>> enum amdgpu_reset_soruce {
>>>
>>> AMDGPU_RESET_SRC_RAS,
>>> AMDGPU_RESET_SRC_ABC,
>>> .....
>>> AMDGPU_RESET_SRC_XYZ,
>>> AMDGPU_RESET_SRC_MAX
>>>
>>> };
>>>
>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for 
>>> each work item
>>>
>>>
>>> Thanks,
>>> Lijo
>>
>>
>> It's possible though it makes harder to generalize reset_domain later 
>> for other drivers.
>> But still one caveat, look at amdgpu_recover_work_struct and it's 
>> usage in amdgpu_device_gpu_recover and in gpu_recover_get,
>> At least for debugfs i need to return back the result of GPU reset 
>> and so I cannot store actual work items in the array mentioned above
>> but rather pointers to work_item because i need a way to get back the 
>> return value from gpu_recover like I do now in 
>> amdgpu_device_gpu_recover.
>
> You should try to avoid that as well.


Why ?


>
> See when the debugfs reset is canceled because of a scheduler reset 
> you won't get a useful return code either.


That's true.

>
> What we should do instead is to cache the status of the last reset in 
> the reset domain.


Probably an atomic ret in reset_domain then.

Another 2 points I forgot to ask -

1) What race condition did you have in mind regarding insertion and
extraction from the list, if everything is done under the lock?

2) This I asked already - why are you so opposed to allocation on the
stack? I understand the problem with dynamic memory allocations, but why
the stack? We do multiple allocations on the stack for any function
we call during GPU reset, so what is special about this case where we
allocate the work struct on the stack? It's not like the work struct is
especially big compared to other things we allocate on the stack.

Andrey


>
> Regards,
> Christian.
>
>>
>> Andrey
>>
>>
>>>
>>>> Not 100% sure if that works, but something like that should do the 
>>>> trick.
>>>>
>>>> My main concern is that I don't want to allocate the work items on 
>>>> the stack and dynamic allocation (e.g. kmalloc) is usually not 
>>>> possible either.
>>>>
>>>> Additional to that putting/removing work items from a list, array 
>>>> or other container is a very common source for race conditions.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>>>> to the cancellation list which you showed above. In current way 
>>>>>>> all this done automatically within reset_domain code and it's 
>>>>>>> agnostic to specific driver and it's specific list of reset 
>>>>>>> sources. Also in case we would want to generalize reset_domain 
>>>>>>> to other GPU drivers (which was
>>>>>>> a plan as far as i remember) this explicit mention of each reset 
>>>>>>> works for cancellation is again less suitable in my opinion.
>>>>>>
>>>>>> Well we could put the work item for the scheduler independent 
>>>>>> reset source into the reset domain as well. But I'm not sure 
>>>>>> those additional reset sources should be part of any common 
>>>>>> handling, that is largely amdgpu specific.
>>>>>
>>>>>
>>>>> So it's for sure more then one source for the reasons described 
>>>>> above, also note that for scheduler we already cancel delayed work 
>>>>> in drm_sched_stop so calling them again in amdgpu code kind of 
>>>>> superfluous.
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see 
>>>>>>>>>>> a possible race where a new reset request is being generated 
>>>>>>>>>>> exactly after we finished the HW reset but before we 
>>>>>>>>>>> canceled out all pending resets - in such case you wold not 
>>>>>>>>>>> want to cancel this 'border line new' reset request.
>>>>>>>>>>
>>>>>>>>>> Why not? Any new reset request directly after a hardware 
>>>>>>>>>> reset is most likely just falsely generated by the reset itself.
>>>>>>>>>>
>>>>>>>>>> Ideally I would cancel all sources after the reset, but 
>>>>>>>>>> before starting any new work.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>>> You can see that if many different reset sources share 
>>>>>>>>>>>>>> same work struct what can happen is that the first to 
>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from 
>>>>>>>>>>>>>> full HW reset because his bad job did signal for example 
>>>>>>>>>>>>>> or because his hunged IP block was able to recover 
>>>>>>>>>>>>>> through SW reset but in the meantime another reset source 
>>>>>>>>>>>>>> who needed an actual HW reset just silently returned and 
>>>>>>>>>>>>>> we end up with unhandled reset request. True that today 
>>>>>>>>>>>>>> this happens only to job timeout reset sources that are 
>>>>>>>>>>>>>> handled form within the scheduler and won't use this 
>>>>>>>>>>>>>> single work struct but no one prevents a future case for 
>>>>>>>>>>>>>> this to happen and also, if we actually want to unify 
>>>>>>>>>>>>>> scheduler time out handlers within reset domain (which 
>>>>>>>>>>>>>> seems to me the right design approach) we won't be able 
>>>>>>>>>>>>>> to use just one work struct for this reason anyway.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Just to add to this point - a reset domain is co-operative 
>>>>>>>>>>>>> domain. In addition to sharing a set of clients from 
>>>>>>>>>>>>> various reset sources for one device, it also will have a 
>>>>>>>>>>>>> set of devices like in XGMI hive. The job timeout on one 
>>>>>>>>>>>>> device may not eventually result in result, but a RAS 
>>>>>>>>>>>>> error happening on another device at the same time would 
>>>>>>>>>>>>> need a reset. The second device's RAS error cannot return 
>>>>>>>>>>>>> seeing that a reset work already started, or ignore the 
>>>>>>>>>>>>> reset work given that another device has filled the reset 
>>>>>>>>>>>>> data.
>>>>>>>>>>>>>
>>>>>>>>>>>>> When there is a reset domain, it should take care of the 
>>>>>>>>>>>>> work scheduled and keeping it in device or any other level 
>>>>>>>>>>>>> doesn't sound good.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>> Lijo
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the 
>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to 
>>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using 
>>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets 
>>>>>>>>>>>>>>> incremented when a client wants to trigger a reset 
>>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after 
>>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work 
>>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already 
>>>>>>>>>>>>>>> scheduled (or another client is in the process of 
>>>>>>>>>>>>>>> scheduling it) and you just return. When the worker 
>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it 
>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting a 
>>>>>>>>>>>>>>> reset will schedule the worker again.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't 
>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset nor 
>>>>>>>>>>>>>>>>> wait for the reset to complete (so you can't allocate 
>>>>>>>>>>>>>>>>> anything on the stack either).
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover 
>>>>>>>>>>>>>>>> is different from any other local variable we allocate 
>>>>>>>>>>>>>>>> in any function we call ?
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for 
>>>>>>>>>>>>>>>> reset to complete ? Also, see in amdgpu_ras_do_recovery 
>>>>>>>>>>>>>>>> and gpu_recover_get (debugfs) - the caller expects the 
>>>>>>>>>>>>>>>> reset to complete before he returns. I can probably 
>>>>>>>>>>>>>>>> work around it in RAS code by calling 
>>>>>>>>>>>>>>>> atomic_set(&ras->in_recovery, 0) from some callback 
>>>>>>>>>>>>>>>> within actual reset function but regarding sysfs it 
>>>>>>>>>>>>>>>> actually expects a result returned indicating whether 
>>>>>>>>>>>>>>>> the call was successful or not.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach 
>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and 
>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each 
>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the future 
>>>>>>>>>>>>>>>>>> to even move the scheduler handling into reset domain 
>>>>>>>>>>>>>>>>>> since reset domain is supposed to be a generic things 
>>>>>>>>>>>>>>>>>> and not only or AMD.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 
>>>>>>>>>>>>>>>>>>>>>> +++--
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 
>>>>>>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>>> reset now anyway */
>>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset 
>>>>>>>>>>>>>>>>>>>>>> for SRIOV */
>>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, 
>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 
>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) 
>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops 
>>>>>>>>>>>>>>>>>>>>>> = {
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops 
>>>>>>>>>>>>>>>>>>>>>> = {
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>
>>>>>>
>>>>
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-11 15:39                                         ` Christian König
  2022-05-11 15:57                                           ` Andrey Grodzovsky
@ 2022-05-11 20:27                                           ` Andrey Grodzovsky
  2022-05-12  6:06                                             ` Christian König
  1 sibling, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-11 20:27 UTC (permalink / raw)
  To: Christian König, Lazar, Lijo, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy


On 2022-05-11 11:39, Christian König wrote:
> Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky:
>> On 2022-05-11 11:20, Lazar, Lijo wrote:
>>>
>>>
>>> On 5/11/2022 7:28 PM, Christian König wrote:
>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>>>>> On 2022-05-11 03:38, Christian König wrote:
>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>>>>> [SNIP]
>>>>>>>> E.g. in the reset code (either before or after the reset, 
>>>>>>>> that's debatable) you do something like this:
>>>>>>>>
>>>>>>>> for (i = 0; i < num_ring; ++i)
>>>>>>>>     cancel_delayed_work(ring[i]->scheduler....)
>>>>>>>> cancel_work(adev->ras_work);
>>>>>>>> cancel_work(adev->iofault_work);
>>>>>>>> cancel_work(adev->debugfs_work);
>>>>>>>> ...
>>>>>>>>
>>>>>>>> You don't really need to track which reset source has fired and 
>>>>>>>> which hasn't, because that would be racy again. Instead just 
>>>>>>>> bluntly reset all possible sources.
>>>>>>>>
>>>>>>>> Christian.
>>>>>>>
>>>>>>>
>>>>>>> I don't say we care if it fired once or twice (I need to add a 
>>>>>>> fix to only insert reset work to pending reset list if it's not 
>>>>>>> already there), the point of using list (or array) to which you 
>>>>>>> add and from which you remove is that the logic of this is 
>>>>>>> encapsulated within reset domain. In your way we need to be 
>>>>>>> aware who exactly schedules reset work and explicitly cancel 
>>>>>>> them, this also means that for any new source added in the 
>>>>>>> future you will need to remember to add him
>>>>>>
>>>>>> I don't think that this is a valid argument. Additionally to the 
>>>>>> schedulers we probably just need less than a handful of reset 
>>>>>> sources, most likely even just one or two is enough.
>>>>>>
>>>>>> The only justification I can see of having additional separate 
>>>>>> reset sources would be if somebody wants to know if a specific 
>>>>>> source has been handled or not (e.g. call flush_work() or 
>>>>>> work_pending()). Like in the case of a reset triggered through 
>>>>>> debugfs.
>>>>>
>>>>>
>>>>> This is indeed one reason, another is as we said before that if 
>>>>> you share 'reset source' (meaning a delayed work) with another 
>>>>> client (i.e. RAS and KFD) it means you make assumption that the 
>>>>> other client always proceeds with the
>>>>> reset exactly the same way as you expect. So today we have this 
>>>>> only in scheduler vs non scheduler reset happening - non scheduler 
>>>>> reset clients assume the reset is always fully executed in HW 
>>>>> while scheduler based reset makes shortcuts and not always does HW 
>>>>> reset hence they cannot share 'reset source' (delayed work). Yes, 
>>>>> we can always add this in the future if and when such problem will 
>>>>> arise but no one will remember this then and a new bug will be 
>>>>> introduced and will take time to find and resolve.
>>>>
>>>> Mhm, so your main concern is that we forget to correctly handle the 
>>>> new reset sources?
>>>>
>>>> How about we do it like this then:
>>>>
>>>> struct amdgpu_reset_domain {
>>>>      ....
>>>>      union {
>>>>          struct {
>>>>              struct work_item debugfs;
>>>>              struct work_item ras;
>>>>              ....
>>>>          };
>>>>          struct work_item array[]
>>>>      } reset_sources;
>>>> }
>>>>
>>>
>>> If it's only about static array,
>>>
>>> enum amdgpu_reset_soruce {
>>>
>>> AMDGPU_RESET_SRC_RAS,
>>> AMDGPU_RESET_SRC_ABC,
>>> .....
>>> AMDGPU_RESET_SRC_XYZ,
>>> AMDGPU_RESET_SRC_MAX
>>>
>>> };
>>>
>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for 
>>> each work item
>>>
>>>
>>> Thanks,
>>> Lijo
>>
>>
>> It's possible though it makes harder to generalize reset_domain later 
>> for other drivers.
>> But still one caveat, look at amdgpu_recover_work_struct and it's 
>> usage in amdgpu_device_gpu_recover and in gpu_recover_get,
>> At least for debugfs i need to return back the result of GPU reset 
>> and so I cannot store actual work items in the array mentioned above
>> but rather pointers to work_item because i need a way to get back the 
>> return value from gpu_recover like I do now in 
>> amdgpu_device_gpu_recover.
>
> You should try to avoid that as well.
>
> See when the debugfs reset is canceled because of a scheduler reset 
> you won't get a useful return code either.
>
> What we should do instead is to cache the status of the last reset in 
> the reset domain.
>
> Regards,
> Christian.


Another problem with this approach - to execute the actual GPU reset I 
need access to a concrete amdgpu_device pointer from the work struct (see 
xgpu_ai_mailbox_flr_work as an example). If I store all work items in an
array in amdgpu_reset_domain, the most I can retrieve is the 
reset_domain struct itself, which won't help since it's dynamically 
allocated, not embedded in the hive or adev, and can be one per device or 
per entire hive in the case of XGMI, so there is no way for me to reach 
back to amdgpu_device. A back pointer to adev* from amdgpu_reset_domain 
will only work for a single device but not for an XGMI hive where there are 
multiple devices in a hive.

Andrey


>
>>
>> Andrey
>>
>>
>>>
>>>> Not 100% sure if that works, but something like that should do the 
>>>> trick.
>>>>
>>>> My main concern is that I don't want to allocate the work items on 
>>>> the stack and dynamic allocation (e.g. kmalloc) is usually not 
>>>> possible either.
>>>>
>>>> Additional to that putting/removing work items from a list, array 
>>>> or other container is a very common source for race conditions.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>>>> to the cancellation list which you showed above. In current way 
>>>>>>> all this done automatically within reset_domain code and it's 
>>>>>>> agnostic to specific driver and it's specific list of reset 
>>>>>>> sources. Also in case we would want to generalize reset_domain 
>>>>>>> to other GPU drivers (which was
>>>>>>> a plan as far as i remember) this explicit mention of each reset 
>>>>>>> works for cancellation is again less suitable in my opinion.
>>>>>>
>>>>>> Well we could put the work item for the scheduler independent 
>>>>>> reset source into the reset domain as well. But I'm not sure 
>>>>>> those additional reset sources should be part of any common 
>>>>>> handling, that is largely amdgpu specific.
>>>>>
>>>>>
>>>>> So it's for sure more then one source for the reasons described 
>>>>> above, also note that for scheduler we already cancel delayed work 
>>>>> in drm_sched_stop so calling them again in amdgpu code kind of 
>>>>> superfluous.
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see 
>>>>>>>>>>> a possible race where a new reset request is being generated 
>>>>>>>>>>> exactly after we finished the HW reset but before we 
>>>>>>>>>>> canceled out all pending resets - in such case you wold not 
>>>>>>>>>>> want to cancel this 'border line new' reset request.
>>>>>>>>>>
>>>>>>>>>> Why not? Any new reset request directly after a hardware 
>>>>>>>>>> reset is most likely just falsely generated by the reset itself.
>>>>>>>>>>
>>>>>>>>>> Ideally I would cancel all sources after the reset, but 
>>>>>>>>>> before starting any new work.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>>> You can see that if many different reset sources share 
>>>>>>>>>>>>>> same work struct what can happen is that the first to 
>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from 
>>>>>>>>>>>>>> full HW reset because his bad job did signal for example 
>>>>>>>>>>>>>> or because his hunged IP block was able to recover 
>>>>>>>>>>>>>> through SW reset but in the meantime another reset source 
>>>>>>>>>>>>>> who needed an actual HW reset just silently returned and 
>>>>>>>>>>>>>> we end up with unhandled reset request. True that today 
>>>>>>>>>>>>>> this happens only to job timeout reset sources that are 
>>>>>>>>>>>>>> handled form within the scheduler and won't use this 
>>>>>>>>>>>>>> single work struct but no one prevents a future case for 
>>>>>>>>>>>>>> this to happen and also, if we actually want to unify 
>>>>>>>>>>>>>> scheduler time out handlers within reset domain (which 
>>>>>>>>>>>>>> seems to me the right design approach) we won't be able 
>>>>>>>>>>>>>> to use just one work struct for this reason anyway.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Just to add to this point - a reset domain is co-operative 
>>>>>>>>>>>>> domain. In addition to sharing a set of clients from 
>>>>>>>>>>>>> various reset sources for one device, it also will have a 
>>>>>>>>>>>>> set of devices like in XGMI hive. The job timeout on one 
>>>>>>>>>>>>> device may not eventually result in result, but a RAS 
>>>>>>>>>>>>> error happening on another device at the same time would 
>>>>>>>>>>>>> need a reset. The second device's RAS error cannot return 
>>>>>>>>>>>>> seeing that a reset work already started, or ignore the 
>>>>>>>>>>>>> reset work given that another device has filled the reset 
>>>>>>>>>>>>> data.
>>>>>>>>>>>>>
>>>>>>>>>>>>> When there is a reset domain, it should take care of the 
>>>>>>>>>>>>> work scheduled and keeping it in device or any other level 
>>>>>>>>>>>>> doesn't sound good.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>> Lijo
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the 
>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to 
>>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using 
>>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets 
>>>>>>>>>>>>>>> incremented when a client wants to trigger a reset 
>>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after 
>>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work 
>>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already 
>>>>>>>>>>>>>>> scheduled (or another client is in the process of 
>>>>>>>>>>>>>>> scheduling it) and you just return. When the worker 
>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it 
>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting a 
>>>>>>>>>>>>>>> reset will schedule the worker again.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't 
>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset nor 
>>>>>>>>>>>>>>>>> wait for the reset to complete (so you can't allocate 
>>>>>>>>>>>>>>>>> anything on the stack either).
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover 
>>>>>>>>>>>>>>>> is different from any other local variable we allocate 
>>>>>>>>>>>>>>>> in any function we call ?
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for 
>>>>>>>>>>>>>>>> reset to complete ? Also, see in amdgpu_ras_do_recovery 
>>>>>>>>>>>>>>>> and gpu_recover_get (debugfs) - the caller expects the 
>>>>>>>>>>>>>>>> reset to complete before he returns. I can probably 
>>>>>>>>>>>>>>>> work around it in RAS code by calling 
>>>>>>>>>>>>>>>> atomic_set(&ras->in_recovery, 0) from some callback 
>>>>>>>>>>>>>>>> within actual reset function but regarding sysfs it 
>>>>>>>>>>>>>>>> actually expects a result returned indicating whether 
>>>>>>>>>>>>>>>> the call was successful or not.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach 
>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and 
>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each 
>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the future 
>>>>>>>>>>>>>>>>>> to even move the scheduler handling into reset domain 
>>>>>>>>>>>>>>>>>> since reset domain is supposed to be a generic things 
>>>>>>>>>>>>>>>>>> and not only or AMD.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 
>>>>>>>>>>>>>>>>>>>>>> +++--
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 
>>>>>>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>>> reset now anyway */
>>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset 
>>>>>>>>>>>>>>>>>>>>>> for SRIOV */
>>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, 
>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending resets */
>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 
>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) 
>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_nv_virt_ops 
>>>>>>>>>>>>>>>>>>>>>> = {
>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops xgpu_vi_virt_ops 
>>>>>>>>>>>>>>>>>>>>>> = {
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>
>>>>>>
>>>>
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-11 15:57                                           ` Andrey Grodzovsky
@ 2022-05-12  6:03                                             ` Christian König
  2022-05-12 12:57                                               ` Andrey Grodzovsky
  0 siblings, 1 reply; 40+ messages in thread
From: Christian König @ 2022-05-12  6:03 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, Lazar, Lijo,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy

Am 11.05.22 um 17:57 schrieb Andrey Grodzovsky:
> [SNIP]
>>>>> How about we do it like this then:
>>>>>
>>>>> struct amdgpu_reset_domain {
>>>>>      ....
>>>>>      union {
>>>>>          struct {
>>>>>              struct work_item debugfs;
>>>>>              struct work_item ras;
>>>>>              ....
>>>>>          };
>>>>>          struct work_item array[]
>>>>>      } reset_sources;
>>>>> }
>>>>>
>>>>
>>>> If it's only about static array,
>>>>
>>>> enum amdgpu_reset_soruce {
>>>>
>>>> AMDGPU_RESET_SRC_RAS,
>>>> AMDGPU_RESET_SRC_ABC,
>>>> .....
>>>> AMDGPU_RESET_SRC_XYZ,
>>>> AMDGPU_RESET_SRC_MAX
>>>>
>>>> };
>>>>
>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index 
>>>> for each work item
>>>>
>>>>
>>>> Thanks,
>>>> Lijo
>>>
>>>
>>> It's possible though it makes harder to generalize reset_domain 
>>> later for other drivers.
>>> But still one caveat, look at amdgpu_recover_work_struct and it's 
>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get,
>>> At least for debugfs i need to return back the result of GPU reset 
>>> and so I cannot store actual work items in the array mentioned above
>>> but rather pointers to work_item because i need a way to get back 
>>> the return value from gpu_recover like I do now in 
>>> amdgpu_device_gpu_recover.
>>
>> You should try to avoid that as well.
>
>
> Why ?

Because pointers need a lifetime. When the work items are allocated as 
part of the structure you can be sure that they are not freed nor reused 
for something else.

>
>>
>> What we should do instead is to cache the status of the last reset in 
>> the reset domain.
>
>
> Probably an atomic ret in reset_domain then.
>
> Another 2 points i forgot to ask -
>
> 1) What race condition you had in mind about insertion and extraction 
> from the list if all is done under lock ?

A work item is essentially a linked list and a function to call when the 
worker thread has time to process the item. This means you now have two 
linked lists representing essentially the same.

To make it even worse those two lists are protected by two different 
locks. The work item list is protected by the worker lock and the reset 
item by our own.

Keeping all that synced up is rather racy.

>
> 2) This I asked already - why you opposed so much to allocation on the 
> stack ? I understand the problem with dynamic memory allocations but 
> why stack ? We do multiple allocations on the stack for any function
> we call during GPU reset, what so special in this case where we 
> allocate work struct on the stack? It's not like the work struct is 
> especially big compared to other stuff we allocate on the stack.

The problem is once more the lifetime. When the reset work items are 
allocated on the stack we absolutely need to make sure that their 
pointer is not flying around anywhere when the function returns. Keep in 
mind that a stack is just another form of memory management.

It's certainly possible to get that right, but it's just the more error 
prone approach.

Regards,
Christian.

>
> Andrey
>
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Andrey
>>>
>>>
>>>>
>>>>> Not 100% sure if that works, but something like that should do the 
>>>>> trick.
>>>>>
>>>>> My main concern is that I don't want to allocate the work items on 
>>>>> the stack and dynamic allocation (e.g. kmalloc) is usually not 
>>>>> possible either.
>>>>>
>>>>> Additional to that putting/removing work items from a list, array 
>>>>> or other container is a very common source for race conditions.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>
>>>>>>>> to the cancellation list which you showed above. In current way 
>>>>>>>> all this done automatically within reset_domain code and it's 
>>>>>>>> agnostic to specific driver and it's specific list of reset 
>>>>>>>> sources. Also in case we would want to generalize reset_domain 
>>>>>>>> to other GPU drivers (which was
>>>>>>>> a plan as far as i remember) this explicit mention of each 
>>>>>>>> reset works for cancellation is again less suitable in my opinion.
>>>>>>>
>>>>>>> Well we could put the work item for the scheduler independent 
>>>>>>> reset source into the reset domain as well. But I'm not sure 
>>>>>>> those additional reset sources should be part of any common 
>>>>>>> handling, that is largely amdgpu specific.
>>>>>>
>>>>>>
>>>>>> So it's for sure more then one source for the reasons described 
>>>>>> above, also note that for scheduler we already cancel delayed 
>>>>>> work in drm_sched_stop so calling them again in amdgpu code is 
>>>>>> kind of superfluous.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> Christian.
>>>>>>>
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see 
>>>>>>>>>>>> a possible race where a new reset request is being 
>>>>>>>>>>>> generated exactly after we finished the HW reset but before 
>>>>>>>>>>>> we canceled out all pending resets - in such case you would 
>>>>>>>>>>>> not want to cancel this 'border line new' reset request.
>>>>>>>>>>>
>>>>>>>>>>> Why not? Any new reset request directly after a hardware 
>>>>>>>>>>> reset is most likely just falsely generated by the reset 
>>>>>>>>>>> itself.
>>>>>>>>>>>
>>>>>>>>>>> Ideally I would cancel all sources after the reset, but 
>>>>>>>>>>> before starting any new work.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>>>> You can see that if many different reset sources share 
>>>>>>>>>>>>>>> same work struct what can happen is that the first to 
>>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from 
>>>>>>>>>>>>>>> full HW reset because his bad job did signal for example 
>>>>>>>>>>>>>>> or because his hung IP block was able to recover 
>>>>>>>>>>>>>>> through SW reset but in the meantime another reset 
>>>>>>>>>>>>>>> source who needed an actual HW reset just silently 
>>>>>>>>>>>>>>> returned and we end up with unhandled reset request. 
>>>>>>>>>>>>>>> True that today this happens only to job timeout reset 
>>>>>>>>>>>>>>> sources that are handled form within the scheduler and 
>>>>>>>>>>>>>>> won't use this single work struct but no one prevents a 
>>>>>>>>>>>>>>> future case for this to happen and also, if we actually 
>>>>>>>>>>>>>>> want to unify scheduler time out handlers within reset 
>>>>>>>>>>>>>>> domain (which seems to me the right design approach) we 
>>>>>>>>>>>>>>> won't be able to use just one work struct for this 
>>>>>>>>>>>>>>> reason anyway.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Just to add to this point - a reset domain is 
>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of 
>>>>>>>>>>>>>> clients from various reset sources for one device, it 
>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The 
>>>>>>>>>>>>>> job timeout on one device may not eventually result in 
>>>>>>>>>>>>>> result, but a RAS error happening on another device at 
>>>>>>>>>>>>>> the same time would need a reset. The second device's RAS 
>>>>>>>>>>>>>> error cannot return seeing that a reset work already 
>>>>>>>>>>>>>> started, or ignore the reset work given that another 
>>>>>>>>>>>>>> device has filled the reset data.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> When there is a reset domain, it should take care of the 
>>>>>>>>>>>>>> work scheduled and keeping it in device or any other 
>>>>>>>>>>>>>> level doesn't sound good.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>>> Lijo
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the 
>>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to 
>>>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using 
>>>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets 
>>>>>>>>>>>>>>>> incremented when a client wants to trigger a reset 
>>>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after 
>>>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work 
>>>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already 
>>>>>>>>>>>>>>>> scheduled (or another client is in the process of 
>>>>>>>>>>>>>>>> scheduling it) and you just return. When the worker 
>>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it 
>>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting 
>>>>>>>>>>>>>>>> a reset will schedule the worker again.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't 
>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset 
>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't 
>>>>>>>>>>>>>>>>>> allocate anything on the stack either).
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover 
>>>>>>>>>>>>>>>>> is different from any other local variable we allocate 
>>>>>>>>>>>>>>>>> in any function we call ?
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for 
>>>>>>>>>>>>>>>>> reset to complete ? Also, see in 
>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - 
>>>>>>>>>>>>>>>>> the caller expects the reset to complete before he 
>>>>>>>>>>>>>>>>> returns. I can probably work around it in RAS code by 
>>>>>>>>>>>>>>>>> calling atomic_set(&ras->in_recovery, 0) from some 
>>>>>>>>>>>>>>>>> callback within actual reset function but regarding 
>>>>>>>>>>>>>>>>> sysfs it actually expects a result returned indicating 
>>>>>>>>>>>>>>>>> whether the call was successful or not.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach 
>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and 
>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each 
>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the 
>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into 
>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be a 
>>>>>>>>>>>>>>>>>>> generic things and not only or AMD.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 
>>>>>>>>>>>>>>>>>>>>>>> +++--
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 
>>>>>>>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */
>>>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset 
>>>>>>>>>>>>>>>>>>>>>>> for SRIOV */
>>>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, 
>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending 
>>>>>>>>>>>>>>>>>>>>>>> resets */
>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 
>>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) 
>>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>
>>>>>>>
>>>>>
>>


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-11 20:27                                           ` Andrey Grodzovsky
@ 2022-05-12  6:06                                             ` Christian König
  2022-05-12  9:21                                               ` Lazar, Lijo
  2022-05-12 13:07                                               ` Andrey Grodzovsky
  0 siblings, 2 replies; 40+ messages in thread
From: Christian König @ 2022-05-12  6:06 UTC (permalink / raw)
  To: Andrey Grodzovsky, Lazar, Lijo, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy

Am 11.05.22 um 22:27 schrieb Andrey Grodzovsky:
>
> On 2022-05-11 11:39, Christian König wrote:
>> Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky:
>>> On 2022-05-11 11:20, Lazar, Lijo wrote:
>>>>
>>>>
>>>> On 5/11/2022 7:28 PM, Christian König wrote:
>>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>>>>>> On 2022-05-11 03:38, Christian König wrote:
>>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>>>>>> [SNIP]
>>>>>>>>> E.g. in the reset code (either before or after the reset, 
>>>>>>>>> that's debatable) you do something like this:
>>>>>>>>>
>>>>>>>>> for (i = 0; i < num_ring; ++i)
>>>>>>>>>     cancel_delayed_work(ring[i]->scheduler....)
>>>>>>>>> cancel_work(adev->ras_work);
>>>>>>>>> cancel_work(adev->iofault_work);
>>>>>>>>> cancel_work(adev->debugfs_work);
>>>>>>>>> ...
>>>>>>>>>
>>>>>>>>> You don't really need to track which reset source has fired 
>>>>>>>>> and which hasn't, because that would be racy again. Instead 
>>>>>>>>> just bluntly reset all possible sources.
>>>>>>>>>
>>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>
>>>>>>>> I don't say we care if it fired once or twice (I need to add a 
>>>>>>>> fix to only insert reset work to pending reset list if it's not 
>>>>>>>> already there), the point of using list (or array) to which you 
>>>>>>>> add and from which you remove is that the logic of this is 
>>>>>>>> encapsulated within reset domain. In your way we need to be 
>>>>>>>> aware who exactly schedules reset work and explicitly cancel 
>>>>>>>> them, this also means that for any new source added in the 
>>>>>>>> future you will need to remember to add it
>>>>>>>
>>>>>>> I don't think that this is a valid argument. Additionally to the 
>>>>>>> schedulers we probably just need less than a handful of reset 
>>>>>>> sources, most likely even just one or two is enough.
>>>>>>>
>>>>>>> The only justification I can see of having additional separate 
>>>>>>> reset sources would be if somebody wants to know if a specific 
>>>>>>> source has been handled or not (e.g. call flush_work() or 
>>>>>>> work_pending()). Like in the case of a reset triggered through 
>>>>>>> debugfs.
>>>>>>
>>>>>>
>>>>>> This is indeed one reason, another is as we said before that if 
>>>>>> you share 'reset source' (meaning a delayed work) with another 
>>>>>> client (i.e. RAS and KFD) it means you make assumption that the 
>>>>>> other client always proceeds with the
>>>>>> reset exactly the same way as you expect. So today we have this 
>>>>>> only in scheduler vs non scheduler reset happening - non 
>>>>>> scheduler reset clients assume the reset is always fully executed 
>>>>>> in HW while scheduler based reset makes shortcuts and not always 
>>>>>> does HW reset hence they cannot share 'reset source' (delayed 
>>>>>> work). Yes, we can always add this in the future if and when such 
>>>>>> problem will arise but no one will remember this then and a new 
>>>>>> bug will be introduced and will take time to find and resolve.
>>>>>
>>>>> Mhm, so your main concern is that we forget to correctly handle 
>>>>> the new reset sources?
>>>>>
>>>>> How about we do it like this then:
>>>>>
>>>>> struct amdgpu_reset_domain {
>>>>>      ....
>>>>>      union {
>>>>>          struct {
>>>>>              struct work_item debugfs;
>>>>>              struct work_item ras;
>>>>>              ....
>>>>>          };
>>>>>          struct work_item array[]
>>>>>      } reset_sources;
>>>>> }
>>>>>
>>>>
>>>> If it's only about static array,
>>>>
>>>> enum amdgpu_reset_soruce {
>>>>
>>>> AMDGPU_RESET_SRC_RAS,
>>>> AMDGPU_RESET_SRC_ABC,
>>>> .....
>>>> AMDGPU_RESET_SRC_XYZ,
>>>> AMDGPU_RESET_SRC_MAX
>>>>
>>>> };
>>>>
>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index 
>>>> for each work item
>>>>
>>>>
>>>> Thanks,
>>>> Lijo
>>>
>>>
>>> It's possible though it makes harder to generalize reset_domain 
>>> later for other drivers.
>>> But still one caveat, look at amdgpu_recover_work_struct and it's 
>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get,
>>> At least for debugfs i need to return back the result of GPU reset 
>>> and so I cannot store actual work items in the array mentioned above
>>> but rather pointers to work_item because i need a way to get back 
>>> the return value from gpu_recover like I do now in 
>>> amdgpu_device_gpu_recover.
>>
>> You should try to avoid that as well.
>>
>> See when the debugfs reset is canceled because of a scheduler reset 
>> you won't get a useful return code either.
>>
>> What we should do instead is to cache the status of the last reset in 
>> the reset domain.
>>
>> Regards,
>> Christian.
>
>
> Another problem with this approach - to execute the actual GPU reset 
> I need access to the concrete amdgpu_device pointer from the work struct 
> (see xgpu_ai_mailbox_flr_work as an example). If I store all work items in
> an array in amdgpu_reset_domain, the most I can retrieve is the 
> reset_domain struct itself, which won't help since it's dynamically 
> allocated, not embedded in hive or adev, and can be one per device 
> or per entire hive in case of XGMI, so there is no way for me to 
> reach back to amdgpu_device. A back pointer to adev* from 
> amdgpu_reset_domain will only work for a single device but not for an XGMI 
> hive where there are multiple devices.

Which is exactly the reason why I think we should always allocate the 
hive structure, even if we only have one device. And a GPU reset should 
then always work with the hive data structure and not adev.

Adding a pointer from your reset work item back to the hive is then trivial.

Regards,
Christian.

>
> Andrey
>
>
>>
>>>
>>> Andrey
>>>
>>>
>>>>
>>>>> Not 100% sure if that works, but something like that should do the 
>>>>> trick.
>>>>>
>>>>> My main concern is that I don't want to allocate the work items on 
>>>>> the stack and dynamic allocation (e.g. kmalloc) is usually not 
>>>>> possible either.
>>>>>
>>>>> Additional to that putting/removing work items from a list, array 
>>>>> or other container is a very common source for race conditions.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>
>>>>>>>> to the cancellation list which you showed above. In current way 
>>>>>>>> all this done automatically within reset_domain code and it's 
>>>>>>>> agnostic to specific driver and it's specific list of reset 
>>>>>>>> sources. Also in case we would want to generalize reset_domain 
>>>>>>>> to other GPU drivers (which was
>>>>>>>> a plan as far as i remember) this explicit mention of each 
>>>>>>>> reset works for cancellation is again less suitable in my opinion.
>>>>>>>
>>>>>>> Well we could put the work item for the scheduler independent 
>>>>>>> reset source into the reset domain as well. But I'm not sure 
>>>>>>> those additional reset sources should be part of any common 
>>>>>>> handling, that is largely amdgpu specific.
>>>>>>
>>>>>>
>>>>>> So it's for sure more then one source for the reasons described 
>>>>>> above, also note that for scheduler we already cancel delayed 
>>>>>> work in drm_sched_stop so calling them again in amdgpu code kind 
>>>>>> of superfluous.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> Christian.
>>>>>>>
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see 
>>>>>>>>>>>> a possible race where a new reset request is being 
>>>>>>>>>>>> generated exactly after we finished the HW reset but before 
>>>>>>>>>>>> we canceled out all pending resets - in such case you would 
>>>>>>>>>>>> not want to cancel this 'border line new' reset request.
>>>>>>>>>>>
>>>>>>>>>>> Why not? Any new reset request directly after a hardware 
>>>>>>>>>>> reset is most likely just falsely generated by the reset 
>>>>>>>>>>> itself.
>>>>>>>>>>>
>>>>>>>>>>> Ideally I would cancel all sources after the reset, but 
>>>>>>>>>>> before starting any new work.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>>>> You can see that if many different reset sources share 
>>>>>>>>>>>>>>> same work struct what can happen is that the first to 
>>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from 
>>>>>>>>>>>>>>> full HW reset because his bad job did signal for example 
>>>>>>>>>>>>>>> or because his hunged IP block was able to recover 
>>>>>>>>>>>>>>> through SW reset but in the meantime another reset 
>>>>>>>>>>>>>>> source who needed an actual HW reset just silently 
>>>>>>>>>>>>>>> returned and we end up with unhandled reset request. 
>>>>>>>>>>>>>>> True that today this happens only to job timeout reset 
>>>>>>>>>>>>>>> sources that are handled form within the scheduler and 
>>>>>>>>>>>>>>> won't use this single work struct but no one prevents a 
>>>>>>>>>>>>>>> future case for this to happen and also, if we actually 
>>>>>>>>>>>>>>> want to unify scheduler time out handlers within reset 
>>>>>>>>>>>>>>> domain (which seems to me the right design approach) we 
>>>>>>>>>>>>>>> won't be able to use just one work struct for this 
>>>>>>>>>>>>>>> reason anyway.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Just to add to this point - a reset domain is 
>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of 
>>>>>>>>>>>>>> clients from various reset sources for one device, it 
>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The 
>>>>>>>>>>>>>> job timeout on one device may not eventually result in 
>>>>>>>>>>>>>> a reset, but a RAS error happening on another device at 
>>>>>>>>>>>>>> the same time would need a reset. The second device's RAS 
>>>>>>>>>>>>>> error cannot return seeing that a reset work already 
>>>>>>>>>>>>>> started, or ignore the reset work given that another 
>>>>>>>>>>>>>> device has filled the reset data.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> When there is a reset domain, it should take care of the 
>>>>>>>>>>>>>> work scheduled and keeping it in device or any other 
>>>>>>>>>>>>>> level doesn't sound good.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>>> Lijo
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the 
>>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to 
>>>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using 
>>>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets 
>>>>>>>>>>>>>>>> incremented when a client wants to trigger a reset 
>>>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after 
>>>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work 
>>>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already 
>>>>>>>>>>>>>>>> scheduled (or another client is in the process of 
>>>>>>>>>>>>>>>> scheduling it) and you just return. When the worker 
>>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it 
>>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting 
>>>>>>>>>>>>>>>> a reset will schedule the worker again.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't 
>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset 
>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't 
>>>>>>>>>>>>>>>>>> allocate anything on the stack either).
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover 
>>>>>>>>>>>>>>>>> is different from any other local variable we allocate 
>>>>>>>>>>>>>>>>> in any function we call ?
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for 
>>>>>>>>>>>>>>>>> reset to complete ? Also, see in 
>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - 
>>>>>>>>>>>>>>>>> the caller expects the reset to complete before he 
>>>>>>>>>>>>>>>>> returns. I can probably work around it in RAS code by 
>>>>>>>>>>>>>>>>> calling atomic_set(&ras->in_recovery, 0) from some 
>>>>>>>>>>>>>>>>> callback within actual reset function but regarding 
>>>>>>>>>>>>>>>>> sysfs it actually expects a result returned indicating 
>>>>>>>>>>>>>>>>> whether the call was successful or not.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach 
>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and 
>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each 
>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the 
>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into 
>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be a 
>>>>>>>>>>>>>>>>>>> generic things and not only or AMD.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 
>>>>>>>>>>>>>>>>>>>>>>> +++--
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 
>>>>>>>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */
>>>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset 
>>>>>>>>>>>>>>>>>>>>>>> for SRIOV */
>>>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, 
>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending 
>>>>>>>>>>>>>>>>>>>>>>> resets */
>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 
>>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) 
>>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>
>>>>>>>
>>>>>
>>


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-12  6:06                                             ` Christian König
@ 2022-05-12  9:21                                               ` Lazar, Lijo
  2022-05-12 13:07                                               ` Andrey Grodzovsky
  1 sibling, 0 replies; 40+ messages in thread
From: Lazar, Lijo @ 2022-05-12  9:21 UTC (permalink / raw)
  To: Christian König, Andrey Grodzovsky, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy



On 5/12/2022 11:36 AM, Christian König wrote:
> Am 11.05.22 um 22:27 schrieb Andrey Grodzovsky:
>>
>> On 2022-05-11 11:39, Christian König wrote:
>>> Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky:
>>>> On 2022-05-11 11:20, Lazar, Lijo wrote:
>>>>>
>>>>>
>>>>> On 5/11/2022 7:28 PM, Christian König wrote:
>>>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>>>>>>> On 2022-05-11 03:38, Christian König wrote:
>>>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>>>>>>> [SNIP]
>>>>>>>>>> E.g. in the reset code (either before or after the reset, 
>>>>>>>>>> that's debatable) you do something like this:
>>>>>>>>>>
>>>>>>>>>> for (i = 0; i < num_ring; ++i)
>>>>>>>>>>     cancel_delayed_work(ring[i]->scheduler....)
>>>>>>>>>> cancel_work(adev->ras_work);
>>>>>>>>>> cancel_work(adev->iofault_work);
>>>>>>>>>> cancel_work(adev->debugfs_work);
>>>>>>>>>> ...
>>>>>>>>>>
>>>>>>>>>> You don't really need to track which reset source has fired 
>>>>>>>>>> and which hasn't, because that would be racy again. Instead 
>>>>>>>>>> just bluntly reset all possible sources.
>>>>>>>>>>
>>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I don't say we care if it fired once or twice (I need to add a 
>>>>>>>>> fix to only insert reset work to pending reset list if it's not 
>>>>>>>>> already there), the point of using list (or array) to which you 
>>>>>>>>> add and from which you remove is that the logic of this is 
>>>>>>>>> encapsulated within reset domain. In your way we need to be 
>>>>>>>>> aware who exactly schedules reset work and explicitly cancel 
>>>>>>>>> them, this also means that for any new source added in the 
>>>>>>>>> future you will need to remember to add him
>>>>>>>>
>>>>>>>> I don't think that this is a valid argument. Additionally to the 
>>>>>>>> schedulers we probably just need less than a handful of reset 
>>>>>>>> sources, most likely even just one or two is enough.
>>>>>>>>
>>>>>>>> The only justification I can see of having additional separate 
>>>>>>>> reset sources would be if somebody wants to know if a specific 
>>>>>>>> source has been handled or not (e.g. call flush_work() or 
>>>>>>>> work_pending()). Like in the case of a reset triggered through 
>>>>>>>> debugfs.
>>>>>>>
>>>>>>>
>>>>>>> This is indeed one reason, another is as we said before that if 
>>>>>>> you share 'reset source' (meaning a delayed work) with another 
>>>>>>> client (i.e. RAS and KFD) it means you make assumption that the 
>>>>>>> other client always proceeds with the
>>>>>>> reset exactly the same way as you expect. So today we have this 
>>>>>>> only in scheduler vs non scheduler reset happening - non 
>>>>>>> scheduler reset clients assume the reset is always fully executed 
>>>>>>> in HW while scheduler based reset makes shortcuts and not always 
>>>>>>> does HW reset hence they cannot share 'reset source' (delayed 
>>>>>>> work). Yes, we can always add this in the future if and when such 
>>>>>>> problem will arise but no one will remember this then and a new 
>>>>>>> bug will be introduced and will take time to find and resolve.
>>>>>>
>>>>>> Mhm, so your main concern is that we forget to correctly handle 
>>>>>> the new reset sources?
>>>>>>
>>>>>> How about we do it like this then:
>>>>>>
>>>>>> struct amdgpu_reset_domain {
>>>>>>      ....
>>>>>>      union {
>>>>>>          struct {
>>>>>>              struct work_item debugfs;
>>>>>>              struct work_item ras;
>>>>>>              ....
>>>>>>          };
>>>>>>          struct work_item array[]
>>>>>>      } reset_sources;
>>>>>> }
>>>>>>
>>>>>
>>>>> If it's only about static array,
>>>>>
>>>>> enum amdgpu_reset_source {
>>>>>
>>>>> AMDGPU_RESET_SRC_RAS,
>>>>> AMDGPU_RESET_SRC_ABC,
>>>>> .....
>>>>> AMDGPU_RESET_SRC_XYZ,
>>>>> AMDGPU_RESET_SRC_MAX
>>>>>
>>>>> };
>>>>>
>>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index 
>>>>> for each work item
>>>>>
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>
>>>>
>>>> It's possible though it makes harder to generalize reset_domain 
>>>> later for other drivers.
>>>> But still one caveat, look at amdgpu_recover_work_struct and it's 
>>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get,
>>>> At least for debugfs i need to return back the result of GPU reset 
>>>> and so I cannot store actual work items in the array mentioned above
>>>> but rather pointers to work_item because i need a way to get back 
>>>> the return value from gpu_recover like I do now in 
>>>> amdgpu_device_gpu_recover.
>>>
>>> You should try to avoid that as well.
>>>
>>> See when the debugfs reset is canceled because of a scheduler reset 
>>> you won't get a useful return code either.
>>>
>>> What we should do instead is to cache the status of the last reset in 
>>> the reset domain.
>>>
>>> Regards,
>>> Christian.
>>
>>
>> Another problem with this approach - to execute the actual GPU reset 
>> I need accesses  to concrete amdgpu_device pointer from work struct 
>> (see xgpu_ai_mailbox_flr_work) as example. If i store all work items in
>> array in amdgpu_reset_domain the most i can only retrieve is the 
>> reset_domain struct itself which won't help since it's dynamically 
>> allocated, not embedded in hive or adev and can be one per device 
>> or per entire hive in case of XGMI and so there is no way for me to 
>> reach back to amdgpu_device. Back pointer to adev* from 
>> amdgpu_reset_domain will only work for single device but not for XGMI 
>> hive where there are multiple devices in a hive.

You could embed the work within another struct which provides the 
context of the work and have that as the array.

struct amdgpu_reset_job {
         void *reset_context; // A struct containing the reset context, 
you may check amdgpu_reset_context.
         struct work_struct reset_work;
         unsigned long flags;
};

And keep an array of amdgpu_reset_job[AMDGPU_RESET_SRC_MAX] inside reset 
domain. When the request is coming you may fill the requested device + 
reset_context and schedule the corresponding work.

I guess the idea now is that if a work item for a particular source is 
already active (indicated by flags), another device belonging to the 
reset domain won't be able to request the same work item again.

Thanks,
Lijo

> 
> Which is exactly the reason why I think we should always allocate the 
> hive structure, even if we only have one device. And a GPU reset should 
> then always work with the hive data structure and not adev.
> 
> Adding a pointer from your reset work item back to the hive is then 
> trivial.
> 
> Regards,
> Christian.
> 
>>
>> Andrey
>>
>>
>>>
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>>> Not 100% sure if that works, but something like that should do the 
>>>>>> trick.
>>>>>>
>>>>>> My main concern is that I don't want to allocate the work items on 
>>>>>> the stack and dynamic allocation (e.g. kmalloc) is usually not 
>>>>>> possible either.
>>>>>>
>>>>>> Additional to that putting/removing work items from a list, array 
>>>>>> or other container is a very common source for race conditions.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>>>> to the cancellation list which you showed above. In current way 
>>>>>>>>> all this done automatically within reset_domain code and it's 
>>>>>>>>> agnostic to specific driver and it's specific list of reset 
>>>>>>>>> sources. Also in case we would want to generalize reset_domain 
>>>>>>>>> to other GPU drivers (which was
>>>>>>>>> a plan as far as i remember) this explicit mention of each 
>>>>>>>>> reset works for cancellation is again less suitable in my opinion.
>>>>>>>>
>>>>>>>> Well we could put the work item for the scheduler independent 
>>>>>>>> reset source into the reset domain as well. But I'm not sure 
>>>>>>>> those additional reset sources should be part of any common 
>>>>>>>> handling, that is largely amdgpu specific.
>>>>>>>
>>>>>>>
>>>>>>> So it's for sure more then one source for the reasons described 
>>>>>>> above, also note that for scheduler we already cancel delayed 
>>>>>>> work in drm_sched_stop so calling them again in amdgpu code kind 
>>>>>>> of superfluous.
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see 
>>>>>>>>>>>>> a possible race where a new reset request is being 
>>>>>>>>>>>>> generated exactly after we finished the HW reset but before 
>>>>>>>>>>>>> we canceled out all pending resets - in such case you wold 
>>>>>>>>>>>>> not want to cancel this 'border line new' reset request.
>>>>>>>>>>>>
>>>>>>>>>>>> Why not? Any new reset request directly after a hardware 
>>>>>>>>>>>> reset is most likely just falsely generated by the reset 
>>>>>>>>>>>> itself.
>>>>>>>>>>>>
>>>>>>>>>>>> Ideally I would cancel all sources after the reset, but 
>>>>>>>>>>>> before starting any new work.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> You can see that if many different reset sources share 
>>>>>>>>>>>>>>>> same work struct what can happen is that the first to 
>>>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from 
>>>>>>>>>>>>>>>> full HW reset because his bad job did signal for example 
>>>>>>>>>>>>>>>> or because his hunged IP block was able to recover 
>>>>>>>>>>>>>>>> through SW reset but in the meantime another reset 
>>>>>>>>>>>>>>>> source who needed an actual HW reset just silently 
>>>>>>>>>>>>>>>> returned and we end up with unhandled reset request. 
>>>>>>>>>>>>>>>> True that today this happens only to job timeout reset 
>>>>>>>>>>>>>>>> sources that are handled form within the scheduler and 
>>>>>>>>>>>>>>>> won't use this single work struct but no one prevents a 
>>>>>>>>>>>>>>>> future case for this to happen and also, if we actually 
>>>>>>>>>>>>>>>> want to unify scheduler time out handlers within reset 
>>>>>>>>>>>>>>>> domain (which seems to me the right design approach) we 
>>>>>>>>>>>>>>>> won't be able to use just one work struct for this 
>>>>>>>>>>>>>>>> reason anyway.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Just to add to this point - a reset domain is 
>>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of 
>>>>>>>>>>>>>>> clients from various reset sources for one device, it 
>>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The 
>>>>>>>>>>>>>>> job timeout on one device may not eventually result in a 
>>>>>>>>>>>>>>> reset, but a RAS error happening on another device at 
>>>>>>>>>>>>>>> the same time would need a reset. The second device's RAS 
>>>>>>>>>>>>>>> error cannot return seeing that a reset work already 
>>>>>>>>>>>>>>> started, or ignore the reset work given that another 
>>>>>>>>>>>>>>> device has filled the reset data.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> When there is a reset domain, it should take care of the 
>>>>>>>>>>>>>>> work scheduled and keeping it in device or any other 
>>>>>>>>>>>>>>> level doesn't sound good.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>>>> Lijo
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the 
>>>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to 
>>>>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using 
>>>>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets 
>>>>>>>>>>>>>>>>> incremented when a client wants to trigger a reset 
>>>>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after 
>>>>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work 
>>>>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already 
>>>>>>>>>>>>>>>>> scheduled (or another client is in the process of 
>>>>>>>>>>>>>>>>> scheduling it) and you just return. When the worker 
>>>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it 
>>>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting 
>>>>>>>>>>>>>>>>> a reset will schedule the worker again.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't 
>>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset 
>>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't 
>>>>>>>>>>>>>>>>>>> allocate anything on the stack either).
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover 
>>>>>>>>>>>>>>>>>> is different from any other local variable we allocate 
>>>>>>>>>>>>>>>>>> in any function we call ?
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for 
>>>>>>>>>>>>>>>>>> reset to complete ? Also, see in 
>>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - 
>>>>>>>>>>>>>>>>>> the caller expects the reset to complete before he 
>>>>>>>>>>>>>>>>>> returns. I can probably work around it in RAS code by 
>>>>>>>>>>>>>>>>>> calling atomic_set(&ras->in_recovery, 0) from some 
>>>>>>>>>>>>>>>>>> callback within actual reset function but regarding 
>>>>>>>>>>>>>>>>>> sysfs it actually expects a result returned indicating 
>>>>>>>>>>>>>>>>>> whether the call was successful or not.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach 
>>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and 
>>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each 
>>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the 
>>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into 
>>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be a 
>>>>>>>>>>>>>>>>>>>> generic things and not only or AMD.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 
>>>>>>>>>>>>>>>>>>>>>>>> +++--
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 
>>>>>>>>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */
>>>>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive reset 
>>>>>>>>>>>>>>>>>>>>>>>> for SRIOV */
>>>>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); 
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, 
>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending 
>>>>>>>>>>>>>>>>>>>>>>>> resets */
>>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 
>>>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) 
>>>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>
>>>>>>
>>>
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-12  6:03                                             ` Christian König
@ 2022-05-12 12:57                                               ` Andrey Grodzovsky
  0 siblings, 0 replies; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-12 12:57 UTC (permalink / raw)
  To: Christian König, Christian König, Lazar, Lijo,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy


On 2022-05-12 02:03, Christian König wrote:
> Am 11.05.22 um 17:57 schrieb Andrey Grodzovsky:
>> [SNIP]
>>>>>> How about we do it like this then:
>>>>>>
>>>>>> struct amdgpu_reset_domain {
>>>>>>      ....
>>>>>>      union {
>>>>>>          struct {
>>>>>>              struct work_item debugfs;
>>>>>>              struct work_item ras;
>>>>>>              ....
>>>>>>          };
>>>>>>          struct work_item array[]
>>>>>>      } reset_sources;
>>>>>> }
>>>>>>
>>>>>
>>>>> If it's only about static array,
>>>>>
>>>>> enum amdgpu_reset_soruce {
>>>>>
>>>>> AMDGPU_RESET_SRC_RAS,
>>>>> AMDGPU_RESET_SRC_ABC,
>>>>> .....
>>>>> AMDGPU_RESET_SRC_XYZ,
>>>>> AMDGPU_RESET_SRC_MAX
>>>>>
>>>>> };
>>>>>
>>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index 
>>>>> for each work item
>>>>>
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>
>>>>
>>>> It's possible though it makes harder to generalize reset_domain 
>>>> later for other drivers.
>>>> But still one caveat, look at amdgpu_recover_work_struct and it's 
>>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get,
>>>> At least for debugfs i need to return back the result of GPU reset 
>>>> and so I cannot store actual work items in the array mentioned above
>>>> but rather pointers to work_item because i need a way to get back 
>>>> the return value from gpu_recover like I do now in 
>>>> amdgpu_device_gpu_recover.
>>>
>>> You should try to avoid that as well.
>>
>>
>> Why ?
>
> Because pointers need a lifetime. When the work items are allocated as 
> part of the structure you can be sure that they are not freed nor 
> reused for something else.
>
>>
>>>
>>> What we should do instead is to cache the status of the last reset 
>>> in the reset domain.
>>
>>
>> Probably an atomic ret in reset_domain then.
>>
>> Another 2 points i forgot to ask -
>>
>> 1) What race condition you had in mind about insertion and extraction 
>> from the list if all is done under lock ?
>
> A work item is essentially a linked list and a function to call when 
> the worker thread has time to process the item. This means you now 
> have two linked lists representing essentially the same.
>
> To make it even worse those two lists are protected by two different 
> locks. The work item list is protected by the worker lock and the 
> reset item by our own.
>
> Keeping all that synced up is rather racy.


I really don't see a problem, as the locking order is always preserved: 
the reset list lock's scope encloses the internal worker thread list 
lock.


>
>>
>> 2) This I asked already - why you opposed so much to allocation on 
>> the stack ? I understand the problem with dynamic memory allocations 
>> but why stack ? We do multiple allocations on the stack for any function
>> we call during GPU reset, what so special in this case where we 
>> allocate work struct on the stack? It's not like the work struct is 
>> especially big compared to other stuff we allocate on the stack.
>
> The problem is once more the lifetime. When the reset work items are 
> allocated on the stack we absolutely need to make sure that their 
> pointer is not flying around anywhere when the function returns. Keep 
> in mind that a stack is just another form of memory management.
>
> It's certainly possible to get that right, but it's just the more 
> error prone approach.


I believe I got that right, as at every place where the reset_work item's 
life cycle ends I was always deleting it from the pending works list so 
as not to leave dangling pointers.

I guess it's all pros and cons. Yes, it's maybe more risky this way since 
it creates a risk of dangling pointers if one forgets to remove them, but 
on the other hand this approach in my opinion gives a cleaner and more 
generic design than the other alternative, as you can see from the other 
problem I encountered — not being able to retrieve the amdgpu_device 
pointer, which we discuss in another thread — and from having to 
explicitly name each reset source in the reset domain.

Andrey


>
> Regards,
> Christian.
>
>>
>> Andrey
>>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>>> Not 100% sure if that works, but something like that should do 
>>>>>> the trick.
>>>>>>
>>>>>> My main concern is that I don't want to allocate the work items 
>>>>>> on the stack and dynamic allocation (e.g. kmalloc) is usually not 
>>>>>> possible either.
>>>>>>
>>>>>> Additional to that putting/removing work items from a list, array 
>>>>>> or other container is a very common source for race conditions.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>>>> to the cancellation list which you showed above. In current 
>>>>>>>>> way all this done automatically within reset_domain code and 
>>>>>>>>> it's agnostic to specific driver and it's specific list of 
>>>>>>>>> reset sources. Also in case we would want to generalize 
>>>>>>>>> reset_domain to other GPU drivers (which was
>>>>>>>>> a plan as far as i remember) this explicit mention of each 
>>>>>>>>> reset works for cancellation is again less suitable in my 
>>>>>>>>> opinion.
>>>>>>>>
>>>>>>>> Well we could put the work item for the scheduler independent 
>>>>>>>> reset source into the reset domain as well. But I'm not sure 
>>>>>>>> those additional reset sources should be part of any common 
>>>>>>>> handling, that is largely amdgpu specific.
>>>>>>>
>>>>>>>
>>>>>>> So it's for sure more then one source for the reasons described 
>>>>>>> above, also note that for scheduler we already cancel delayed 
>>>>>>> work in drm_sched_stop so calling them again in amdgpu code kind 
>>>>>>> of superfluous.
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I 
>>>>>>>>>>>>> see a possible race where a new reset request is being 
>>>>>>>>>>>>> generated exactly after we finished the HW reset but 
>>>>>>>>>>>>> before we canceled out all pending resets - in such case 
>>>>>>>>>>>>> you wold not want to cancel this 'border line new' reset 
>>>>>>>>>>>>> request.
>>>>>>>>>>>>
>>>>>>>>>>>> Why not? Any new reset request directly after a hardware 
>>>>>>>>>>>> reset is most likely just falsely generated by the reset 
>>>>>>>>>>>> itself.
>>>>>>>>>>>>
>>>>>>>>>>>> Ideally I would cancel all sources after the reset, but 
>>>>>>>>>>>> before starting any new work.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> You can see that if many different reset sources share 
>>>>>>>>>>>>>>>> same work struct what can happen is that the first to 
>>>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from 
>>>>>>>>>>>>>>>> full HW reset because his bad job did signal for 
>>>>>>>>>>>>>>>> example or because his hunged IP block was able to 
>>>>>>>>>>>>>>>> recover through SW reset but in the meantime another 
>>>>>>>>>>>>>>>> reset source who needed an actual HW reset just 
>>>>>>>>>>>>>>>> silently returned and we end up with unhandled reset 
>>>>>>>>>>>>>>>> request. True that today this happens only to job 
>>>>>>>>>>>>>>>> timeout reset sources that are handled form within the 
>>>>>>>>>>>>>>>> scheduler and won't use this single work struct but no 
>>>>>>>>>>>>>>>> one prevents a future case for this to happen and also, 
>>>>>>>>>>>>>>>> if we actually want to unify scheduler time out 
>>>>>>>>>>>>>>>> handlers within reset domain (which seems to me the 
>>>>>>>>>>>>>>>> right design approach) we won't be able to use just one 
>>>>>>>>>>>>>>>> work struct for this reason anyway.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Just to add to this point - a reset domain is 
>>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of 
>>>>>>>>>>>>>>> clients from various reset sources for one device, it 
>>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The 
>>>>>>>>>>>>>>> job timeout on one device may not eventually result in 
>>>>>>>>>>>>>>> a reset, but a RAS error happening on another device at 
>>>>>>>>>>>>>>> the same time would need a reset. The second device's 
>>>>>>>>>>>>>>> RAS error cannot return seeing that a reset work already 
>>>>>>>>>>>>>>> started, or ignore the reset work given that another 
>>>>>>>>>>>>>>> device has filled the reset data.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> When there is a reset domain, it should take care of the 
>>>>>>>>>>>>>>> work scheduled and keeping it in device or any other 
>>>>>>>>>>>>>>> level doesn't sound good.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>>>> Lijo
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the 
>>>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme 
>>>>>>>>>>>>>>>>> to decide whether you need to schedule a reset, e.g. 
>>>>>>>>>>>>>>>>> using an atomic counter in the shared work struct that 
>>>>>>>>>>>>>>>>> gets incremented when a client wants to trigger a 
>>>>>>>>>>>>>>>>> reset (atomic_add_return). If that counter is exactly 
>>>>>>>>>>>>>>>>> 1 after incrementing, you need to fill in the rest of 
>>>>>>>>>>>>>>>>> the work struct and schedule the work. Otherwise, it's 
>>>>>>>>>>>>>>>>> already scheduled (or another client is in the process 
>>>>>>>>>>>>>>>>> of scheduling it) and you just return. When the worker 
>>>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it 
>>>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting 
>>>>>>>>>>>>>>>>> a reset will schedule the worker again.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't 
>>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset 
>>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't 
>>>>>>>>>>>>>>>>>>> allocate anything on the stack either).
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>>>>>> allocation of work struct in 
>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover is different from any other 
>>>>>>>>>>>>>>>>>> local variable we allocate in any function we call ?
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for 
>>>>>>>>>>>>>>>>>> reset to complete ? Also, see in 
>>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) 
>>>>>>>>>>>>>>>>>> - the caller expects the reset to complete before he 
>>>>>>>>>>>>>>>>>> returns. I can probably work around it in RAS code by 
>>>>>>>>>>>>>>>>>> calling atomic_set(&ras->in_recovery, 0) from some 
>>>>>>>>>>>>>>>>>> callback within actual reset function but regarding 
>>>>>>>>>>>>>>>>>> sysfs it actually expects a result returned 
>>>>>>>>>>>>>>>>>> indicating whether the call was successful or not.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach 
>>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and 
>>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each 
>>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the 
>>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into 
>>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be a 
>>>>>>>>>>>>>>>>>>>> generic thing and not only for AMD.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 
>>>>>>>>>>>>>>>>>>>>>>>> +++--
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 
>>>>>>>>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */
>>>>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive 
>>>>>>>>>>>>>>>>>>>>>>>> reset for SRIOV */
>>>>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, 
>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending 
>>>>>>>>>>>>>>>>>>>>>>>> resets */
>>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 
>>>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) 
>>>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>
>>>>>>
>>>
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-12  6:06                                             ` Christian König
  2022-05-12  9:21                                               ` Lazar, Lijo
@ 2022-05-12 13:07                                               ` Andrey Grodzovsky
  2022-05-12 13:15                                                 ` Christian König
  1 sibling, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-12 13:07 UTC (permalink / raw)
  To: Christian König, Lazar, Lijo, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy


On 2022-05-12 02:06, Christian König wrote:
> Am 11.05.22 um 22:27 schrieb Andrey Grodzovsky:
>>
>> On 2022-05-11 11:39, Christian König wrote:
>>> Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky:
>>>> On 2022-05-11 11:20, Lazar, Lijo wrote:
>>>>>
>>>>>
>>>>> On 5/11/2022 7:28 PM, Christian König wrote:
>>>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>>>>>>> On 2022-05-11 03:38, Christian König wrote:
>>>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>>>>>>> [SNIP]
>>>>>>>>>> E.g. in the reset code (either before or after the reset, 
>>>>>>>>>> that's debatable) you do something like this:
>>>>>>>>>>
>>>>>>>>>> for (i = 0; i < num_ring; ++i)
>>>>>>>>>> cancel_delayed_work(ring[i]->scheduler....)
>>>>>>>>>> cancel_work(adev->ras_work);
>>>>>>>>>> cancel_work(adev->iofault_work);
>>>>>>>>>> cancel_work(adev->debugfs_work);
>>>>>>>>>> ...
>>>>>>>>>>
>>>>>>>>>> You don't really need to track which reset source has fired 
>>>>>>>>>> and which hasn't, because that would be racy again. Instead 
>>>>>>>>>> just bluntly reset all possible sources.
>>>>>>>>>>
>>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I don't say we care if it fired once or twice (I need to add a 
>>>>>>>>> fix to only insert reset work to pending reset list if it's 
>>>>>>>>> not already there), the point of using list (or array) to 
>>>>>>>>> which you add and from which you remove is that the logic of 
>>>>>>>>> this is encapsulated within reset domain. In your way we need 
>>>>>>>>> to be aware who exactly schedules reset work and explicitly 
>>>>>>>>> cancel them, this also means that for any new source added in 
>>>>>>>>> the future you will need to remember to add him
>>>>>>>>
>>>>>>>> I don't think that this is a valid argument. Additionally to 
>>>>>>>> the schedulers we probably just need less than a handful of 
>>>>>>>> reset sources, most likely even just one or two is enough.
>>>>>>>>
>>>>>>>> The only justification I can see of having additional separate 
>>>>>>>> reset sources would be if somebody wants to know if a specific 
>>>>>>>> source has been handled or not (e.g. call flush_work() or 
>>>>>>>> work_pending()). Like in the case of a reset triggered through 
>>>>>>>> debugfs.
>>>>>>>
>>>>>>>
>>>>>>> This is indeed one reason, another is as we said before that if 
>>>>>>> you share 'reset source' (meaning a delayed work) with another 
>>>>>>> client (i.e. RAS and KFD) it means you make assumption that the 
>>>>>>> other client always proceeds with the
>>>>>>> reset exactly the same way as you expect. So today we have this 
>>>>>>> only in scheduler vs non scheduler reset happening - non 
>>>>>>> scheduler reset clients assume the reset is always fully 
>>>>>>> executed in HW while scheduler based reset makes shortcuts and 
>>>>>>> not always does HW reset hence they cannot share 'reset source' 
>>>>>>> (delayed work). Yes, we can always add this in the future if and 
>>>>>>> when such problem will arise but no one will remember this then 
>>>>>>> and a new bug will be introduced and will take time to find and 
>>>>>>> resolve.
>>>>>>
>>>>>> Mhm, so your main concern is that we forget to correctly handle 
>>>>>> the new reset sources?
>>>>>>
>>>>>> How about we do it like this then:
>>>>>>
>>>>>> struct amdgpu_reset_domain {
>>>>>>      ....
>>>>>>      union {
>>>>>>          struct {
>>>>>>              struct work_item debugfs;
>>>>>>              struct work_item ras;
>>>>>>              ....
>>>>>>          };
>>>>>>          struct work_item array[]
>>>>>>      } reset_sources;
>>>>>> }
>>>>>>
>>>>>
>>>>> If it's only about static array,
>>>>>
>>>>> enum amdgpu_reset_soruce {
>>>>>
>>>>> AMDGPU_RESET_SRC_RAS,
>>>>> AMDGPU_RESET_SRC_ABC,
>>>>> .....
>>>>> AMDGPU_RESET_SRC_XYZ,
>>>>> AMDGPU_RESET_SRC_MAX
>>>>>
>>>>> };
>>>>>
>>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index 
>>>>> for each work item
>>>>>
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>
>>>>
>>>> It's possible though it makes harder to generalize reset_domain 
>>>> later for other drivers.
>>>> But still one caveat, look at amdgpu_recover_work_struct and it's 
>>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get,
>>>> At least for debugfs i need to return back the result of GPU reset 
>>>> and so I cannot store actual work items in the array mentioned above
>>>> but rather pointers to work_item because i need a way to get back 
>>>> the return value from gpu_recover like I do now in 
>>>> amdgpu_device_gpu_recover.
>>>
>>> You should try to avoid that as well.
>>>
>>> See when the debugfs reset is canceled because of a scheduler reset 
>>> you won't get a useful return code either.
>>>
>>> What we should do instead is to cache the status of the last reset 
>>> in the reset domain.
>>>
>>> Regards,
>>> Christian.
>>
>>
>> Another problem with this approach - to execute the actual GPU 
>> reset I need access to a concrete amdgpu_device pointer from work 
>> struct (see xgpu_ai_mailbox_flr_work) as example. If i store all work 
>> items in
>> array in amdgpu_reset_domain the most I can retrieve is the 
>> reset_domain struct itself which won't help since it's dynamically 
>> allocated, not embedded in hive or adev and can be one per device 
>> or per entire hive in case of XGMI and so there is no way for me to 
>> reach back to amdgpu_device. Back pointer to adev* from 
>> amdgpu_reset_domain will only work for single device but not for XGMI 
>> hive where there are multiple devices in a hive.
>
> Which is exactly the reason why I think we should always allocate the 
> hive structure, even if we only have one device. And a GPU reset 
> should then always work with the hive data structure and not adev.


I am not sure why the HIVE is the object we should work with; a hive is one 
use case, a single device is another, and then Lijo described something 
called a partition - which is what? A particular pipe within the GPU? What 
they all share in common,
IMHO, is that all of them use a reset domain when they want a recovery 
operation, so maybe GPU reset should be oriented to work with the reset domain?

Andrey


>
> Adding a pointer from your reset work item back to the hive is then 
> trivial.
>
> Regards,
> Christian.
>
>>
>> Andrey
>>
>>
>>>
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>>> Not 100% sure if that works, but something like that should do 
>>>>>> the trick.
>>>>>>
>>>>>> My main concern is that I don't want to allocate the work items 
>>>>>> on the stack and dynamic allocation (e.g. kmalloc) is usually not 
>>>>>> possible either.
>>>>>>
>>>>>> Additional to that putting/removing work items from a list, array 
>>>>>> or other container is a very common source for race conditions.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>>>> to the cancellation list which you showed above. In current 
>>>>>>>>> way all this done automatically within reset_domain code and 
>>>>>>>>> it's agnostic to specific driver and it's specific list of 
>>>>>>>>> reset sources. Also in case we would want to generalize 
>>>>>>>>> reset_domain to other GPU drivers (which was
>>>>>>>>> a plan as far as i remember) this explicit mention of each 
>>>>>>>>> reset works for cancellation is again less suitable in my 
>>>>>>>>> opinion.
>>>>>>>>
>>>>>>>> Well we could put the work item for the scheduler independent 
>>>>>>>> reset source into the reset domain as well. But I'm not sure 
>>>>>>>> those additional reset sources should be part of any common 
>>>>>>>> handling, that is largely amdgpu specific.
>>>>>>>
>>>>>>>
>>>>>>> So it's for sure more then one source for the reasons described 
>>>>>>> above, also note that for scheduler we already cancel delayed 
>>>>>>> work in drm_sched_stop so calling them again in amdgpu code kind 
>>>>>>> of superfluous.
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I 
>>>>>>>>>>>>> see a possible race where a new reset request is being 
>>>>>>>>>>>>> generated exactly after we finished the HW reset but 
>>>>>>>>>>>>> before we canceled out all pending resets - in such case 
>>>>>>>>>>>>> you wold not want to cancel this 'border line new' reset 
>>>>>>>>>>>>> request.
>>>>>>>>>>>>
>>>>>>>>>>>> Why not? Any new reset request directly after a hardware 
>>>>>>>>>>>> reset is most likely just falsely generated by the reset 
>>>>>>>>>>>> itself.
>>>>>>>>>>>>
>>>>>>>>>>>> Ideally I would cancel all sources after the reset, but 
>>>>>>>>>>>> before starting any new work.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> You can see that if many different reset sources share 
>>>>>>>>>>>>>>>> same work struct what can happen is that the first to 
>>>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from 
>>>>>>>>>>>>>>>> full HW reset because his bad job did signal for 
>>>>>>>>>>>>>>>> example or because his hunged IP block was able to 
>>>>>>>>>>>>>>>> recover through SW reset but in the meantime another 
>>>>>>>>>>>>>>>> reset source who needed an actual HW reset just 
>>>>>>>>>>>>>>>> silently returned and we end up with unhandled reset 
>>>>>>>>>>>>>>>> request. True that today this happens only to job 
>>>>>>>>>>>>>>>> timeout reset sources that are handled form within the 
>>>>>>>>>>>>>>>> scheduler and won't use this single work struct but no 
>>>>>>>>>>>>>>>> one prevents a future case for this to happen and also, 
>>>>>>>>>>>>>>>> if we actually want to unify scheduler time out 
>>>>>>>>>>>>>>>> handlers within reset domain (which seems to me the 
>>>>>>>>>>>>>>>> right design approach) we won't be able to use just one 
>>>>>>>>>>>>>>>> work struct for this reason anyway.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Just to add to this point - a reset domain is 
>>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of 
>>>>>>>>>>>>>>> clients from various reset sources for one device, it 
>>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The 
>>>>>>>>>>>>>>> job timeout on one device may not eventually result in 
>>>>>>>>>>>>>>> result, but a RAS error happening on another device at 
>>>>>>>>>>>>>>> the same time would need a reset. The second device's 
>>>>>>>>>>>>>>> RAS error cannot return seeing that a reset work already 
>>>>>>>>>>>>>>> started, or ignore the reset work given that another 
>>>>>>>>>>>>>>> device has filled the reset data.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> When there is a reset domain, it should take care of the 
>>>>>>>>>>>>>>> work scheduled and keeping it in device or any other 
>>>>>>>>>>>>>>> level doesn't sound good.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>>>> Lijo
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the 
>>>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme 
>>>>>>>>>>>>>>>>> to decide whether you need to schedule a reset, e.g. 
>>>>>>>>>>>>>>>>> using an atomic counter in the shared work struct that 
>>>>>>>>>>>>>>>>> gets incremented when a client wants to trigger a 
>>>>>>>>>>>>>>>>> reset (atomic_add_return). If that counter is exactly 
>>>>>>>>>>>>>>>>> 1 after incrementing, you need to fill in the rest of 
>>>>>>>>>>>>>>>>> the work struct and schedule the work. Otherwise, it's 
>>>>>>>>>>>>>>>>> already scheduled (or another client is in the process 
>>>>>>>>>>>>>>>>> of scheduling it) and you just return. When the worker 
>>>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it 
>>>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting 
>>>>>>>>>>>>>>>>> a reset will schedule the worker again.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't 
>>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset 
>>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't 
>>>>>>>>>>>>>>>>>>> allocate anything on the stack either).
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>>>>>> allocation of work struct in 
>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover is different from any other 
>>>>>>>>>>>>>>>>>> local variable we allocate in any function we call ?
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for 
>>>>>>>>>>>>>>>>>> reset to complete ? Also, see in 
>>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) 
>>>>>>>>>>>>>>>>>> - the caller expects the reset to complete before he 
>>>>>>>>>>>>>>>>>> returns. I can probably work around it in RAS code by 
>>>>>>>>>>>>>>>>>> calling atomic_set(&ras->in_recovery, 0) from some 
>>>>>>>>>>>>>>>>>> callback within actual reset function but regarding 
>>>>>>>>>>>>>>>>>> sysfs it actually expects a result returned 
>>>>>>>>>>>>>>>>>> indicating whether the call was successful or not.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach 
>>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and 
>>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each 
>>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the 
>>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into 
>>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be a 
>>>>>>>>>>>>>>>>>>>> generic thing and not only for AMD.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 
>>>>>>>>>>>>>>>>>>>>>>>> +++--
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 
>>>>>>>>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */
>>>>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive 
>>>>>>>>>>>>>>>>>>>>>>>> reset for SRIOV */
>>>>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, 
>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, *tmp;
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending 
>>>>>>>>>>>>>>>>>>>>>>>> resets */
>>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     void amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 
>>>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) 
>>>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_ai_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     static int xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>     static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>
>>>>>>
>>>
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-12 13:07                                               ` Andrey Grodzovsky
@ 2022-05-12 13:15                                                 ` Christian König
  2022-05-12 13:44                                                   ` Andrey Grodzovsky
  2022-05-13 15:41                                                   ` Andrey Grodzovsky
  0 siblings, 2 replies; 40+ messages in thread
From: Christian König @ 2022-05-12 13:15 UTC (permalink / raw)
  To: Andrey Grodzovsky, Lazar, Lijo, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy

Am 12.05.22 um 15:07 schrieb Andrey Grodzovsky:
>
> On 2022-05-12 02:06, Christian König wrote:
>> Am 11.05.22 um 22:27 schrieb Andrey Grodzovsky:
>>>
>>> On 2022-05-11 11:39, Christian König wrote:
>>>> Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky:
>>>>> On 2022-05-11 11:20, Lazar, Lijo wrote:
>>>>>>
>>>>>>
>>>>>> On 5/11/2022 7:28 PM, Christian König wrote:
>>>>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>>>>>>>> On 2022-05-11 03:38, Christian König wrote:
>>>>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>>>>>>>> [SNIP]
>>>>>>>>>>> E.g. in the reset code (either before or after the reset, 
>>>>>>>>>>> that's debatable) you do something like this:
>>>>>>>>>>>
>>>>>>>>>>> for (i = 0; i < num_ring; ++i)
>>>>>>>>>>> cancel_delayed_work(ring[i]->scheduler....)
>>>>>>>>>>> cancel_work(adev->ras_work);
>>>>>>>>>>> cancel_work(adev->iofault_work);
>>>>>>>>>>> cancel_work(adev->debugfs_work);
>>>>>>>>>>> ...
>>>>>>>>>>>
>>>>>>>>>>> You don't really need to track which reset source has fired 
>>>>>>>>>>> and which hasn't, because that would be racy again. Instead 
>>>>>>>>>>> just bluntly reset all possible sources.
>>>>>>>>>>>
>>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> I don't say we care if it fired once or twice (I need to add 
>>>>>>>>>> a fix to only insert reset work to pending reset list if it's 
>>>>>>>>>> not already there), the point of using list (or array) to 
>>>>>>>>>> which you add and from which you remove is that the logic of 
>>>>>>>>>> this is encapsulated within reset domain. In your way we need 
>>>>>>>>>> to be aware who exactly schedules reset work and explicitly 
>>>>>>>>>> cancel them, this also means that for any new source added in 
>>>>>>>>>> the future you will need to remember to add him
>>>>>>>>>
>>>>>>>>> I don't think that this is a valid argument. Additionally to 
>>>>>>>>> the schedulers we probably just need less than a handful of 
>>>>>>>>> reset sources, most likely even just one or two is enough.
>>>>>>>>>
>>>>>>>>> The only justification I can see of having additional separate 
>>>>>>>>> reset sources would be if somebody wants to know if a specific 
>>>>>>>>> source has been handled or not (e.g. call flush_work() or 
>>>>>>>>> work_pending()). Like in the case of a reset triggered through 
>>>>>>>>> debugfs.
>>>>>>>>
>>>>>>>>
>>>>>>>> This is indeed one reason, another is as we said before that if 
>>>>>>>> you share 'reset source' (meaning a delayed work) with another 
>>>>>>>> client (i.e. RAS and KFD) it means you make assumption that the 
>>>>>>>> other client always proceeds with the
>>>>>>>> reset exactly the same way as you expect. So today we have this 
>>>>>>>> only in scheduler vs non scheduler reset happening - non 
>>>>>>>> scheduler reset clients assume the reset is always fully 
>>>>>>>> executed in HW while scheduler based reset makes shortcuts and 
>>>>>>>> not always does HW reset hence they cannot share 'reset source' 
>>>>>>>> (delayed work). Yes, we can always add this in the future if 
>>>>>>>> and when such problem will arise but no one will remember this 
>>>>>>>> then and a new bug will be introduced and will take time to 
>>>>>>>> find and resolve.
>>>>>>>
>>>>>>> Mhm, so your main concern is that we forget to correctly handle 
>>>>>>> the new reset sources?
>>>>>>>
>>>>>>> How about we do it like this then:
>>>>>>>
>>>>>>> struct amdgpu_reset_domain {
>>>>>>>      ....
>>>>>>>      union {
>>>>>>>          struct {
>>>>>>>              struct work_item debugfs;
>>>>>>>              struct work_item ras;
>>>>>>>              ....
>>>>>>>          };
>>>>>>>          struct work_item array[]
>>>>>>>      } reset_sources;
>>>>>>> }
>>>>>>>
>>>>>>
>>>>>> If it's only about static array,
>>>>>>
>>>>>> enum amdgpu_reset_soruce {
>>>>>>
>>>>>> AMDGPU_RESET_SRC_RAS,
>>>>>> AMDGPU_RESET_SRC_ABC,
>>>>>> .....
>>>>>> AMDGPU_RESET_SRC_XYZ,
>>>>>> AMDGPU_RESET_SRC_MAX
>>>>>>
>>>>>> };
>>>>>>
>>>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index 
>>>>>> for each work item
>>>>>>
>>>>>>
>>>>>> Thanks,
>>>>>> Lijo
>>>>>
>>>>>
>>>>> It's possible though it makes harder to generalize reset_domain 
>>>>> later for other drivers.
>>>>> But still one caveat, look at amdgpu_recover_work_struct and it's 
>>>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get,
>>>>> At least for debugfs i need to return back the result of GPU reset 
>>>>> and so I cannot store actual work items in the array mentioned above
>>>>> but rather pointers to work_item because i need a way to get back 
>>>>> the return value from gpu_recover like I do now in 
>>>>> amdgpu_device_gpu_recover.
>>>>
>>>> You should try to avoid that as well.
>>>>
>>>> See when the debugfs reset is canceled because of a scheduler reset 
>>>> you won't get a useful return code either.
>>>>
>>>> What we should do instead is to cache the status of the last reset 
>>>> in the reset domain.
>>>>
>>>> Regards,
>>>> Christian.
>>>
>>>
>>> Another problem with this approach - to execute the actual GPU 
>>> reset I need access to a concrete amdgpu_device pointer from the work 
>>> struct (see xgpu_ai_mailbox_flr_work) as example. If i store all 
>>> work items in
>>> array in amdgpu_reset_domain the most i can only retrieve is the 
>>> reset_domain struct itself which won't help since it's dynamically 
>>> allocated, not embedded in hive or adev and can be one per 
>>> device or per entire hive in case of XGMI and so there is no way for 
>>> me to reach back to amdgpu_device. Back pointer to adev* from 
>>> amdgpu_reset_domain will only work for single device but not for 
>>> XGMI hive where there are multiple devices in a hive.
>>
>> Which is exactly the reason why I think we should always allocate the 
>> hive structure, even if we only have one device. And a GPU reset 
>> should then always work with the hive data structure and not adev.
>
>
> I am not sure why HIVE is the object we should work with, hive is one 
> use case, single device is another, then Lijo described something 
> called partition which is what ? Particular pipe within GPU ?. What 
> they all share in common
> IMHO is that all of them use reset domain when they want a recovery 
> operation, so maybe GPU reset should be oriented to work with reset 
> domain ?

Yes, exactly that's the idea.

Basically the reset domain knows which amdgpu devices it needs to reset 
together.

Whether you represent that as always having a hive, even when there is 
only one device in it, or you put an array of devices which need to be 
reset together into the reset domain, doesn't matter.

Maybe go for the latter approach; that is probably a bit cleaner and 
requires less code to change.

Christian.

>
> Andrey
>
>
>>
>> Adding a pointer from your reset work item back to the hive is then 
>> trivial.
>>
>> Regards,
>> Christian.
>>
>>>
>>> Andrey
>>>
>>>
>>>>
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>>> Not 100% sure if that works, but something like that should do 
>>>>>>> the trick.
>>>>>>>
>>>>>>> My main concern is that I don't want to allocate the work items 
>>>>>>> on the stack and dynamic allocation (e.g. kmalloc) is usually 
>>>>>>> not possible either.
>>>>>>>
>>>>>>> Additional to that putting/removing work items from a list, 
>>>>>>> array or other container is a very common source for race 
>>>>>>> conditions.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>>
>>>>>>>>>> to the cancellation list which you showed above. In current 
>>>>>>>>>> way all this done automatically within reset_domain code and 
>>>>>>>>>> it's agnostic to specific driver and it's specific list of 
>>>>>>>>>> reset sources. Also in case we would want to generalize 
>>>>>>>>>> reset_domain to other GPU drivers (which was
>>>>>>>>>> a plan as far as i remember) this explicit mention of each 
>>>>>>>>>> reset works for cancellation is again less suitable in my 
>>>>>>>>>> opinion.
>>>>>>>>>
>>>>>>>>> Well we could put the work item for the scheduler independent 
>>>>>>>>> reset source into the reset domain as well. But I'm not sure 
>>>>>>>>> those additional reset sources should be part of any common 
>>>>>>>>> handling, that is largely amdgpu specific.
>>>>>>>>
>>>>>>>>
>>>>>>>> So it's for sure more then one source for the reasons described 
>>>>>>>> above, also note that for scheduler we already cancel delayed 
>>>>>>>> work in drm_sched_stop so calling them again in amdgpu code 
>>>>>>>> kind of superfluous.
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I 
>>>>>>>>>>>>>> see a possible race where a new reset request is being 
>>>>>>>>>>>>>> generated exactly after we finished the HW reset but 
>>>>>>>>>>>>>> before we canceled out all pending resets - in such case 
>>>>>>>>>>>>>> you wold not want to cancel this 'border line new' reset 
>>>>>>>>>>>>>> request.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Why not? Any new reset request directly after a hardware 
>>>>>>>>>>>>> reset is most likely just falsely generated by the reset 
>>>>>>>>>>>>> itself.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Ideally I would cancel all sources after the reset, but 
>>>>>>>>>>>>> before starting any new work.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> You can see that if many different reset sources share 
>>>>>>>>>>>>>>>>> same work struct what can happen is that the first to 
>>>>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from 
>>>>>>>>>>>>>>>>> full HW reset because his bad job did signal for 
>>>>>>>>>>>>>>>>> example or because his hunged IP block was able to 
>>>>>>>>>>>>>>>>> recover through SW reset but in the meantime another 
>>>>>>>>>>>>>>>>> reset source who needed an actual HW reset just 
>>>>>>>>>>>>>>>>> silently returned and we end up with unhandled reset 
>>>>>>>>>>>>>>>>> request. True that today this happens only to job 
>>>>>>>>>>>>>>>>> timeout reset sources that are handled form within the 
>>>>>>>>>>>>>>>>> scheduler and won't use this single work struct but no 
>>>>>>>>>>>>>>>>> one prevents a future case for this to happen and 
>>>>>>>>>>>>>>>>> also, if we actually want to unify scheduler time out 
>>>>>>>>>>>>>>>>> handlers within reset domain (which seems to me the 
>>>>>>>>>>>>>>>>> right design approach) we won't be able to use just 
>>>>>>>>>>>>>>>>> one work struct for this reason anyway.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Just to add to this point - a reset domain is 
>>>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of 
>>>>>>>>>>>>>>>> clients from various reset sources for one device, it 
>>>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The 
>>>>>>>>>>>>>>>> job timeout on one device may not eventually result in 
>>>>>>>>>>>>>>>> result, but a RAS error happening on another device at 
>>>>>>>>>>>>>>>> the same time would need a reset. The second device's 
>>>>>>>>>>>>>>>> RAS error cannot return seeing that a reset work 
>>>>>>>>>>>>>>>> already started, or ignore the reset work given that 
>>>>>>>>>>>>>>>> another device has filled the reset data.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> When there is a reset domain, it should take care of 
>>>>>>>>>>>>>>>> the work scheduled and keeping it in device or any 
>>>>>>>>>>>>>>>> other level doesn't sound good.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>>>>> Lijo
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for 
>>>>>>>>>>>>>>>>>> the reset domain. You could implement a lock-less 
>>>>>>>>>>>>>>>>>> scheme to decide whether you need to schedule a 
>>>>>>>>>>>>>>>>>> reset, e.g. using an atomic counter in the shared 
>>>>>>>>>>>>>>>>>> work struct that gets incremented when a client wants 
>>>>>>>>>>>>>>>>>> to trigger a reset (atomic_add_return). If that 
>>>>>>>>>>>>>>>>>> counter is exactly 1 after incrementing, you need to 
>>>>>>>>>>>>>>>>>> fill in the rest of the work struct and schedule the 
>>>>>>>>>>>>>>>>>> work. Otherwise, it's already scheduled (or another 
>>>>>>>>>>>>>>>>>> client is in the process of scheduling it) and you 
>>>>>>>>>>>>>>>>>> just return. When the worker finishes (after 
>>>>>>>>>>>>>>>>>> confirming a successful reset), it resets the counter 
>>>>>>>>>>>>>>>>>> to 0, so the next client requesting a reset will 
>>>>>>>>>>>>>>>>>> schedule the worker again.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't 
>>>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset 
>>>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't 
>>>>>>>>>>>>>>>>>>>> allocate anything on the stack either).
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack 
>>>>>>>>>>>>>>>>>>> allocations - we do it all the time when we call 
>>>>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack 
>>>>>>>>>>>>>>>>>>> allocation of work struct in 
>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover is different from any 
>>>>>>>>>>>>>>>>>>> other local variable we allocate in any function we 
>>>>>>>>>>>>>>>>>>> call ?
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for 
>>>>>>>>>>>>>>>>>>> reset to complete ? Also, see in 
>>>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) 
>>>>>>>>>>>>>>>>>>> - the caller expects the reset to complete before he 
>>>>>>>>>>>>>>>>>>> returns. I can probably work around it in RAS code 
>>>>>>>>>>>>>>>>>>> by calling atomic_set(&ras->in_recovery, 0) from 
>>>>>>>>>>>>>>>>>>> some callback within actual reset function but 
>>>>>>>>>>>>>>>>>>> regarding sysfs it actually expects a result 
>>>>>>>>>>>>>>>>>>> returned indicating whether the call was successful 
>>>>>>>>>>>>>>>>>>> or not.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach 
>>>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and 
>>>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each 
>>>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the 
>>>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into 
>>>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be 
>>>>>>>>>>>>>>>>>>>>> a generic things and not only or AMD.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 
>>>>>>>>>>>>>>>>>>>>>>>>> 17 +++--
>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 
>>>>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 
>>>>>>>>>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */
>>>>>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain); 
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive 
>>>>>>>>>>>>>>>>>>>>>>>>> reset for SRIOV */
>>>>>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>> *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work = 
>>>>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>>>>>   + INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, 
>>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, 
>>>>>>>>>>>>>>>>>>>>>>>>> *tmp;
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending 
>>>>>>>>>>>>>>>>>>>>>>>>> resets */
>>>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>     void 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 
>>>>>>>>>>>>>>>>>>>>>>>>> << 0) /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 
>>>>>>>>>>>>>>>>>>>>>>>>> 1) /* sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work(struct work_struct 
>>>>>>>>>>>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>     static int 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_request_init_data(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work(struct work_struct 
>>>>>>>>>>>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work(struct work_struct 
>>>>>>>>>>>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>
>>>>>>>
>>>>
>>


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-12 13:15                                                 ` Christian König
@ 2022-05-12 13:44                                                   ` Andrey Grodzovsky
  2022-05-13 15:41                                                   ` Andrey Grodzovsky
  1 sibling, 0 replies; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-12 13:44 UTC (permalink / raw)
  To: Christian König, Lazar, Lijo, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy

[-- Attachment #1: Type: text/plain, Size: 1218 bytes --]

Sure, I will investigate that. What about the ticket which Lijo raised 
which was basically doing 8 resets instead of one? Lijo - can this 
ticket wait until I come up with this new design for the amdgpu reset 
function, or do you need a quick solution now, in which case we can use 
the already existing patch temporarily.

Andrey

On 2022-05-12 09:15, Christian König wrote:
>> I am not sure why HIVE is the object we should work with, hive is one 
>> use case, single device is another, then Lijo described something 
>> called partition which is what ? Particular pipe within GPU ?. What 
>> they all share in common
>> IMHO is that all of them use reset domain when they want a recovery 
>> operation, so maybe GPU reset should be oriented to work with reset 
>> domain ?
>
> Yes, exactly that's the idea.
>
> Basically the reset domain knowns which amdgpu devices it needs to 
> reset together.
>
> If you then represent that so that you always have a hive even when 
> you only have one device in it, or if you put an array of devices 
> which needs to be reset together into the reset domain doesn't matter.
>
> Maybe go for the later approach, that is probably a bit cleaner and 
> less code to change.
>
> Christian. 

[-- Attachment #2: Type: text/html, Size: 1819 bytes --]

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-12 13:15                                                 ` Christian König
  2022-05-12 13:44                                                   ` Andrey Grodzovsky
@ 2022-05-13 15:41                                                   ` Andrey Grodzovsky
  2022-05-16 14:12                                                     ` Andrey Grodzovsky
  1 sibling, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-13 15:41 UTC (permalink / raw)
  To: Christian König, Lazar, Lijo, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy


On 2022-05-12 09:15, Christian König wrote:
> Am 12.05.22 um 15:07 schrieb Andrey Grodzovsky:
>>
>> On 2022-05-12 02:06, Christian König wrote:
>>> Am 11.05.22 um 22:27 schrieb Andrey Grodzovsky:
>>>>
>>>> On 2022-05-11 11:39, Christian König wrote:
>>>>> Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky:
>>>>>> On 2022-05-11 11:20, Lazar, Lijo wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 5/11/2022 7:28 PM, Christian König wrote:
>>>>>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky:
>>>>>>>>> On 2022-05-11 03:38, Christian König wrote:
>>>>>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky:
>>>>>>>>>>> [SNIP]
>>>>>>>>>>>> E.g. in the reset code (either before or after the reset, 
>>>>>>>>>>>> that's debatable) you do something like this:
>>>>>>>>>>>>
>>>>>>>>>>>> for (i = 0; i < num_ring; ++i)
>>>>>>>>>>>> cancel_delayed_work(ring[i]->scheduler....)
>>>>>>>>>>>> cancel_work(adev->ras_work);
>>>>>>>>>>>> cancel_work(adev->iofault_work);
>>>>>>>>>>>> cancel_work(adev->debugfs_work);
>>>>>>>>>>>> ...
>>>>>>>>>>>>
>>>>>>>>>>>> You don't really need to track which reset source has fired 
>>>>>>>>>>>> and which hasn't, because that would be racy again. Instead 
>>>>>>>>>>>> just bluntly reset all possible sources.
>>>>>>>>>>>>
>>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> I don't say we care if it fired once or twice (I need to add 
>>>>>>>>>>> a fix to only insert reset work to pending reset list if 
>>>>>>>>>>> it's not already there), the point of using list (or array) 
>>>>>>>>>>> to which you add and from which you remove is that the logic 
>>>>>>>>>>> of this is encapsulated within reset domain. In your way we 
>>>>>>>>>>> need to be aware who exactly schedules reset work and 
>>>>>>>>>>> explicitly cancel them, this also means that for any new 
>>>>>>>>>>> source added in the future you will need to remember to add him
>>>>>>>>>>
>>>>>>>>>> I don't think that this is a valid argument. Additionally to 
>>>>>>>>>> the schedulers we probably just need less than a handful of 
>>>>>>>>>> reset sources, most likely even just one or two is enough.
>>>>>>>>>>
>>>>>>>>>> The only justification I can see of having additional 
>>>>>>>>>> separate reset sources would be if somebody wants to know if 
>>>>>>>>>> a specific source has been handled or not (e.g. call 
>>>>>>>>>> flush_work() or work_pending()). Like in the case of a reset 
>>>>>>>>>> triggered through debugfs.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> This is indeed one reason, another is as we said before that 
>>>>>>>>> if you share 'reset source' (meaning a delayed work) with 
>>>>>>>>> another client (i.e. RAS and KFD) it means you make assumption 
>>>>>>>>> that the other client always proceeds with the
>>>>>>>>> reset exactly the same way as you expect. So today we have 
>>>>>>>>> this only in scheduler vs non scheduler reset happening - non 
>>>>>>>>> scheduler reset clients assume the reset is always fully 
>>>>>>>>> executed in HW while scheduler based reset makes shortcuts and 
>>>>>>>>> not always does HW reset hence they cannot share 'reset 
>>>>>>>>> source' (delayed work). Yes, we can always add this in the 
>>>>>>>>> future if and when such problem will arise but no one will 
>>>>>>>>> remember this then and a new bug will be introduced and will 
>>>>>>>>> take time to find and resolve.
>>>>>>>>
>>>>>>>> Mhm, so your main concern is that we forget to correctly handle 
>>>>>>>> the new reset sources?
>>>>>>>>
>>>>>>>> How about we do it like this then:
>>>>>>>>
>>>>>>>> struct amdgpu_reset_domain {
>>>>>>>>      ....
>>>>>>>>      union {
>>>>>>>>          struct {
>>>>>>>>              struct work_item debugfs;
>>>>>>>>              struct work_item ras;
>>>>>>>>              ....
>>>>>>>>          };
>>>>>>>>          struct work_item array[]
>>>>>>>>      } reset_sources;
>>>>>>>> }
>>>>>>>>
>>>>>>>
>>>>>>> If it's only about static array,
>>>>>>>
>>>>>>> enum amdgpu_reset_soruce {
>>>>>>>
>>>>>>> AMDGPU_RESET_SRC_RAS,
>>>>>>> AMDGPU_RESET_SRC_ABC,
>>>>>>> .....
>>>>>>> AMDGPU_RESET_SRC_XYZ,
>>>>>>> AMDGPU_RESET_SRC_MAX
>>>>>>>
>>>>>>> };
>>>>>>>
>>>>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index 
>>>>>>> for each work item
>>>>>>>
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Lijo
>>>>>>
>>>>>>
>>>>>> It's possible though it makes harder to generalize reset_domain 
>>>>>> later for other drivers.
>>>>>> But still one caveat, look at amdgpu_recover_work_struct and it's 
>>>>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get,
>>>>>> At least for debugfs i need to return back the result of GPU 
>>>>>> reset and so I cannot store actual work items in the array 
>>>>>> mentioned above
>>>>>> but rather pointers to work_item because i need a way to get back 
>>>>>> the return value from gpu_recover like I do now in 
>>>>>> amdgpu_device_gpu_recover.
>>>>>
>>>>> You should try to avoid that as well.
>>>>>
>>>>> See when the debugfs reset is canceled because of a scheduler 
>>>>> reset you won't get a useful return code either.
>>>>>
>>>>> What we should do instead is to cache the status of the last reset 
>>>>> in the reset domain.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>
>>>>
>>>> Another problem with this approach -  to execute  the actaul GPU 
>>>> reset I need accesses  to concrete amdgpu_device pointer from work 
>>>> struct (see xgpu_ai_mailbox_flr_work) as example. If i store all 
>>>> work items in
>>>> array in amdgpu_reset_domain the most i can only retrieve is the 
>>>> reset_domain struct itself which won't help since it's dynamically 
>>>> allocated, not embedded in hive or adev and can can be one per 
>>>> device or per entire hive in case of XGMI and so there is no way 
>>>> for me to reach back to amdgpu_device. Back pointer to adev* from 
>>>> amdgpu_reset_domain will only work for single device but not for 
>>>> XGMI hive where there are multiple devices in a hive.
>>>
>>> Which is exactly the reason why I think we should always allocate 
>>> the hive structure, even if we only have one device. And a GPU reset 
>>> should then always work with the hive data structure and not adev.
>>
>>
>> I am not sure why HIVE is the object we should work with, hive is one 
>> use case, single device is another, then Lijo described something 
>> called partition which is what ? Particular pipe within GPU ?. What 
>> they all share in common
>> IMHO is that all of them use reset domain when they want a recovery 
>> operation, so maybe GPU reset should be oriented to work with reset 
>> domain ?
>
> Yes, exactly that's the idea.
>
> Basically the reset domain knowns which amdgpu devices it needs to 
> reset together.
>
> If you then represent that so that you always have a hive even when 
> you only have one device in it, or if you put an array of devices 
> which needs to be reset together into the reset domain doesn't matter.
>
> Maybe go for the later approach, that is probably a bit cleaner and 
> less code to change.
>
> Christian.


Unfortunately this approach also raises a few difficulties -
First - if holding an array of devices in reset_domain, then when you come 
to the GPU reset function you don't really know which adev is the one 
that triggered the reset, and this is actually essential to some procedures 
like emergency restart.

Second - in the XGMI case we must take into account that one of the hive 
members might go away at runtime (I could do echo 1 > 
/sysfs/pci_id/remove on it for example at any moment) - so now we need 
to maintain this array and mark such an entry with NULL, probably on XGMI 
node removal, and then there might be hot insertion, and all this adds 
more complications.

I now tend to prefer your initial solution for it's simplicity and the 
result will be what we need -

"E.g. in the reset code (either before or after the reset, that's 
debatable) you do something like this:

for (i = 0; i < num_ring; ++i)
cancel_delayed_work(ring[i]->scheduler....)
cancel_work(adev->ras_work);
cancel_work(adev->iofault_work);
cancel_work(adev->debugfs_work);
"

And while here for each new reset source you need to remember to add 
another line of code, the same can be said about the adev* array in 
reset_context, as you will need to remember to add a new source there anyway.

Let me know what you think.

Andrey


>
>>
>> Andrey
>>
>>
>>>
>>> Adding a pointer from your reset work item back to the hive is then 
>>> trivial.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>>
>>>>>>>> Not 100% sure if that works, but something like that should do 
>>>>>>>> the trick.
>>>>>>>>
>>>>>>>> My main concern is that I don't want to allocate the work items 
>>>>>>>> on the stack and dynamic allocation (e.g. kmalloc) is usually 
>>>>>>>> not possible either.
>>>>>>>>
>>>>>>>> Additional to that putting/removing work items from a list, 
>>>>>>>> array or other container is a very common source for race 
>>>>>>>> conditions.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>> to the cancellation list which you showed above. In current 
>>>>>>>>>>> way all this done automatically within reset_domain code and 
>>>>>>>>>>> it's agnostic to specific driver and it's specific list of 
>>>>>>>>>>> reset sources. Also in case we would want to generalize 
>>>>>>>>>>> reset_domain to other GPU drivers (which was
>>>>>>>>>>> a plan as far as i remember) this explicit mention of each 
>>>>>>>>>>> reset works for cancellation is again less suitable in my 
>>>>>>>>>>> opinion.
>>>>>>>>>>
>>>>>>>>>> Well we could put the work item for the scheduler independent 
>>>>>>>>>> reset source into the reset domain as well. But I'm not sure 
>>>>>>>>>> those additional reset sources should be part of any common 
>>>>>>>>>> handling, that is largely amdgpu specific.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> So it's for sure more then one source for the reasons 
>>>>>>>>> described above, also note that for scheduler we already 
>>>>>>>>> cancel delayed work in drm_sched_stop so calling them again in 
>>>>>>>>> amdgpu code kind of superfluous.
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> The only difference is I chose to do the canceling right 
>>>>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I 
>>>>>>>>>>>>>>> see a possible race where a new reset request is being 
>>>>>>>>>>>>>>> generated exactly after we finished the HW reset but 
>>>>>>>>>>>>>>> before we canceled out all pending resets - in such case 
>>>>>>>>>>>>>>> you wold not want to cancel this 'border line new' reset 
>>>>>>>>>>>>>>> request.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Why not? Any new reset request directly after a hardware 
>>>>>>>>>>>>>> reset is most likely just falsely generated by the reset 
>>>>>>>>>>>>>> itself.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Ideally I would cancel all sources after the reset, but 
>>>>>>>>>>>>>> before starting any new work.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> You can see that if many different reset sources 
>>>>>>>>>>>>>>>>>> share same work struct what can happen is that the 
>>>>>>>>>>>>>>>>>> first to obtain the lock you describe bellow might 
>>>>>>>>>>>>>>>>>> opt out from full HW reset because his bad job did 
>>>>>>>>>>>>>>>>>> signal for example or because his hunged IP block was 
>>>>>>>>>>>>>>>>>> able to recover through SW reset but in the meantime 
>>>>>>>>>>>>>>>>>> another reset source who needed an actual HW reset 
>>>>>>>>>>>>>>>>>> just silently returned and we end up with unhandled 
>>>>>>>>>>>>>>>>>> reset request. True that today this happens only to 
>>>>>>>>>>>>>>>>>> job timeout reset sources that are handled form 
>>>>>>>>>>>>>>>>>> within the scheduler and won't use this single work 
>>>>>>>>>>>>>>>>>> struct but no one prevents a future case for this to 
>>>>>>>>>>>>>>>>>> happen and also, if we actually want to unify 
>>>>>>>>>>>>>>>>>> scheduler time out handlers within reset domain 
>>>>>>>>>>>>>>>>>> (which seems to me the right design approach) we 
>>>>>>>>>>>>>>>>>> won't be able to use just one work struct for this 
>>>>>>>>>>>>>>>>>> reason anyway.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Just to add to this point - a reset domain is 
>>>>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of 
>>>>>>>>>>>>>>>>> clients from various reset sources for one device, it 
>>>>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The 
>>>>>>>>>>>>>>>>> job timeout on one device may not eventually result in 
>>>>>>>>>>>>>>>>> result, but a RAS error happening on another device at 
>>>>>>>>>>>>>>>>> the same time would need a reset. The second device's 
>>>>>>>>>>>>>>>>> RAS error cannot return seeing that a reset work 
>>>>>>>>>>>>>>>>> already started, or ignore the reset work given that 
>>>>>>>>>>>>>>>>> another device has filled the reset data.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> When there is a reset domain, it should take care of 
>>>>>>>>>>>>>>>>> the work scheduled and keeping it in device or any 
>>>>>>>>>>>>>>>>> other level doesn't sound good.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>>>>>> Lijo
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain 
>>>>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for 
>>>>>>>>>>>>>>>>>>> the reset domain. You could implement a lock-less 
>>>>>>>>>>>>>>>>>>> scheme to decide whether you need to schedule a 
>>>>>>>>>>>>>>>>>>> reset, e.g. using an atomic counter in the shared 
>>>>>>>>>>>>>>>>>>> work struct that gets incremented when a client 
>>>>>>>>>>>>>>>>>>> wants to trigger a reset (atomic_add_return). If 
>>>>>>>>>>>>>>>>>>> that counter is exactly 1 after incrementing, you 
>>>>>>>>>>>>>>>>>>> need to fill in the rest of the work struct and 
>>>>>>>>>>>>>>>>>>> schedule the work. Otherwise, it's already scheduled 
>>>>>>>>>>>>>>>>>>> (or another client is in the process of scheduling 
>>>>>>>>>>>>>>>>>>> it) and you just return. When the worker finishes 
>>>>>>>>>>>>>>>>>>> (after confirming a successful reset), it resets the 
>>>>>>>>>>>>>>>>>>> counter to 0, so the next client requesting a reset 
>>>>>>>>>>>>>>>>>>> will schedule the worker again.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>   Felix
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't 
>>>>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset 
>>>>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't 
>>>>>>>>>>>>>>>>>>>>> allocate anything on the stack either).
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding 
>>>>>>>>>>>>>>>>>>>> stack allocations - we do it all the time when we 
>>>>>>>>>>>>>>>>>>>> call functions, even during GPU resets, how on 
>>>>>>>>>>>>>>>>>>>> stack allocation of work struct in 
>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover is different from any 
>>>>>>>>>>>>>>>>>>>> other local variable we allocate in any function we 
>>>>>>>>>>>>>>>>>>>> call ?
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for 
>>>>>>>>>>>>>>>>>>>> reset to complete ? Also, see in 
>>>>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get 
>>>>>>>>>>>>>>>>>>>> (debugfs) - the caller expects the reset to 
>>>>>>>>>>>>>>>>>>>> complete before he returns. I can probably work 
>>>>>>>>>>>>>>>>>>>> around it in RAS code by calling 
>>>>>>>>>>>>>>>>>>>> atomic_set(&ras->in_recovery, 0) from some callback 
>>>>>>>>>>>>>>>>>>>> within actual reset function but regarding sysfs it 
>>>>>>>>>>>>>>>>>>>> actually expects a result returned indicating 
>>>>>>>>>>>>>>>>>>>> whether the call was successful or not.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach 
>>>>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and 
>>>>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each 
>>>>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the 
>>>>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into 
>>>>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be 
>>>>>>>>>>>>>>>>>>>>>> a generic things and not only or AMD.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky 
>>>>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
>>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 
>>>>>>>>>>>>>>>>>>>>>>>>>> 17 +++--
>>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
>>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 
>>>>>>>>>>>>>>>>>>>>>>>>>> 73 +++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
>>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
>>>>>>>>>>>>>>>>>>>>>>>>>>   8 files changed, 104 insertions(+), 24 
>>>>>>>>>>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644
>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@
>>>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_fdinfo.h"
>>>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_mca.h"
>>>>>>>>>>>>>>>>>>>>>>>>>>   #include "amdgpu_ras.h"
>>>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>>>>     #define MAX_GPU_INSTANCE 16
>>>>>>>>>>>>>>>>>>>>>>>>>>   @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry {
>>>>>>>>>>>>>>>>>>>>>>>>>>       bool grbm_indexed;
>>>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>>>   -enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>>>>> -};
>>>>>>>>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_video_codec_info {
>>>>>>>>>>>>>>>>>>>>>>>>>>       u32 codec_type;
>>>>>>>>>>>>>>>>>>>>>>>>>>       u32 max_width;
>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644
>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> +    /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */
>>>>>>>>>>>>>>>>>>>>>>>>>> +    tmp_adev = 
>>>>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>>>>>>>>>>>>>>> + reset_list);
>>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain); 
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>>       /* Actual ASIC resets if needed.*/
>>>>>>>>>>>>>>>>>>>>>>>>>>       /* Host driver will handle XGMI hive 
>>>>>>>>>>>>>>>>>>>>>>>>>> reset for SRIOV */
>>>>>>>>>>>>>>>>>>>>>>>>>>       if (amdgpu_sriov_vf(adev)) {
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>>     struct amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct base;
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev;
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_job *job;
>>>>>>>>>>>>>>>>>>>>>>>>>>       int ret;
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base);
>>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, 
>>>>>>>>>>>>>>>>>>>>>>>>>> recover_work->job);
>>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_recover_work_struct work 
>>>>>>>>>>>>>>>>>>>>>>>>>> = {.adev = adev, .job = job};
>>>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&work.base, 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work);
>>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node);
>>>>>>>>>>>>>>>>>>>>>>>>>>         if 
>>>>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>>>> &work.base))
>>>>>>>>>>>>>>>>>>>>>>>>>>           return -EAGAIN;
>>>>>>>>>>>>>>>>>>>>>>>>>>   - flush_work(&work.base);
>>>>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base);
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>>>> &work.base);
>>>>>>>>>>>>>>>>>>>>>>>>>>         return work.ret;
>>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644
>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain 
>>>>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d
>>>>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem);
>>>>>>>>>>>>>>>>>>>>>>>>>>   + 
>>>>>>>>>>>>>>>>>>>>>>>>>> INIT_LIST_HEAD(&reset_domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>>       return reset_domain;
>>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>>   diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644
>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@
>>>>>>>>>>>>>>>>>>>>>>>>>>   #ifndef __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>>>>   #define __AMDGPU_RESET_H__
>>>>>>>>>>>>>>>>>>>>>>>>>>   -#include "amdgpu.h"
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h>
>>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h>
>>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h>
>>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h>
>>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h>
>>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h>
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device;
>>>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job;
>>>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info;
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>>     enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>>>>   @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
>>>>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1,
>>>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method {
>>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1,
>>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0,
>>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0,
>>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1,
>>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2,
>>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO,
>>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI,
>>>>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method method;
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *reset_req_dev;
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context {
>>>>>>>>>>>>>>>>>>>>>>>>>>       unsigned long flags;
>>>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>>>   +struct amdgpu_reset_control;
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_handler {
>>>>>>>>>>>>>>>>>>>>>>>>>>       enum amd_reset_method reset_method;
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct list_head handler_list;
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type {
>>>>>>>>>>>>>>>>>>>>>>>>>>       XGMI_HIVE
>>>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>>>   +
>>>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct {
>>>>>>>>>>>>>>>>>>>>>>>>>> +    struct delayed_work base;
>>>>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head node;
>>>>>>>>>>>>>>>>>>>>>>>>>> +};
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>>   struct amdgpu_reset_domain {
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct kref refcount;
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct workqueue_struct *wq;
>>>>>>>>>>>>>>>>>>>>>>>>>>       enum amdgpu_reset_domain_type type;
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct rw_semaphore sem;
>>>>>>>>>>>>>>>>>>>>>>>>>>       atomic_t in_gpu_reset;
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> +    struct list_head pending_works;
>>>>>>>>>>>>>>>>>>>>>>>>>> +    struct mutex reset_lock;
>>>>>>>>>>>>>>>>>>>>>>>>>>   };
>>>>>>>>>>>>>>>>>>>>>>>>>>     @@ -113,9 +146,43 @@ static inline void 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom
>>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>>     static inline bool 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>>> -    return queue_work(domain->wq, work);
>>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> +    if (!queue_delayed_work(domain->wq, 
>>>>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) {
>>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>>> +        return false;
>>>>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, 
>>>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works);
>>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> +    return true;
>>>>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain,
>>>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work)
>>>>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node);
>>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain)
>>>>>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct *entry, 
>>>>>>>>>>>>>>>>>>>>>>>>>> *tmp;
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, 
>>>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) {
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node);
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> +        /* Stop any other related pending 
>>>>>>>>>>>>>>>>>>>>>>>>>> resets */
>>>>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base);
>>>>>>>>>>>>>>>>>>>>>>>>>> +    }
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock);
>>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>>     void 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_lock_reset_domain(struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain);
>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644
>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@
>>>>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_VIRT_H
>>>>>>>>>>>>>>>>>>>>>>>>>>     #include "amdgv_sriovmsg.h"
>>>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h"
>>>>>>>>>>>>>>>>>>>>>>>>>>     #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 
>>>>>>>>>>>>>>>>>>>>>>>>>> << 0) /* vBIOS is sr-iov ready */
>>>>>>>>>>>>>>>>>>>>>>>>>>   #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 
>>>>>>>>>>>>>>>>>>>>>>>>>> 1) /* sr-iov is enabled on this GPU */
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt {
>>>>>>>>>>>>>>>>>>>>>>>>>>       uint32_t reg_val_offs;
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src ack_irq;
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_irq_src rcv_irq;
>>>>>>>>>>>>>>>>>>>>>>>>>> -    struct work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_reset_work_struct flr_work;
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_mm_table mm_table;
>>>>>>>>>>>>>>>>>>>>>>>>>>       const struct amdgpu_virt_ops *ops;
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_vf_error_buffer vf_errors;
>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644
>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work(struct work_struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>>>> flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>>>       int timeout = 
>>>>>>>>>>>>>>>>>>>>>>>>>> AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>>>>   @@ -380,7 +380,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>>> INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>>     static int 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_request_init_data(struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
>>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644
>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work(struct work_struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>>>> flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>>>       int timeout = 
>>>>>>>>>>>>>>>>>>>>>>>>>> NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>>>>>>>>   @@ -407,7 +407,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>>> INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644
>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>>     static void 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work(struct work_struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>>> -    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>>>> flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>> +    struct amdgpu_virt *virt = 
>>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work);
>>>>>>>>>>>>>>>>>>>>>>>>>>       struct amdgpu_device *adev = 
>>>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>>>>>>>>         /* wait until RCV_MSG become 3 */
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>>>           return r;
>>>>>>>>>>>>>>>>>>>>>>>>>>       }
>>>>>>>>>>>>>>>>>>>>>>>>>>   - INIT_WORK(&adev->virt.flr_work, 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>>> INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node);
>>>>>>>>>>>>>>>>>>>>>>>>>>         return 0;
>>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device 
>>>>>>>>>>>>>>>>>>>>>>>>>> *adev)
>>>>>>>>>>>>>>>>>>>>>>>>>>   {
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>>>>>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, 
>>>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work);
>>>>>>>>>>>>>>>>>>>>>>>>>>   }
>>>>>>>>>>>>>>>>>>>>>>>>>>     const struct amdgpu_virt_ops 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = {
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>
>>>>>
>>>
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-13 15:41                                                   ` Andrey Grodzovsky
@ 2022-05-16 14:12                                                     ` Andrey Grodzovsky
  2022-05-16 15:08                                                       ` Christian König
  0 siblings, 1 reply; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-16 14:12 UTC (permalink / raw)
  To: Christian König, Lazar, Lijo, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy

[-- Attachment #1: Type: text/plain, Size: 1655 bytes --]

Ping

Andrey

On 2022-05-13 11:41, Andrey Grodzovsky wrote:
>> Yes, exactly that's the idea.
>>
>> Basically the reset domain knowns which amdgpu devices it needs to 
>> reset together.
>>
>> If you then represent that so that you always have a hive even when 
>> you only have one device in it, or if you put an array of devices 
>> which needs to be reset together into the reset domain doesn't matter.
>>
>> Maybe go for the later approach, that is probably a bit cleaner and 
>> less code to change.
>>
>> Christian.
>
>
> Unfortunately this approach raises also a few  difficulties -
> First - if holding array of devices in reset_domain then when you come 
> to GPU reset function you don't really know which adev is the one 
> triggered the reset and this is actually essential to some procedures 
> like emergency restart.
>
> Second - in XGMI case we must take into account that one of the hive 
> members might go away in runtime (i could do echo 1 > 
> /sysfs/pci_id/remove on it for example at any moment) - so now we need 
> to maintain this array and mark such entry with NULL probably on XGMI 
> node removal , and then there might be hot insertion and all this adds 
> more complications.
>
> I now tend to prefer your initial solution for it's simplicity and the 
> result will be what we need -
>
> "E.g. in the reset code (either before or after the reset, that's 
> debatable) you do something like this:
>
> for (i = 0; i < num_ring; ++i)
> cancel_delayed_work(ring[i]->scheduler....)
> cancel_work(adev->ras_work);
> cancel_work(adev->iofault_work);
> cancel_work(adev->debugfs_work);
> "
>
> Let me know what you think.
>
> Andrey 

[-- Attachment #2: Type: text/html, Size: 2515 bytes --]

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-16 14:12                                                     ` Andrey Grodzovsky
@ 2022-05-16 15:08                                                       ` Christian König
  2022-05-16 15:13                                                         ` Andrey Grodzovsky
  0 siblings, 1 reply; 40+ messages in thread
From: Christian König @ 2022-05-16 15:08 UTC (permalink / raw)
  To: Andrey Grodzovsky, Lazar, Lijo, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy

[-- Attachment #1: Type: text/plain, Size: 1991 bytes --]

Am 16.05.22 um 16:12 schrieb Andrey Grodzovsky:
>
> Ping
>

Ah, yes sorry.

> Andrey
>
> On 2022-05-13 11:41, Andrey Grodzovsky wrote:
>>> Yes, exactly that's the idea.
>>>
>>> Basically the reset domain knowns which amdgpu devices it needs to 
>>> reset together.
>>>
>>> If you then represent that so that you always have a hive even when 
>>> you only have one device in it, or if you put an array of devices 
>>> which needs to be reset together into the reset domain doesn't matter.
>>>
>>> Maybe go for the later approach, that is probably a bit cleaner and 
>>> less code to change.
>>>
>>> Christian.
>>
>>
>> Unfortunately this approach raises also a few  difficulties -
>> First - if holding array of devices in reset_domain then when you 
>> come to GPU reset function you don't really know which adev is the 
>> one triggered the reset and this is actually essential to some 
>> procedures like emergency restart.

What is "emergency restart"? That's not some requirement I know about.

>>
>> Second - in XGMI case we must take into account that one of the hive 
>> members might go away in runtime (i could do echo 1 > 
>> /sysfs/pci_id/remove on it for example at any moment) - so now we 
>> need to maintain this array and mark such entry with NULL probably on 
>> XGMI node removal , and then there might be hot insertion and all 
>> this adds more complications.
>>
>> I now tend to prefer your initial solution for it's simplicity and 
>> the result will be what we need -
>>
>> "E.g. in the reset code (either before or after the reset, that's 
>> debatable) you do something like this:
>>
>> for (i = 0; i < num_ring; ++i)
>> cancel_delayed_work(ring[i]->scheduler....)
>> cancel_work(adev->ras_work);
>> cancel_work(adev->iofault_work);
>> cancel_work(adev->debugfs_work);
>> "

Works for me. I already expected that switching over the reset to be 
based on the reset context wouldn't be that easy.

Regards,
Christian.

>>
>> Let me know what you think.
>>
>> Andrey 

[-- Attachment #2: Type: text/html, Size: 3552 bytes --]

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive.
  2022-05-16 15:08                                                       ` Christian König
@ 2022-05-16 15:13                                                         ` Andrey Grodzovsky
  0 siblings, 0 replies; 40+ messages in thread
From: Andrey Grodzovsky @ 2022-05-16 15:13 UTC (permalink / raw)
  To: Christian König, Lazar, Lijo, Christian König,
	Felix Kuehling, amd-gfx
  Cc: Bai Zoy

[-- Attachment #1: Type: text/plain, Size: 2413 bytes --]


On 2022-05-16 11:08, Christian König wrote:
> Am 16.05.22 um 16:12 schrieb Andrey Grodzovsky:
>>
>> Ping
>>
>
> Ah, yes sorry.
>
>> Andrey
>>
>> On 2022-05-13 11:41, Andrey Grodzovsky wrote:
>>>> Yes, exactly that's the idea.
>>>>
>>>> Basically the reset domain knowns which amdgpu devices it needs to 
>>>> reset together.
>>>>
>>>> If you then represent that so that you always have a hive even when 
>>>> you only have one device in it, or if you put an array of devices 
>>>> which needs to be reset together into the reset domain doesn't matter.
>>>>
>>>> Maybe go for the later approach, that is probably a bit cleaner and 
>>>> less code to change.
>>>>
>>>> Christian.
>>>
>>>
>>> Unfortunately this approach raises also a few  difficulties -
>>> First - if holding array of devices in reset_domain then when you 
>>> come to GPU reset function you don't really know which adev is the 
>>> one triggered the reset and this is actually essential to some 
>>> procedures like emergency restart.
>
> What is "emergency restart"? That's not some requirement I know about.


Emergency restart is something you can see at the beginning of the 
amdgpu_gpu_recover function - it's a historical workaround for some 
types of ASICs that weren't able to do a full reset, I think. We should 
eventually remove it, but for now I think it's still in use.


>
>>>
>>> Second - in XGMI case we must take into account that one of the hive 
>>> members might go away in runtime (i could do echo 1 > 
>>> /sysfs/pci_id/remove on it for example at any moment) - so now we 
>>> need to maintain this array and mark such entry with NULL probably 
>>> on XGMI node removal , and then there might be hot insertion and all 
>>> this adds more complications.
>>>
>>> I now tend to prefer your initial solution for it's simplicity and 
>>> the result will be what we need -
>>>
>>> "E.g. in the reset code (either before or after the reset, that's 
>>> debatable) you do something like this:
>>>
>>> for (i = 0; i < num_ring; ++i)
>>> cancel_delayed_work(ring[i]->scheduler....)
>>> cancel_work(adev->ras_work);
>>> cancel_work(adev->iofault_work);
>>> cancel_work(adev->debugfs_work);
>>> "
>
> Works for me. I already expected that switching over the reset to be 
> based on the reset context wouldn't be that easy.
>
> Regards,
> Christian.


Ok - I will resend a patch.

Andrey


>
>>>
>>> Let me know what you think.
>>>
>>> Andrey 
>

[-- Attachment #2: Type: text/html, Size: 4584 bytes --]

^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2022-05-16 15:13 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-04 16:18 [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive Andrey Grodzovsky
2022-05-05 10:09 ` Christian König
2022-05-05 13:15   ` Andrey Grodzovsky
2022-05-05 13:23     ` Christian König
2022-05-05 13:54       ` Andrey Grodzovsky
2022-05-05 15:06         ` Christian König
2022-05-05 18:57           ` Andrey Grodzovsky
2022-05-05 19:49             ` Felix Kuehling
2022-05-05 21:47               ` Andrey Grodzovsky
2022-05-06  5:41                 ` Luben Tuikov
2022-05-06  6:02                 ` Lazar, Lijo
2022-05-06  8:56                   ` Christian König
2022-05-10 16:00                     ` Andrey Grodzovsky
2022-05-10 16:17                       ` Christian König
2022-05-10 17:01                         ` Andrey Grodzovsky
2022-05-10 17:19                           ` Christian König
2022-05-10 18:53                             ` Andrey Grodzovsky
2022-05-11  7:38                               ` Christian König
2022-05-11 13:43                                 ` Andrey Grodzovsky
2022-05-11 13:58                                   ` Christian König
2022-05-11 15:20                                     ` Lazar, Lijo
2022-05-11 15:35                                       ` Andrey Grodzovsky
2022-05-11 15:37                                         ` Lazar, Lijo
2022-05-11 15:43                                           ` Andrey Grodzovsky
2022-05-11 15:46                                             ` Lazar, Lijo
2022-05-11 15:53                                               ` Andrey Grodzovsky
2022-05-11 15:39                                         ` Christian König
2022-05-11 15:57                                           ` Andrey Grodzovsky
2022-05-12  6:03                                             ` Christian König
2022-05-12 12:57                                               ` Andrey Grodzovsky
2022-05-11 20:27                                           ` Andrey Grodzovsky
2022-05-12  6:06                                             ` Christian König
2022-05-12  9:21                                               ` Lazar, Lijo
2022-05-12 13:07                                               ` Andrey Grodzovsky
2022-05-12 13:15                                                 ` Christian König
2022-05-12 13:44                                                   ` Andrey Grodzovsky
2022-05-13 15:41                                                   ` Andrey Grodzovsky
2022-05-16 14:12                                                     ` Andrey Grodzovsky
2022-05-16 15:08                                                       ` Christian König
2022-05-16 15:13                                                         ` Andrey Grodzovsky

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).