* [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. @ 2022-05-04 16:18 Andrey Grodzovsky 2022-05-05 10:09 ` Christian König 0 siblings, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-04 16:18 UTC (permalink / raw) To: amd-gfx; +Cc: Bai Zoy, Andrey Grodzovsky, lijo.lazar, Christian.Koenig Problem: During hive reset caused by command timing out on a ring extra resets are triggered by KFD which is unable to access registers on the resetting ASIC. Fix: Rework GPU reset to use a list of pending reset jobs such that the first reset job that actually resets the entire reset domain will cancel all those pending redundant resets. This is in line with what we already do for redundant TDRs in scheduler code. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Tested-by: Bai Zoy <Zoy.Bai@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 +++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- 8 files changed, 104 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 4264abc5604d..99efd8317547 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -109,6 +109,7 @@ #include "amdgpu_fdinfo.h" #include "amdgpu_mca.h" #include "amdgpu_ras.h" +#include "amdgpu_reset.h" #define MAX_GPU_INSTANCE 16 @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { bool grbm_indexed; }; -enum amd_reset_method { - AMD_RESET_METHOD_NONE = -1, - AMD_RESET_METHOD_LEGACY = 0, - AMD_RESET_METHOD_MODE0, - AMD_RESET_METHOD_MODE1, - AMD_RESET_METHOD_MODE2, - AMD_RESET_METHOD_BACO, - AMD_RESET_METHOD_PCI, -}; - struct 
amdgpu_video_codec_info { u32 codec_type; u32 max_width; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e582f1044c0f..7fa82269c30f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, } tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); + + /* Drop all pending resets since we will reset now anyway */ + tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, + reset_list); + amdgpu_reset_pending_list(tmp_adev->reset_domain); + /* Actual ASIC resets if needed.*/ /* Host driver will handle XGMI hive reset for SRIOV */ if (amdgpu_sriov_vf(adev)) { @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, } struct amdgpu_recover_work_struct { - struct work_struct base; + struct amdgpu_reset_work_struct base; struct amdgpu_device *adev; struct amdgpu_job *job; int ret; @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work) { - struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base); + struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base.base.work); recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); } @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, { struct amdgpu_recover_work_struct work = {.adev = adev, .job = job}; - INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); + INIT_DELAYED_WORK(&work.base.base, amdgpu_device_queue_gpu_recover_work); + INIT_LIST_HEAD(&work.base.node); if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base)) return -EAGAIN; - flush_work(&work.base); + flush_delayed_work(&work.base.base); + + 
amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &work.base); return work.ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index c80af0889773..ffddd419c351 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -134,6 +134,9 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d atomic_set(&reset_domain->in_gpu_reset, 0); init_rwsem(&reset_domain->sem); + INIT_LIST_HEAD(&reset_domain->pending_works); + mutex_init(&reset_domain->reset_lock); + return reset_domain; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index 1949dbe28a86..863ec5720fc1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -24,7 +24,18 @@ #ifndef __AMDGPU_RESET_H__ #define __AMDGPU_RESET_H__ -#include "amdgpu.h" + +#include <linux/atomic.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/kref.h> +#include <linux/rwsem.h> +#include <linux/workqueue.h> + +struct amdgpu_device; +struct amdgpu_job; +struct amdgpu_hive_info; + enum AMDGPU_RESET_FLAGS { @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { AMDGPU_SKIP_HW_RESET = 1, }; + +enum amd_reset_method { + AMD_RESET_METHOD_NONE = -1, + AMD_RESET_METHOD_LEGACY = 0, + AMD_RESET_METHOD_MODE0, + AMD_RESET_METHOD_MODE1, + AMD_RESET_METHOD_MODE2, + AMD_RESET_METHOD_BACO, + AMD_RESET_METHOD_PCI, +}; + struct amdgpu_reset_context { enum amd_reset_method method; struct amdgpu_device *reset_req_dev; @@ -40,6 +62,8 @@ struct amdgpu_reset_context { unsigned long flags; }; +struct amdgpu_reset_control; + struct amdgpu_reset_handler { enum amd_reset_method reset_method; struct list_head handler_list; @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { XGMI_HIVE }; + +struct amdgpu_reset_work_struct { + struct delayed_work base; + struct list_head node; +}; + struct amdgpu_reset_domain { struct 
kref refcount; struct workqueue_struct *wq; enum amdgpu_reset_domain_type type; struct rw_semaphore sem; atomic_t in_gpu_reset; + + struct list_head pending_works; + struct mutex reset_lock; }; @@ -113,9 +146,43 @@ static inline void amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom } static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain, - struct work_struct *work) + struct amdgpu_reset_work_struct *work) { - return queue_work(domain->wq, work); + mutex_lock(&domain->reset_lock); + + if (!queue_delayed_work(domain->wq, &work->base, 0)) { + mutex_unlock(&domain->reset_lock); + return false; + } + + list_add_tail(&work->node, &domain->pending_works); + mutex_unlock(&domain->reset_lock); + + return true; +} + +static inline void amdgpu_reset_domain_del_pendning_work(struct amdgpu_reset_domain *domain, + struct amdgpu_reset_work_struct *work) +{ + mutex_lock(&domain->reset_lock); + list_del_init(&work->node); + mutex_unlock(&domain->reset_lock); +} + +static inline void amdgpu_reset_pending_list(struct amdgpu_reset_domain *domain) +{ + struct amdgpu_reset_work_struct *entry, *tmp; + + mutex_lock(&domain->reset_lock); + list_for_each_entry_safe(entry, tmp, &domain->pending_works, node) { + + list_del_init(&entry->node); + + /* Stop any other related pending resets */ + cancel_delayed_work(&entry->base); + } + + mutex_unlock(&domain->reset_lock); } void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index 239f232f9c02..574e870d3064 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -25,6 +25,7 @@ #define AMDGPU_VIRT_H #include "amdgv_sriovmsg.h" +#include "amdgpu_reset.h" #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS is sr-iov ready */ #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov is enabled on this GPU */ @@ -230,7 +231,7 @@ struct 
amdgpu_virt { uint32_t reg_val_offs; struct amdgpu_irq_src ack_irq; struct amdgpu_irq_src rcv_irq; - struct work_struct flr_work; + struct amdgpu_reset_work_struct flr_work; struct amdgpu_mm_table mm_table; const struct amdgpu_virt_ops *ops; struct amdgpu_vf_error_buffer vf_errors; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index b81acf59870c..f3d1c2be9292 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, static void xgpu_ai_mailbox_flr_work(struct work_struct *work) { - struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); + struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) return r; } - INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); + INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_ai_mailbox_flr_work); + INIT_LIST_HEAD(&adev->virt.flr_work.node); return 0; } @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) { amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); + + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work); } static int xgpu_ai_request_init_data(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 22c10b97ea81..927b3d5bb1d0 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, static void xgpu_nv_mailbox_flr_work(struct work_struct *work) { - struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); + 
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) return r; } - INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); + INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_nv_mailbox_flr_work); + INIT_LIST_HEAD(&adev->virt.flr_work.node); return 0; } @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) { amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); + + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work); } const struct amdgpu_virt_ops xgpu_nv_virt_ops = { diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index 7b63d30b9b79..1d4ef5c70730 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, static void xgpu_vi_mailbox_flr_work(struct work_struct *work) { - struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); + struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); /* wait until RCV_MSG become 3 */ @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) return r; } - INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work); + INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_vi_mailbox_flr_work); + INIT_LIST_HEAD(&adev->virt.flr_work.node); return 0; } @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) { amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); + + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work); } const struct 
amdgpu_virt_ops xgpu_vi_virt_ops = { -- 2.25.1 ^ permalink raw reply related [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-04 16:18 [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive Andrey Grodzovsky @ 2022-05-05 10:09 ` Christian König 2022-05-05 13:15 ` Andrey Grodzovsky 0 siblings, 1 reply; 40+ messages in thread From: Christian König @ 2022-05-05 10:09 UTC (permalink / raw) To: Andrey Grodzovsky, amd-gfx; +Cc: Bai Zoy, lijo.lazar Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky: > Problem: > During hive reset caused by command timing out on a ring > extra resets are generated by triggered by KFD which is > unable to accesses registers on the resetting ASIC. > > Fix: Rework GPU reset to use a list of pending reset jobs > such that the first reset jobs that actaully resets the entire > reset domain will cancel all those pending redundant resets. > > This is in line with what we already do for redundant TDRs > in scheduler code. Mhm, why exactly do you need the extra linked list then? Let's talk about that on our call today. Regards, Christian. 
> > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> > Tested-by: Bai Zoy <Zoy.Bai@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 +++++++++++++++++++++- > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- > drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- > 8 files changed, 104 insertions(+), 24 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 4264abc5604d..99efd8317547 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -109,6 +109,7 @@ > #include "amdgpu_fdinfo.h" > #include "amdgpu_mca.h" > #include "amdgpu_ras.h" > +#include "amdgpu_reset.h" > > #define MAX_GPU_INSTANCE 16 > > @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { > bool grbm_indexed; > }; > > -enum amd_reset_method { > - AMD_RESET_METHOD_NONE = -1, > - AMD_RESET_METHOD_LEGACY = 0, > - AMD_RESET_METHOD_MODE0, > - AMD_RESET_METHOD_MODE1, > - AMD_RESET_METHOD_MODE2, > - AMD_RESET_METHOD_BACO, > - AMD_RESET_METHOD_PCI, > -}; > - > struct amdgpu_video_codec_info { > u32 codec_type; > u32 max_width; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index e582f1044c0f..7fa82269c30f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, > } > > tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); > + > + /* Drop all pending resets since we will reset now anyway */ > + tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, > + reset_list); > + amdgpu_reset_pending_list(tmp_adev->reset_domain); > + > 
/* Actual ASIC resets if needed.*/ > /* Host driver will handle XGMI hive reset for SRIOV */ > if (amdgpu_sriov_vf(adev)) { > @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, > } > > struct amdgpu_recover_work_struct { > - struct work_struct base; > + struct amdgpu_reset_work_struct base; > struct amdgpu_device *adev; > struct amdgpu_job *job; > int ret; > @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { > > static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work) > { > - struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base); > + struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base.base.work); > > recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); > } > @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > { > struct amdgpu_recover_work_struct work = {.adev = adev, .job = job}; > > - INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); > + INIT_DELAYED_WORK(&work.base.base, amdgpu_device_queue_gpu_recover_work); > + INIT_LIST_HEAD(&work.base.node); > > if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base)) > return -EAGAIN; > > - flush_work(&work.base); > + flush_delayed_work(&work.base.base); > + > + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &work.base); > > return work.ret; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > index c80af0889773..ffddd419c351 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > @@ -134,6 +134,9 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d > atomic_set(&reset_domain->in_gpu_reset, 0); > init_rwsem(&reset_domain->sem); > > + INIT_LIST_HEAD(&reset_domain->pending_works); > + mutex_init(&reset_domain->reset_lock); 
> + > return reset_domain; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > index 1949dbe28a86..863ec5720fc1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > @@ -24,7 +24,18 @@ > #ifndef __AMDGPU_RESET_H__ > #define __AMDGPU_RESET_H__ > > -#include "amdgpu.h" > + > +#include <linux/atomic.h> > +#include <linux/mutex.h> > +#include <linux/list.h> > +#include <linux/kref.h> > +#include <linux/rwsem.h> > +#include <linux/workqueue.h> > + > +struct amdgpu_device; > +struct amdgpu_job; > +struct amdgpu_hive_info; > + > > enum AMDGPU_RESET_FLAGS { > > @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { > AMDGPU_SKIP_HW_RESET = 1, > }; > > + > +enum amd_reset_method { > + AMD_RESET_METHOD_NONE = -1, > + AMD_RESET_METHOD_LEGACY = 0, > + AMD_RESET_METHOD_MODE0, > + AMD_RESET_METHOD_MODE1, > + AMD_RESET_METHOD_MODE2, > + AMD_RESET_METHOD_BACO, > + AMD_RESET_METHOD_PCI, > +}; > + > struct amdgpu_reset_context { > enum amd_reset_method method; > struct amdgpu_device *reset_req_dev; > @@ -40,6 +62,8 @@ struct amdgpu_reset_context { > unsigned long flags; > }; > > +struct amdgpu_reset_control; > + > struct amdgpu_reset_handler { > enum amd_reset_method reset_method; > struct list_head handler_list; > @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { > XGMI_HIVE > }; > > + > +struct amdgpu_reset_work_struct { > + struct delayed_work base; > + struct list_head node; > +}; > + > struct amdgpu_reset_domain { > struct kref refcount; > struct workqueue_struct *wq; > enum amdgpu_reset_domain_type type; > struct rw_semaphore sem; > atomic_t in_gpu_reset; > + > + struct list_head pending_works; > + struct mutex reset_lock; > }; > > > @@ -113,9 +146,43 @@ static inline void amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom > } > > static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain, > - struct work_struct *work) > + struct 
amdgpu_reset_work_struct *work) > { > - return queue_work(domain->wq, work); > + mutex_lock(&domain->reset_lock); > + > + if (!queue_delayed_work(domain->wq, &work->base, 0)) { > + mutex_unlock(&domain->reset_lock); > + return false; > + } > + > + list_add_tail(&work->node, &domain->pending_works); > + mutex_unlock(&domain->reset_lock); > + > + return true; > +} > + > +static inline void amdgpu_reset_domain_del_pendning_work(struct amdgpu_reset_domain *domain, > + struct amdgpu_reset_work_struct *work) > +{ > + mutex_lock(&domain->reset_lock); > + list_del_init(&work->node); > + mutex_unlock(&domain->reset_lock); > +} > + > +static inline void amdgpu_reset_pending_list(struct amdgpu_reset_domain *domain) > +{ > + struct amdgpu_reset_work_struct *entry, *tmp; > + > + mutex_lock(&domain->reset_lock); > + list_for_each_entry_safe(entry, tmp, &domain->pending_works, node) { > + > + list_del_init(&entry->node); > + > + /* Stop any other related pending resets */ > + cancel_delayed_work(&entry->base); > + } > + > + mutex_unlock(&domain->reset_lock); > } > > void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > index 239f232f9c02..574e870d3064 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > @@ -25,6 +25,7 @@ > #define AMDGPU_VIRT_H > > #include "amdgv_sriovmsg.h" > +#include "amdgpu_reset.h" > > #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS is sr-iov ready */ > #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov is enabled on this GPU */ > @@ -230,7 +231,7 @@ struct amdgpu_virt { > uint32_t reg_val_offs; > struct amdgpu_irq_src ack_irq; > struct amdgpu_irq_src rcv_irq; > - struct work_struct flr_work; > + struct amdgpu_reset_work_struct flr_work; > struct amdgpu_mm_table mm_table; > const struct amdgpu_virt_ops *ops; > struct amdgpu_vf_error_buffer vf_errors; > diff --git 
a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > index b81acf59870c..f3d1c2be9292 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > @@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, > > static void xgpu_ai_mailbox_flr_work(struct work_struct *work) > { > - struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); > + struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work); > struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); > int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; > > @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) > return r; > } > > - INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); > + INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_ai_mailbox_flr_work); > + INIT_LIST_HEAD(&adev->virt.flr_work.node); > > return 0; > } > @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) > { > amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); > amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); > + > + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work); > } > > static int xgpu_ai_request_init_data(struct amdgpu_device *adev) > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > index 22c10b97ea81..927b3d5bb1d0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > @@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, > > static void xgpu_nv_mailbox_flr_work(struct work_struct *work) > { > - struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); > + struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work); > struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); > int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; > 
> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) > return r; > } > > - INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); > + INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_nv_mailbox_flr_work); > + INIT_LIST_HEAD(&adev->virt.flr_work.node); > > return 0; > } > @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) > { > amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); > amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); > + > + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work); > } > > const struct amdgpu_virt_ops xgpu_nv_virt_ops = { > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c > index 7b63d30b9b79..1d4ef5c70730 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c > @@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, > > static void xgpu_vi_mailbox_flr_work(struct work_struct *work) > { > - struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); > + struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work); > struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); > > /* wait until RCV_MSG become 3 */ > @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) > return r; > } > > - INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work); > + INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_vi_mailbox_flr_work); > + INIT_LIST_HEAD(&adev->virt.flr_work.node); > > return 0; > } > @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) > { > amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); > amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); > + > + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work); > } > > const struct amdgpu_virt_ops xgpu_vi_virt_ops = { ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-05 10:09 ` Christian König @ 2022-05-05 13:15 ` Andrey Grodzovsky 2022-05-05 13:23 ` Christian König 0 siblings, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-05 13:15 UTC (permalink / raw) To: Christian König, amd-gfx; +Cc: Bai Zoy, lijo.lazar On 2022-05-05 06:09, Christian König wrote: > Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky: >> Problem: >> During hive reset caused by command timing out on a ring >> extra resets are generated by triggered by KFD which is >> unable to accesses registers on the resetting ASIC. >> >> Fix: Rework GPU reset to use a list of pending reset jobs >> such that the first reset jobs that actaully resets the entire >> reset domain will cancel all those pending redundant resets. >> >> This is in line with what we already do for redundant TDRs >> in scheduler code. > > Mhm, why exactly do you need the extra linked list then? > > Let's talk about that on our call today. Going to miss it as you know, and also this is the place to discuss technical questions anyway so - It's needed because those other resets are not time out handlers that are governed by the scheduler but rather external resets that are triggered by such clients as KFD, RAS and sysfs. Scheduler has no knowledge of them (and should not have) but they are serialized into same wq as the TO handlers from the scheduler. It just happens that TO triggered reset causes in turn another reset (from KFD in this case) and we want to prevent this second reset from taking place just as we want to avoid multiple TO resets to take place in scheduler code. Andrey > > Regards, > Christian. 
> >> >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 +++++++++++++++++++++- >> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >> 8 files changed, 104 insertions(+), 24 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index 4264abc5604d..99efd8317547 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -109,6 +109,7 @@ >> #include "amdgpu_fdinfo.h" >> #include "amdgpu_mca.h" >> #include "amdgpu_ras.h" >> +#include "amdgpu_reset.h" >> #define MAX_GPU_INSTANCE 16 >> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { >> bool grbm_indexed; >> }; >> -enum amd_reset_method { >> - AMD_RESET_METHOD_NONE = -1, >> - AMD_RESET_METHOD_LEGACY = 0, >> - AMD_RESET_METHOD_MODE0, >> - AMD_RESET_METHOD_MODE1, >> - AMD_RESET_METHOD_MODE2, >> - AMD_RESET_METHOD_BACO, >> - AMD_RESET_METHOD_PCI, >> -}; >> - >> struct amdgpu_video_codec_info { >> u32 codec_type; >> u32 max_width; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index e582f1044c0f..7fa82269c30f 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct >> amdgpu_device *adev, >> } >> tmp_vram_lost_counter = >> atomic_read(&((adev)->vram_lost_counter)); >> + >> + /* Drop all pending resets since we will reset now anyway */ >> + tmp_adev = list_first_entry(device_list_handle, struct >> amdgpu_device, >> + reset_list); 
>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >> + >> /* Actual ASIC resets if needed.*/ >> /* Host driver will handle XGMI hive reset for SRIOV */ >> if (amdgpu_sriov_vf(adev)) { >> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct >> amdgpu_device *adev, >> } >> struct amdgpu_recover_work_struct { >> - struct work_struct base; >> + struct amdgpu_reset_work_struct base; >> struct amdgpu_device *adev; >> struct amdgpu_job *job; >> int ret; >> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >> static void amdgpu_device_queue_gpu_recover_work(struct >> work_struct *work) >> { >> - struct amdgpu_recover_work_struct *recover_work = >> container_of(work, struct amdgpu_recover_work_struct, base); >> + struct amdgpu_recover_work_struct *recover_work = >> container_of(work, struct amdgpu_recover_work_struct, base.base.work); >> recover_work->ret = >> amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); >> } >> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct >> amdgpu_device *adev, >> { >> struct amdgpu_recover_work_struct work = {.adev = adev, .job = >> job}; >> - INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); >> + INIT_DELAYED_WORK(&work.base.base, >> amdgpu_device_queue_gpu_recover_work); >> + INIT_LIST_HEAD(&work.base.node); >> if (!amdgpu_reset_domain_schedule(adev->reset_domain, >> &work.base)) >> return -EAGAIN; >> - flush_work(&work.base); >> + flush_delayed_work(&work.base.base); >> + >> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &work.base); >> return work.ret; >> } >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >> index c80af0889773..ffddd419c351 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >> atomic_set(&reset_domain->in_gpu_reset, 0); >> 
init_rwsem(&reset_domain->sem); >> + INIT_LIST_HEAD(&reset_domain->pending_works); >> + mutex_init(&reset_domain->reset_lock); >> + >> return reset_domain; >> } >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >> index 1949dbe28a86..863ec5720fc1 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >> @@ -24,7 +24,18 @@ >> #ifndef __AMDGPU_RESET_H__ >> #define __AMDGPU_RESET_H__ >> -#include "amdgpu.h" >> + >> +#include <linux/atomic.h> >> +#include <linux/mutex.h> >> +#include <linux/list.h> >> +#include <linux/kref.h> >> +#include <linux/rwsem.h> >> +#include <linux/workqueue.h> >> + >> +struct amdgpu_device; >> +struct amdgpu_job; >> +struct amdgpu_hive_info; >> + >> enum AMDGPU_RESET_FLAGS { >> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >> AMDGPU_SKIP_HW_RESET = 1, >> }; >> + >> +enum amd_reset_method { >> + AMD_RESET_METHOD_NONE = -1, >> + AMD_RESET_METHOD_LEGACY = 0, >> + AMD_RESET_METHOD_MODE0, >> + AMD_RESET_METHOD_MODE1, >> + AMD_RESET_METHOD_MODE2, >> + AMD_RESET_METHOD_BACO, >> + AMD_RESET_METHOD_PCI, >> +}; >> + >> struct amdgpu_reset_context { >> enum amd_reset_method method; >> struct amdgpu_device *reset_req_dev; >> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >> unsigned long flags; >> }; >> +struct amdgpu_reset_control; >> + >> struct amdgpu_reset_handler { >> enum amd_reset_method reset_method; >> struct list_head handler_list; >> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >> XGMI_HIVE >> }; >> + >> +struct amdgpu_reset_work_struct { >> + struct delayed_work base; >> + struct list_head node; >> +}; >> + >> struct amdgpu_reset_domain { >> struct kref refcount; >> struct workqueue_struct *wq; >> enum amdgpu_reset_domain_type type; >> struct rw_semaphore sem; >> atomic_t in_gpu_reset; >> + >> + struct list_head pending_works; >> + struct mutex reset_lock; >> }; >> @@ -113,9 +146,43 @@ static inline void >> 
amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom >> } >> static inline bool amdgpu_reset_domain_schedule(struct >> amdgpu_reset_domain *domain, >> - struct work_struct *work) >> + struct amdgpu_reset_work_struct *work) >> { >> - return queue_work(domain->wq, work); >> + mutex_lock(&domain->reset_lock); >> + >> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >> + mutex_unlock(&domain->reset_lock); >> + return false; >> + } >> + >> + list_add_tail(&work->node, &domain->pending_works); >> + mutex_unlock(&domain->reset_lock); >> + >> + return true; >> +} >> + >> +static inline void amdgpu_reset_domain_del_pendning_work(struct >> amdgpu_reset_domain *domain, >> + struct amdgpu_reset_work_struct *work) >> +{ >> + mutex_lock(&domain->reset_lock); >> + list_del_init(&work->node); >> + mutex_unlock(&domain->reset_lock); >> +} >> + >> +static inline void amdgpu_reset_pending_list(struct >> amdgpu_reset_domain *domain) >> +{ >> + struct amdgpu_reset_work_struct *entry, *tmp; >> + >> + mutex_lock(&domain->reset_lock); >> + list_for_each_entry_safe(entry, tmp, &domain->pending_works, >> node) { >> + >> + list_del_init(&entry->node); >> + >> + /* Stop any other related pending resets */ >> + cancel_delayed_work(&entry->base); >> + } >> + >> + mutex_unlock(&domain->reset_lock); >> } >> void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain >> *reset_domain); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >> index 239f232f9c02..574e870d3064 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >> @@ -25,6 +25,7 @@ >> #define AMDGPU_VIRT_H >> #include "amdgv_sriovmsg.h" >> +#include "amdgpu_reset.h" >> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS is >> sr-iov ready */ >> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov is >> enabled on this GPU */ >> @@ -230,7 +231,7 @@ struct amdgpu_virt { >> uint32_t reg_val_offs; >> struct 
amdgpu_irq_src ack_irq; >> struct amdgpu_irq_src rcv_irq; >> - struct work_struct flr_work; >> + struct amdgpu_reset_work_struct flr_work; >> struct amdgpu_mm_table mm_table; >> const struct amdgpu_virt_ops *ops; >> struct amdgpu_vf_error_buffer vf_errors; >> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >> index b81acf59870c..f3d1c2be9292 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >> @@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct >> amdgpu_device *adev, >> static void xgpu_ai_mailbox_flr_work(struct work_struct *work) >> { >> - struct amdgpu_virt *virt = container_of(work, struct >> amdgpu_virt, flr_work); >> + struct amdgpu_virt *virt = container_of(work, struct >> amdgpu_virt, flr_work.base.work); >> struct amdgpu_device *adev = container_of(virt, struct >> amdgpu_device, virt); >> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >> amdgpu_device *adev) >> return r; >> } >> - INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); >> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >> xgpu_ai_mailbox_flr_work); >> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >> return 0; >> } >> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct amdgpu_device >> *adev) >> { >> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >> + >> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >> &adev->virt.flr_work); >> } >> static int xgpu_ai_request_init_data(struct amdgpu_device *adev) >> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >> index 22c10b97ea81..927b3d5bb1d0 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >> @@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct >> amdgpu_device *adev, >> static void 
xgpu_nv_mailbox_flr_work(struct work_struct *work) >> { >> - struct amdgpu_virt *virt = container_of(work, struct >> amdgpu_virt, flr_work); >> + struct amdgpu_virt *virt = container_of(work, struct >> amdgpu_virt, flr_work.base.work); >> struct amdgpu_device *adev = container_of(virt, struct >> amdgpu_device, virt); >> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >> amdgpu_device *adev) >> return r; >> } >> - INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); >> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >> xgpu_nv_mailbox_flr_work); >> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >> return 0; >> } >> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device >> *adev) >> { >> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >> + >> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >> &adev->virt.flr_work); >> } >> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >> index 7b63d30b9b79..1d4ef5c70730 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >> @@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct >> amdgpu_device *adev, >> static void xgpu_vi_mailbox_flr_work(struct work_struct *work) >> { >> - struct amdgpu_virt *virt = container_of(work, struct >> amdgpu_virt, flr_work); >> + struct amdgpu_virt *virt = container_of(work, struct >> amdgpu_virt, flr_work.base.work); >> struct amdgpu_device *adev = container_of(virt, struct >> amdgpu_device, virt); >> /* wait until RCV_MSG become 3 */ >> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct amdgpu_device >> *adev) >> return r; >> } >> - INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work); >> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >> xgpu_vi_mailbox_flr_work); >> + 
INIT_LIST_HEAD(&adev->virt.flr_work.node); >> return 0; >> } >> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct amdgpu_device >> *adev) >> { >> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >> + >> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >> &adev->virt.flr_work); >> } >> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-05 13:15 ` Andrey Grodzovsky @ 2022-05-05 13:23 ` Christian König 2022-05-05 13:54 ` Andrey Grodzovsky 0 siblings, 1 reply; 40+ messages in thread From: Christian König @ 2022-05-05 13:23 UTC (permalink / raw) To: Andrey Grodzovsky, amd-gfx; +Cc: Bai Zoy, lijo.lazar Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky: > On 2022-05-05 06:09, Christian König wrote: > >> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky: >>> Problem: >>> During hive reset caused by command timing out on a ring >>> extra resets are generated by triggered by KFD which is >>> unable to accesses registers on the resetting ASIC. >>> >>> Fix: Rework GPU reset to use a list of pending reset jobs >>> such that the first reset jobs that actaully resets the entire >>> reset domain will cancel all those pending redundant resets. >>> >>> This is in line with what we already do for redundant TDRs >>> in scheduler code. >> >> Mhm, why exactly do you need the extra linked list then? >> >> Let's talk about that on our call today. > > > Going to miss it as you know, and also this is the place to discuss > technical questions anyway so - Good point. > It's needed because those other resets are not time out handlers that > are governed by the scheduler > but rather external resets that are triggered by such clients as KFD, > RAS and sysfs. Scheduler has no > knowledge of them (and should not have) but they are serialized into > same wq as the TO handlers > from the scheduler. It just happens that TO triggered reset causes in > turn another reset (from KFD in > this case) and we want to prevent this second reset from taking place > just as we want to avoid multiple > TO resets to take place in scheduler code. Yeah, but why do you need multiple workers? You have a single worker for the GPU reset not triggered by the scheduler in you adev and cancel that at the end of the reset procedure. 
If anybody things it needs to trigger another reset while in reset (which is actually a small design bug separately) the reset will just be canceled in the same way we cancel the scheduler resets. Christian. > > Andrey > > >> >> Regards, >> Christian. >> >>> >>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>> +++++++++++++++++++++- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>> 8 files changed, 104 insertions(+), 24 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> index 4264abc5604d..99efd8317547 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> @@ -109,6 +109,7 @@ >>> #include "amdgpu_fdinfo.h" >>> #include "amdgpu_mca.h" >>> #include "amdgpu_ras.h" >>> +#include "amdgpu_reset.h" >>> #define MAX_GPU_INSTANCE 16 >>> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { >>> bool grbm_indexed; >>> }; >>> -enum amd_reset_method { >>> - AMD_RESET_METHOD_NONE = -1, >>> - AMD_RESET_METHOD_LEGACY = 0, >>> - AMD_RESET_METHOD_MODE0, >>> - AMD_RESET_METHOD_MODE1, >>> - AMD_RESET_METHOD_MODE2, >>> - AMD_RESET_METHOD_BACO, >>> - AMD_RESET_METHOD_PCI, >>> -}; >>> - >>> struct amdgpu_video_codec_info { >>> u32 codec_type; >>> u32 max_width; >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> index e582f1044c0f..7fa82269c30f 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> @@ -5201,6 +5201,12 @@ int 
amdgpu_device_gpu_recover_imp(struct >>> amdgpu_device *adev, >>> } >>> tmp_vram_lost_counter = >>> atomic_read(&((adev)->vram_lost_counter)); >>> + >>> + /* Drop all pending resets since we will reset now anyway */ >>> + tmp_adev = list_first_entry(device_list_handle, struct >>> amdgpu_device, >>> + reset_list); >>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>> + >>> /* Actual ASIC resets if needed.*/ >>> /* Host driver will handle XGMI hive reset for SRIOV */ >>> if (amdgpu_sriov_vf(adev)) { >>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct >>> amdgpu_device *adev, >>> } >>> struct amdgpu_recover_work_struct { >>> - struct work_struct base; >>> + struct amdgpu_reset_work_struct base; >>> struct amdgpu_device *adev; >>> struct amdgpu_job *job; >>> int ret; >>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>> static void amdgpu_device_queue_gpu_recover_work(struct >>> work_struct *work) >>> { >>> - struct amdgpu_recover_work_struct *recover_work = >>> container_of(work, struct amdgpu_recover_work_struct, base); >>> + struct amdgpu_recover_work_struct *recover_work = >>> container_of(work, struct amdgpu_recover_work_struct, base.base.work); >>> recover_work->ret = >>> amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); >>> } >>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct >>> amdgpu_device *adev, >>> { >>> struct amdgpu_recover_work_struct work = {.adev = adev, .job = >>> job}; >>> - INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); >>> + INIT_DELAYED_WORK(&work.base.base, >>> amdgpu_device_queue_gpu_recover_work); >>> + INIT_LIST_HEAD(&work.base.node); >>> if (!amdgpu_reset_domain_schedule(adev->reset_domain, >>> &work.base)) >>> return -EAGAIN; >>> - flush_work(&work.base); >>> + flush_delayed_work(&work.base.base); >>> + >>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>> &work.base); >>> return work.ret; >>> } >>> diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>> index c80af0889773..ffddd419c351 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>> atomic_set(&reset_domain->in_gpu_reset, 0); >>> init_rwsem(&reset_domain->sem); >>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>> + mutex_init(&reset_domain->reset_lock); >>> + >>> return reset_domain; >>> } >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>> index 1949dbe28a86..863ec5720fc1 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>> @@ -24,7 +24,18 @@ >>> #ifndef __AMDGPU_RESET_H__ >>> #define __AMDGPU_RESET_H__ >>> -#include "amdgpu.h" >>> + >>> +#include <linux/atomic.h> >>> +#include <linux/mutex.h> >>> +#include <linux/list.h> >>> +#include <linux/kref.h> >>> +#include <linux/rwsem.h> >>> +#include <linux/workqueue.h> >>> + >>> +struct amdgpu_device; >>> +struct amdgpu_job; >>> +struct amdgpu_hive_info; >>> + >>> enum AMDGPU_RESET_FLAGS { >>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>> AMDGPU_SKIP_HW_RESET = 1, >>> }; >>> + >>> +enum amd_reset_method { >>> + AMD_RESET_METHOD_NONE = -1, >>> + AMD_RESET_METHOD_LEGACY = 0, >>> + AMD_RESET_METHOD_MODE0, >>> + AMD_RESET_METHOD_MODE1, >>> + AMD_RESET_METHOD_MODE2, >>> + AMD_RESET_METHOD_BACO, >>> + AMD_RESET_METHOD_PCI, >>> +}; >>> + >>> struct amdgpu_reset_context { >>> enum amd_reset_method method; >>> struct amdgpu_device *reset_req_dev; >>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>> unsigned long flags; >>> }; >>> +struct amdgpu_reset_control; >>> + >>> struct amdgpu_reset_handler { >>> enum amd_reset_method reset_method; >>> struct list_head handler_list; >>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>> 
XGMI_HIVE >>> }; >>> + >>> +struct amdgpu_reset_work_struct { >>> + struct delayed_work base; >>> + struct list_head node; >>> +}; >>> + >>> struct amdgpu_reset_domain { >>> struct kref refcount; >>> struct workqueue_struct *wq; >>> enum amdgpu_reset_domain_type type; >>> struct rw_semaphore sem; >>> atomic_t in_gpu_reset; >>> + >>> + struct list_head pending_works; >>> + struct mutex reset_lock; >>> }; >>> @@ -113,9 +146,43 @@ static inline void >>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom >>> } >>> static inline bool amdgpu_reset_domain_schedule(struct >>> amdgpu_reset_domain *domain, >>> - struct work_struct *work) >>> + struct amdgpu_reset_work_struct *work) >>> { >>> - return queue_work(domain->wq, work); >>> + mutex_lock(&domain->reset_lock); >>> + >>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>> + mutex_unlock(&domain->reset_lock); >>> + return false; >>> + } >>> + >>> + list_add_tail(&work->node, &domain->pending_works); >>> + mutex_unlock(&domain->reset_lock); >>> + >>> + return true; >>> +} >>> + >>> +static inline void amdgpu_reset_domain_del_pendning_work(struct >>> amdgpu_reset_domain *domain, >>> + struct amdgpu_reset_work_struct *work) >>> +{ >>> + mutex_lock(&domain->reset_lock); >>> + list_del_init(&work->node); >>> + mutex_unlock(&domain->reset_lock); >>> +} >>> + >>> +static inline void amdgpu_reset_pending_list(struct >>> amdgpu_reset_domain *domain) >>> +{ >>> + struct amdgpu_reset_work_struct *entry, *tmp; >>> + >>> + mutex_lock(&domain->reset_lock); >>> + list_for_each_entry_safe(entry, tmp, &domain->pending_works, >>> node) { >>> + >>> + list_del_init(&entry->node); >>> + >>> + /* Stop any other related pending resets */ >>> + cancel_delayed_work(&entry->base); >>> + } >>> + >>> + mutex_unlock(&domain->reset_lock); >>> } >>> void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain >>> *reset_domain); >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>> 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>> index 239f232f9c02..574e870d3064 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>> @@ -25,6 +25,7 @@ >>> #define AMDGPU_VIRT_H >>> #include "amdgv_sriovmsg.h" >>> +#include "amdgpu_reset.h" >>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS is >>> sr-iov ready */ >>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov is >>> enabled on this GPU */ >>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>> uint32_t reg_val_offs; >>> struct amdgpu_irq_src ack_irq; >>> struct amdgpu_irq_src rcv_irq; >>> - struct work_struct flr_work; >>> + struct amdgpu_reset_work_struct flr_work; >>> struct amdgpu_mm_table mm_table; >>> const struct amdgpu_virt_ops *ops; >>> struct amdgpu_vf_error_buffer vf_errors; >>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>> index b81acf59870c..f3d1c2be9292 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>> @@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct >>> amdgpu_device *adev, >>> static void xgpu_ai_mailbox_flr_work(struct work_struct *work) >>> { >>> - struct amdgpu_virt *virt = container_of(work, struct >>> amdgpu_virt, flr_work); >>> + struct amdgpu_virt *virt = container_of(work, struct >>> amdgpu_virt, flr_work.base.work); >>> struct amdgpu_device *adev = container_of(virt, struct >>> amdgpu_device, virt); >>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >>> amdgpu_device *adev) >>> return r; >>> } >>> - INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); >>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>> xgpu_ai_mailbox_flr_work); >>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>> return 0; >>> } >>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>> amdgpu_device *adev) >>> { >>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 
0); >>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>> + >>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>> &adev->virt.flr_work); >>> } >>> static int xgpu_ai_request_init_data(struct amdgpu_device *adev) >>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>> index 22c10b97ea81..927b3d5bb1d0 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>> @@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct >>> amdgpu_device *adev, >>> static void xgpu_nv_mailbox_flr_work(struct work_struct *work) >>> { >>> - struct amdgpu_virt *virt = container_of(work, struct >>> amdgpu_virt, flr_work); >>> + struct amdgpu_virt *virt = container_of(work, struct >>> amdgpu_virt, flr_work.base.work); >>> struct amdgpu_device *adev = container_of(virt, struct >>> amdgpu_device, virt); >>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>> amdgpu_device *adev) >>> return r; >>> } >>> - INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); >>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>> xgpu_nv_mailbox_flr_work); >>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>> return 0; >>> } >>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>> amdgpu_device *adev) >>> { >>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>> + >>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>> &adev->virt.flr_work); >>> } >>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>> index 7b63d30b9b79..1d4ef5c70730 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>> @@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct >>> amdgpu_device *adev, >>> static void xgpu_vi_mailbox_flr_work(struct 
work_struct *work) >>> { >>> - struct amdgpu_virt *virt = container_of(work, struct >>> amdgpu_virt, flr_work); >>> + struct amdgpu_virt *virt = container_of(work, struct >>> amdgpu_virt, flr_work.base.work); >>> struct amdgpu_device *adev = container_of(virt, struct >>> amdgpu_device, virt); >>> /* wait until RCV_MSG become 3 */ >>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct amdgpu_device >>> *adev) >>> return r; >>> } >>> - INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work); >>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>> xgpu_vi_mailbox_flr_work); >>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>> return 0; >>> } >>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>> amdgpu_device *adev) >>> { >>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>> + >>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>> &adev->virt.flr_work); >>> } >>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-05 13:23 ` Christian König @ 2022-05-05 13:54 ` Andrey Grodzovsky 2022-05-05 15:06 ` Christian König 0 siblings, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-05 13:54 UTC (permalink / raw) To: Christian König, amd-gfx; +Cc: Bai Zoy, lijo.lazar On 2022-05-05 09:23, Christian König wrote: > Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky: >> On 2022-05-05 06:09, Christian König wrote: >> >>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky: >>>> Problem: >>>> During hive reset caused by command timing out on a ring >>>> extra resets are generated by triggered by KFD which is >>>> unable to accesses registers on the resetting ASIC. >>>> >>>> Fix: Rework GPU reset to use a list of pending reset jobs >>>> such that the first reset jobs that actaully resets the entire >>>> reset domain will cancel all those pending redundant resets. >>>> >>>> This is in line with what we already do for redundant TDRs >>>> in scheduler code. >>> >>> Mhm, why exactly do you need the extra linked list then? >>> >>> Let's talk about that on our call today. >> >> >> Going to miss it as you know, and also this is the place to discuss >> technical questions anyway so - > > Good point. > >> It's needed because those other resets are not time out handlers that >> are governed by the scheduler >> but rather external resets that are triggered by such clients as KFD, >> RAS and sysfs. Scheduler has no >> knowledge of them (and should not have) but they are serialized into >> same wq as the TO handlers >> from the scheduler. It just happens that TO triggered reset causes in >> turn another reset (from KFD in >> this case) and we want to prevent this second reset from taking place >> just as we want to avoid multiple >> TO resets to take place in scheduler code. > > Yeah, but why do you need multiple workers? 
> > You have a single worker for the GPU reset not triggered by the > scheduler in you adev and cancel that at the end of the reset procedure. > > If anybody things it needs to trigger another reset while in reset > (which is actually a small design bug separately) the reset will just > be canceled in the same way we cancel the scheduler resets. > > Christian. Had this in mind at first but then I realized that each client (RAS, KFD and sysfs) will want to fill his own data for the work (see amdgpu_device_gpu_recover) - for XGMI hive each will want to set his own adev (which is fine if you set a work per adev as you suggest) but also each client might want (they all put NULL there but in theory in the future) also set his own bad job value and here you might have a collision. Also in general seems to me it's cleaner approach where this logic (the work items) are held and handled in reset_domain and are not split in each adev or any other entity. We might want in the future to even move the scheduler handling into reset domain since reset domain is supposed to be a generic things and not only or AMD. Andrey > >> >> Andrey >> >> >>> >>> Regards, >>> Christian. 
>>> >>>> >>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>> --- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>> +++++++++++++++++++++- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> index 4264abc5604d..99efd8317547 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> @@ -109,6 +109,7 @@ >>>> #include "amdgpu_fdinfo.h" >>>> #include "amdgpu_mca.h" >>>> #include "amdgpu_ras.h" >>>> +#include "amdgpu_reset.h" >>>> #define MAX_GPU_INSTANCE 16 >>>> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { >>>> bool grbm_indexed; >>>> }; >>>> -enum amd_reset_method { >>>> - AMD_RESET_METHOD_NONE = -1, >>>> - AMD_RESET_METHOD_LEGACY = 0, >>>> - AMD_RESET_METHOD_MODE0, >>>> - AMD_RESET_METHOD_MODE1, >>>> - AMD_RESET_METHOD_MODE2, >>>> - AMD_RESET_METHOD_BACO, >>>> - AMD_RESET_METHOD_PCI, >>>> -}; >>>> - >>>> struct amdgpu_video_codec_info { >>>> u32 codec_type; >>>> u32 max_width; >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> index e582f1044c0f..7fa82269c30f 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct >>>> amdgpu_device *adev, >>>> } >>>> tmp_vram_lost_counter = >>>> atomic_read(&((adev)->vram_lost_counter)); >>>> + >>>> + /* Drop all pending resets since we will 
reset now anyway */ >>>> + tmp_adev = list_first_entry(device_list_handle, struct >>>> amdgpu_device, >>>> + reset_list); >>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>> + >>>> /* Actual ASIC resets if needed.*/ >>>> /* Host driver will handle XGMI hive reset for SRIOV */ >>>> if (amdgpu_sriov_vf(adev)) { >>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct >>>> amdgpu_device *adev, >>>> } >>>> struct amdgpu_recover_work_struct { >>>> - struct work_struct base; >>>> + struct amdgpu_reset_work_struct base; >>>> struct amdgpu_device *adev; >>>> struct amdgpu_job *job; >>>> int ret; >>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>> static void amdgpu_device_queue_gpu_recover_work(struct >>>> work_struct *work) >>>> { >>>> - struct amdgpu_recover_work_struct *recover_work = >>>> container_of(work, struct amdgpu_recover_work_struct, base); >>>> + struct amdgpu_recover_work_struct *recover_work = >>>> container_of(work, struct amdgpu_recover_work_struct, base.base.work); >>>> recover_work->ret = >>>> amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); >>>> } >>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct >>>> amdgpu_device *adev, >>>> { >>>> struct amdgpu_recover_work_struct work = {.adev = adev, .job >>>> = job}; >>>> - INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); >>>> + INIT_DELAYED_WORK(&work.base.base, >>>> amdgpu_device_queue_gpu_recover_work); >>>> + INIT_LIST_HEAD(&work.base.node); >>>> if (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>> &work.base)) >>>> return -EAGAIN; >>>> - flush_work(&work.base); >>>> + flush_delayed_work(&work.base.base); >>>> + >>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>> &work.base); >>>> return work.ret; >>>> } >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>> index c80af0889773..ffddd419c351 100644 >>>> --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>> init_rwsem(&reset_domain->sem); >>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>> + mutex_init(&reset_domain->reset_lock); >>>> + >>>> return reset_domain; >>>> } >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>> index 1949dbe28a86..863ec5720fc1 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>> @@ -24,7 +24,18 @@ >>>> #ifndef __AMDGPU_RESET_H__ >>>> #define __AMDGPU_RESET_H__ >>>> -#include "amdgpu.h" >>>> + >>>> +#include <linux/atomic.h> >>>> +#include <linux/mutex.h> >>>> +#include <linux/list.h> >>>> +#include <linux/kref.h> >>>> +#include <linux/rwsem.h> >>>> +#include <linux/workqueue.h> >>>> + >>>> +struct amdgpu_device; >>>> +struct amdgpu_job; >>>> +struct amdgpu_hive_info; >>>> + >>>> enum AMDGPU_RESET_FLAGS { >>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>> AMDGPU_SKIP_HW_RESET = 1, >>>> }; >>>> + >>>> +enum amd_reset_method { >>>> + AMD_RESET_METHOD_NONE = -1, >>>> + AMD_RESET_METHOD_LEGACY = 0, >>>> + AMD_RESET_METHOD_MODE0, >>>> + AMD_RESET_METHOD_MODE1, >>>> + AMD_RESET_METHOD_MODE2, >>>> + AMD_RESET_METHOD_BACO, >>>> + AMD_RESET_METHOD_PCI, >>>> +}; >>>> + >>>> struct amdgpu_reset_context { >>>> enum amd_reset_method method; >>>> struct amdgpu_device *reset_req_dev; >>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>> unsigned long flags; >>>> }; >>>> +struct amdgpu_reset_control; >>>> + >>>> struct amdgpu_reset_handler { >>>> enum amd_reset_method reset_method; >>>> struct list_head handler_list; >>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>> XGMI_HIVE >>>> }; >>>> + >>>> +struct amdgpu_reset_work_struct { >>>> + struct 
delayed_work base; >>>> + struct list_head node; >>>> +}; >>>> + >>>> struct amdgpu_reset_domain { >>>> struct kref refcount; >>>> struct workqueue_struct *wq; >>>> enum amdgpu_reset_domain_type type; >>>> struct rw_semaphore sem; >>>> atomic_t in_gpu_reset; >>>> + >>>> + struct list_head pending_works; >>>> + struct mutex reset_lock; >>>> }; >>>> @@ -113,9 +146,43 @@ static inline void >>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom >>>> } >>>> static inline bool amdgpu_reset_domain_schedule(struct >>>> amdgpu_reset_domain *domain, >>>> - struct work_struct *work) >>>> + struct amdgpu_reset_work_struct *work) >>>> { >>>> - return queue_work(domain->wq, work); >>>> + mutex_lock(&domain->reset_lock); >>>> + >>>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>>> + mutex_unlock(&domain->reset_lock); >>>> + return false; >>>> + } >>>> + >>>> + list_add_tail(&work->node, &domain->pending_works); >>>> + mutex_unlock(&domain->reset_lock); >>>> + >>>> + return true; >>>> +} >>>> + >>>> +static inline void amdgpu_reset_domain_del_pendning_work(struct >>>> amdgpu_reset_domain *domain, >>>> + struct amdgpu_reset_work_struct *work) >>>> +{ >>>> + mutex_lock(&domain->reset_lock); >>>> + list_del_init(&work->node); >>>> + mutex_unlock(&domain->reset_lock); >>>> +} >>>> + >>>> +static inline void amdgpu_reset_pending_list(struct >>>> amdgpu_reset_domain *domain) >>>> +{ >>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>> + >>>> + mutex_lock(&domain->reset_lock); >>>> + list_for_each_entry_safe(entry, tmp, &domain->pending_works, >>>> node) { >>>> + >>>> + list_del_init(&entry->node); >>>> + >>>> + /* Stop any other related pending resets */ >>>> + cancel_delayed_work(&entry->base); >>>> + } >>>> + >>>> + mutex_unlock(&domain->reset_lock); >>>> } >>>> void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain >>>> *reset_domain); >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>> 
index 239f232f9c02..574e870d3064 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>> @@ -25,6 +25,7 @@ >>>> #define AMDGPU_VIRT_H >>>> #include "amdgv_sriovmsg.h" >>>> +#include "amdgpu_reset.h" >>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS is >>>> sr-iov ready */ >>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov is >>>> enabled on this GPU */ >>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>> uint32_t reg_val_offs; >>>> struct amdgpu_irq_src ack_irq; >>>> struct amdgpu_irq_src rcv_irq; >>>> - struct work_struct flr_work; >>>> + struct amdgpu_reset_work_struct flr_work; >>>> struct amdgpu_mm_table mm_table; >>>> const struct amdgpu_virt_ops *ops; >>>> struct amdgpu_vf_error_buffer vf_errors; >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>> index b81acf59870c..f3d1c2be9292 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>> @@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct >>>> amdgpu_device *adev, >>>> static void xgpu_ai_mailbox_flr_work(struct work_struct *work) >>>> { >>>> - struct amdgpu_virt *virt = container_of(work, struct >>>> amdgpu_virt, flr_work); >>>> + struct amdgpu_virt *virt = container_of(work, struct >>>> amdgpu_virt, flr_work.base.work); >>>> struct amdgpu_device *adev = container_of(virt, struct >>>> amdgpu_device, virt); >>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >>>> amdgpu_device *adev) >>>> return r; >>>> } >>>> - INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); >>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>> xgpu_ai_mailbox_flr_work); >>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>> return 0; >>>> } >>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>> amdgpu_device *adev) >>>> { >>>> amdgpu_irq_put(adev, 
&adev->virt.ack_irq, 0); >>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>> + >>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>> &adev->virt.flr_work); >>>> } >>>> static int xgpu_ai_request_init_data(struct amdgpu_device *adev) >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>> @@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct >>>> amdgpu_device *adev, >>>> static void xgpu_nv_mailbox_flr_work(struct work_struct *work) >>>> { >>>> - struct amdgpu_virt *virt = container_of(work, struct >>>> amdgpu_virt, flr_work); >>>> + struct amdgpu_virt *virt = container_of(work, struct >>>> amdgpu_virt, flr_work.base.work); >>>> struct amdgpu_device *adev = container_of(virt, struct >>>> amdgpu_device, virt); >>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>>> amdgpu_device *adev) >>>> return r; >>>> } >>>> - INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); >>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>> xgpu_nv_mailbox_flr_work); >>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>> return 0; >>>> } >>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>>> amdgpu_device *adev) >>>> { >>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>> + >>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>> &adev->virt.flr_work); >>>> } >>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>> @@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct 
>>>> amdgpu_device *adev, >>>> static void xgpu_vi_mailbox_flr_work(struct work_struct *work) >>>> { >>>> - struct amdgpu_virt *virt = container_of(work, struct >>>> amdgpu_virt, flr_work); >>>> + struct amdgpu_virt *virt = container_of(work, struct >>>> amdgpu_virt, flr_work.base.work); >>>> struct amdgpu_device *adev = container_of(virt, struct >>>> amdgpu_device, virt); >>>> /* wait until RCV_MSG become 3 */ >>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>> amdgpu_device *adev) >>>> return r; >>>> } >>>> - INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work); >>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>> xgpu_vi_mailbox_flr_work); >>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>> return 0; >>>> } >>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>>> amdgpu_device *adev) >>>> { >>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>> + >>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>> &adev->virt.flr_work); >>>> } >>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>> > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-05 13:54 ` Andrey Grodzovsky @ 2022-05-05 15:06 ` Christian König 2022-05-05 18:57 ` Andrey Grodzovsky 0 siblings, 1 reply; 40+ messages in thread From: Christian König @ 2022-05-05 15:06 UTC (permalink / raw) To: Andrey Grodzovsky, amd-gfx; +Cc: Bai Zoy, lijo.lazar Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky: > > On 2022-05-05 09:23, Christian König wrote: >> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky: >>> On 2022-05-05 06:09, Christian König wrote: >>> >>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky: >>>>> Problem: >>>>> During hive reset caused by command timing out on a ring >>>>> extra resets are triggered by KFD which is >>>>> unable to access registers on the resetting ASIC. >>>>> >>>>> Fix: Rework GPU reset to use a list of pending reset jobs >>>>> such that the first reset job that actually resets the entire >>>>> reset domain will cancel all those pending redundant resets. >>>>> >>>>> This is in line with what we already do for redundant TDRs >>>>> in scheduler code. >>>> >>>> Mhm, why exactly do you need the extra linked list then? >>>> >>>> Let's talk about that on our call today. >>> >>> >>> Going to miss it as you know, and also this is the place to discuss >>> technical questions anyway so - >> >> Good point. >> >>> It's needed because those other resets are not time out handlers >>> that are governed by the scheduler >>> but rather external resets that are triggered by such clients as >>> KFD, RAS and sysfs. Scheduler has no >>> knowledge of them (and should not have) but they are serialized into >>> same wq as the TO handlers >>> from the scheduler. It just happens that TO triggered reset causes >>> in turn another reset (from KFD in >>> this case) and we want to prevent this second reset from taking >>> place just as we want to avoid multiple >>> TO resets to take place in scheduler code. >> >> Yeah, but why do you need multiple workers? 
>> >> You have a single worker for the GPU reset not triggered by the >> scheduler in your adev and cancel that at the end of the reset procedure. >> >> If anybody thinks it needs to trigger another reset while in reset >> (which is actually a small design bug separately) the reset will just >> be canceled in the same way we cancel the scheduler resets. >> >> Christian. > > > Had this in mind at first but then I realized that each client (RAS, > KFD and sysfs) will want to fill his own data for the work (see > amdgpu_device_gpu_recover) - for XGMI hive each will want to set his > own adev (which is fine if you set a work per adev as you suggest) but > also each client might want (they all put NULL there but in theory in > the future) also set his own bad job value and here you might have a > collision. Yeah, but that is intentional. See when we have a job that needs to be consumed by the reset handler and not overwritten or something. Additional to that keep in mind that you can't allocate any memory before or during the GPU reset nor wait for the reset to complete (so you can't allocate anything on the stack either). I don't think that concept you try here will work. Regards, Christian. > Also in general seems to me it's a cleaner approach where this logic > (the work items) are held and handled in reset_domain and are not > split in each adev or any other entity. We might want in the future to > even move the scheduler handling into reset domain since reset domain > is supposed to be a generic thing and not only for AMD. > > Andrey > > >> >>> >>> Andrey >>> >>> >>>> >>>> Regards, >>>> Christian. 
>>>> >>>>> >>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>> --- >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>> +++++++++++++++++++++- >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>> index 4264abc5604d..99efd8317547 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>> @@ -109,6 +109,7 @@ >>>>> #include "amdgpu_fdinfo.h" >>>>> #include "amdgpu_mca.h" >>>>> #include "amdgpu_ras.h" >>>>> +#include "amdgpu_reset.h" >>>>> #define MAX_GPU_INSTANCE 16 >>>>> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { >>>>> bool grbm_indexed; >>>>> }; >>>>> -enum amd_reset_method { >>>>> - AMD_RESET_METHOD_NONE = -1, >>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>> - AMD_RESET_METHOD_MODE0, >>>>> - AMD_RESET_METHOD_MODE1, >>>>> - AMD_RESET_METHOD_MODE2, >>>>> - AMD_RESET_METHOD_BACO, >>>>> - AMD_RESET_METHOD_PCI, >>>>> -}; >>>>> - >>>>> struct amdgpu_video_codec_info { >>>>> u32 codec_type; >>>>> u32 max_width; >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct >>>>> amdgpu_device *adev, >>>>> } >>>>> tmp_vram_lost_counter = >>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>> 
+ >>>>> + /* Drop all pending resets since we will reset now anyway */ >>>>> + tmp_adev = list_first_entry(device_list_handle, struct >>>>> amdgpu_device, >>>>> + reset_list); >>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>> + >>>>> /* Actual ASIC resets if needed.*/ >>>>> /* Host driver will handle XGMI hive reset for SRIOV */ >>>>> if (amdgpu_sriov_vf(adev)) { >>>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct >>>>> amdgpu_device *adev, >>>>> } >>>>> struct amdgpu_recover_work_struct { >>>>> - struct work_struct base; >>>>> + struct amdgpu_reset_work_struct base; >>>>> struct amdgpu_device *adev; >>>>> struct amdgpu_job *job; >>>>> int ret; >>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>> static void amdgpu_device_queue_gpu_recover_work(struct >>>>> work_struct *work) >>>>> { >>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>> container_of(work, struct amdgpu_recover_work_struct, base); >>>>> + struct amdgpu_recover_work_struct *recover_work = >>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>> base.base.work); >>>>> recover_work->ret = >>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); >>>>> } >>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct >>>>> amdgpu_device *adev, >>>>> { >>>>> struct amdgpu_recover_work_struct work = {.adev = adev, .job >>>>> = job}; >>>>> - INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); >>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>> amdgpu_device_queue_gpu_recover_work); >>>>> + INIT_LIST_HEAD(&work.base.node); >>>>> if (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>> &work.base)) >>>>> return -EAGAIN; >>>>> - flush_work(&work.base); >>>>> + flush_delayed_work(&work.base.base); >>>>> + >>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>> &work.base); >>>>> return work.ret; >>>>> } >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>> 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>> index c80af0889773..ffddd419c351 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>> init_rwsem(&reset_domain->sem); >>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>> + mutex_init(&reset_domain->reset_lock); >>>>> + >>>>> return reset_domain; >>>>> } >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>> @@ -24,7 +24,18 @@ >>>>> #ifndef __AMDGPU_RESET_H__ >>>>> #define __AMDGPU_RESET_H__ >>>>> -#include "amdgpu.h" >>>>> + >>>>> +#include <linux/atomic.h> >>>>> +#include <linux/mutex.h> >>>>> +#include <linux/list.h> >>>>> +#include <linux/kref.h> >>>>> +#include <linux/rwsem.h> >>>>> +#include <linux/workqueue.h> >>>>> + >>>>> +struct amdgpu_device; >>>>> +struct amdgpu_job; >>>>> +struct amdgpu_hive_info; >>>>> + >>>>> enum AMDGPU_RESET_FLAGS { >>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>> }; >>>>> + >>>>> +enum amd_reset_method { >>>>> + AMD_RESET_METHOD_NONE = -1, >>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>> + AMD_RESET_METHOD_MODE0, >>>>> + AMD_RESET_METHOD_MODE1, >>>>> + AMD_RESET_METHOD_MODE2, >>>>> + AMD_RESET_METHOD_BACO, >>>>> + AMD_RESET_METHOD_PCI, >>>>> +}; >>>>> + >>>>> struct amdgpu_reset_context { >>>>> enum amd_reset_method method; >>>>> struct amdgpu_device *reset_req_dev; >>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>> unsigned long flags; >>>>> }; >>>>> +struct amdgpu_reset_control; >>>>> + >>>>> struct amdgpu_reset_handler { >>>>> enum amd_reset_method reset_method; >>>>> struct list_head 
handler_list; >>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>> XGMI_HIVE >>>>> }; >>>>> + >>>>> +struct amdgpu_reset_work_struct { >>>>> + struct delayed_work base; >>>>> + struct list_head node; >>>>> +}; >>>>> + >>>>> struct amdgpu_reset_domain { >>>>> struct kref refcount; >>>>> struct workqueue_struct *wq; >>>>> enum amdgpu_reset_domain_type type; >>>>> struct rw_semaphore sem; >>>>> atomic_t in_gpu_reset; >>>>> + >>>>> + struct list_head pending_works; >>>>> + struct mutex reset_lock; >>>>> }; >>>>> @@ -113,9 +146,43 @@ static inline void >>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom >>>>> } >>>>> static inline bool amdgpu_reset_domain_schedule(struct >>>>> amdgpu_reset_domain *domain, >>>>> - struct work_struct *work) >>>>> + struct amdgpu_reset_work_struct *work) >>>>> { >>>>> - return queue_work(domain->wq, work); >>>>> + mutex_lock(&domain->reset_lock); >>>>> + >>>>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>>>> + mutex_unlock(&domain->reset_lock); >>>>> + return false; >>>>> + } >>>>> + >>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>> + mutex_unlock(&domain->reset_lock); >>>>> + >>>>> + return true; >>>>> +} >>>>> + >>>>> +static inline void amdgpu_reset_domain_del_pendning_work(struct >>>>> amdgpu_reset_domain *domain, >>>>> + struct amdgpu_reset_work_struct *work) >>>>> +{ >>>>> + mutex_lock(&domain->reset_lock); >>>>> + list_del_init(&work->node); >>>>> + mutex_unlock(&domain->reset_lock); >>>>> +} >>>>> + >>>>> +static inline void amdgpu_reset_pending_list(struct >>>>> amdgpu_reset_domain *domain) >>>>> +{ >>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>> + >>>>> + mutex_lock(&domain->reset_lock); >>>>> + list_for_each_entry_safe(entry, tmp, &domain->pending_works, >>>>> node) { >>>>> + >>>>> + list_del_init(&entry->node); >>>>> + >>>>> + /* Stop any other related pending resets */ >>>>> + cancel_delayed_work(&entry->base); >>>>> + } >>>>> + >>>>> + 
mutex_unlock(&domain->reset_lock); >>>>> } >>>>> void amdgpu_device_lock_reset_domain(struct >>>>> amdgpu_reset_domain *reset_domain); >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>> index 239f232f9c02..574e870d3064 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>> @@ -25,6 +25,7 @@ >>>>> #define AMDGPU_VIRT_H >>>>> #include "amdgv_sriovmsg.h" >>>>> +#include "amdgpu_reset.h" >>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS is >>>>> sr-iov ready */ >>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov is >>>>> enabled on this GPU */ >>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>> uint32_t reg_val_offs; >>>>> struct amdgpu_irq_src ack_irq; >>>>> struct amdgpu_irq_src rcv_irq; >>>>> - struct work_struct flr_work; >>>>> + struct amdgpu_reset_work_struct flr_work; >>>>> struct amdgpu_mm_table mm_table; >>>>> const struct amdgpu_virt_ops *ops; >>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>> @@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct >>>>> amdgpu_device *adev, >>>>> static void xgpu_ai_mailbox_flr_work(struct work_struct *work) >>>>> { >>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>> amdgpu_virt, flr_work); >>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>> amdgpu_virt, flr_work.base.work); >>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>> amdgpu_device, virt); >>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >>>>> amdgpu_device *adev) >>>>> return r; >>>>> } >>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); 
>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>> xgpu_ai_mailbox_flr_work); >>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>> return 0; >>>>> } >>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>>> amdgpu_device *adev) >>>>> { >>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>> + >>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>> &adev->virt.flr_work); >>>>> } >>>>> static int xgpu_ai_request_init_data(struct amdgpu_device *adev) >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>> @@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct >>>>> amdgpu_device *adev, >>>>> static void xgpu_nv_mailbox_flr_work(struct work_struct *work) >>>>> { >>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>> amdgpu_virt, flr_work); >>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>> amdgpu_virt, flr_work.base.work); >>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>> amdgpu_device, virt); >>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>>>> amdgpu_device *adev) >>>>> return r; >>>>> } >>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); >>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>> xgpu_nv_mailbox_flr_work); >>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>> return 0; >>>>> } >>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>>>> amdgpu_device *adev) >>>>> { >>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>> + >>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>> &adev->virt.flr_work); >>>>> } >>>>> const struct amdgpu_virt_ops 
xgpu_nv_virt_ops = { >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>> @@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct >>>>> amdgpu_device *adev, >>>>> static void xgpu_vi_mailbox_flr_work(struct work_struct *work) >>>>> { >>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>> amdgpu_virt, flr_work); >>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>> amdgpu_virt, flr_work.base.work); >>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>> amdgpu_device, virt); >>>>> /* wait until RCV_MSG become 3 */ >>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>> amdgpu_device *adev) >>>>> return r; >>>>> } >>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work); >>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>> xgpu_vi_mailbox_flr_work); >>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>> return 0; >>>>> } >>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>>>> amdgpu_device *adev) >>>>> { >>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>> + >>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>> &adev->virt.flr_work); >>>>> } >>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>> >> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-05 15:06 ` Christian König @ 2022-05-05 18:57 ` Andrey Grodzovsky 2022-05-05 19:49 ` Felix Kuehling 0 siblings, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-05 18:57 UTC (permalink / raw) To: Christian König, amd-gfx; +Cc: Bai Zoy, lijo.lazar On 2022-05-05 11:06, Christian König wrote: > Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky: >> >> On 2022-05-05 09:23, Christian König wrote: >>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky: >>>> On 2022-05-05 06:09, Christian König wrote: >>>> >>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky: >>>>>> Problem: >>>>>> During hive reset caused by command timing out on a ring >>>>>> extra resets are generated by triggered by KFD which is >>>>>> unable to accesses registers on the resetting ASIC. >>>>>> >>>>>> Fix: Rework GPU reset to use a list of pending reset jobs >>>>>> such that the first reset jobs that actaully resets the entire >>>>>> reset domain will cancel all those pending redundant resets. >>>>>> >>>>>> This is in line with what we already do for redundant TDRs >>>>>> in scheduler code. >>>>> >>>>> Mhm, why exactly do you need the extra linked list then? >>>>> >>>>> Let's talk about that on our call today. >>>> >>>> >>>> Going to miss it as you know, and also this is the place to discuss >>>> technical questions anyway so - >>> >>> Good point. >>> >>>> It's needed because those other resets are not time out handlers >>>> that are governed by the scheduler >>>> but rather external resets that are triggered by such clients as >>>> KFD, RAS and sysfs. Scheduler has no >>>> knowledge of them (and should not have) but they are serialized >>>> into same wq as the TO handlers >>>> from the scheduler. 
It just happens that TO triggered reset causes >>>> in turn another reset (from KFD in >>>> this case) and we want to prevent this second reset from taking >>>> place just as we want to avoid multiple >>>> TO resets to take place in scheduler code. >>> >>> Yeah, but why do you need multiple workers? >>> >>> You have a single worker for the GPU reset not triggered by the >>> scheduler in your adev and cancel that at the end of the reset >>> procedure. >>> >>> If anybody thinks it needs to trigger another reset while in reset >>> (which is actually a small design bug separately) the reset will >>> just be canceled in the same way we cancel the scheduler resets. >>> >>> Christian. >> >> >> Had this in mind at first but then I realized that each client (RAS, >> KFD and sysfs) will want to fill his own data for the work (see >> amdgpu_device_gpu_recover) - for XGMI hive each will want to set his >> own adev (which is fine if you set a work per adev as you suggest) >> but also each client might want (they all put NULL there but in >> theory in the future) also set his own bad job value and here you >> might have a collision. > > Yeah, but that is intentional. See when we have a job that needs to be > consumed by the reset handler and not overwritten or something. I am not sure why this is a requirement, multiple clients can decide concurrently to trigger a reset for some reason (possibly independent reasons) hence they cannot share the same work struct to pass to it their data. > > > Additional to that keep in mind that you can't allocate any memory > before or during the GPU reset nor wait for the reset to complete (so > you can't allocate anything on the stack either). There is no dynamic allocation here, regarding stack allocations - we do it all the time when we call functions, even during GPU resets, how on stack allocation of work struct in amdgpu_device_gpu_recover is different from any other local variable we allocate in any function we call ? 
I am also not sure why it's not allowed to wait for reset to complete ? Also, see in amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - the caller expects the reset to complete before he returns. I can probably work around it in RAS code by calling atomic_set(&ras->in_recovery, 0) from some callback within actual reset function but regarding sysfs it actually expects a result returned indicating whether the call was successful or not. Andrey > > I don't think that concept you try here will work. > > Regards, > Christian. > >> Also in general seems to me it's cleaner approach where this logic >> (the work items) are held and handled in reset_domain and are not >> split in each adev or any other entity. We might want in the future >> to even move the scheduler handling into reset domain since reset >> domain is supposed to be a generic things and not only or AMD. >> >> Andrey >> >> >>> >>>> >>>> Andrey >>>> >>>> >>>>> >>>>> Regards, >>>>> Christian. >>>>> >>>>>> >>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>> --- >>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>> +++++++++++++++++++++- >>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>> >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>> @@ -109,6 +109,7 @@ >>>>>> #include "amdgpu_fdinfo.h" >>>>>> #include "amdgpu_mca.h" >>>>>> #include 
"amdgpu_ras.h" >>>>>> +#include "amdgpu_reset.h" >>>>>> #define MAX_GPU_INSTANCE 16 >>>>>> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { >>>>>> bool grbm_indexed; >>>>>> }; >>>>>> -enum amd_reset_method { >>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>> - AMD_RESET_METHOD_MODE0, >>>>>> - AMD_RESET_METHOD_MODE1, >>>>>> - AMD_RESET_METHOD_MODE2, >>>>>> - AMD_RESET_METHOD_BACO, >>>>>> - AMD_RESET_METHOD_PCI, >>>>>> -}; >>>>>> - >>>>>> struct amdgpu_video_codec_info { >>>>>> u32 codec_type; >>>>>> u32 max_width; >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct >>>>>> amdgpu_device *adev, >>>>>> } >>>>>> tmp_vram_lost_counter = >>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>> + >>>>>> + /* Drop all pending resets since we will reset now anyway */ >>>>>> + tmp_adev = list_first_entry(device_list_handle, struct >>>>>> amdgpu_device, >>>>>> + reset_list); >>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>> + >>>>>> /* Actual ASIC resets if needed.*/ >>>>>> /* Host driver will handle XGMI hive reset for SRIOV */ >>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct >>>>>> amdgpu_device *adev, >>>>>> } >>>>>> struct amdgpu_recover_work_struct { >>>>>> - struct work_struct base; >>>>>> + struct amdgpu_reset_work_struct base; >>>>>> struct amdgpu_device *adev; >>>>>> struct amdgpu_job *job; >>>>>> int ret; >>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>>> static void amdgpu_device_queue_gpu_recover_work(struct >>>>>> work_struct *work) >>>>>> { >>>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>>> container_of(work, struct 
amdgpu_recover_work_struct, base); >>>>>> + struct amdgpu_recover_work_struct *recover_work = >>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>> base.base.work); >>>>>> recover_work->ret = >>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>> recover_work->job); >>>>>> } >>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct >>>>>> amdgpu_device *adev, >>>>>> { >>>>>> struct amdgpu_recover_work_struct work = {.adev = adev, >>>>>> .job = job}; >>>>>> - INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); >>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>> if (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>> &work.base)) >>>>>> return -EAGAIN; >>>>>> - flush_work(&work.base); >>>>>> + flush_delayed_work(&work.base.base); >>>>>> + >>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>> &work.base); >>>>>> return work.ret; >>>>>> } >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>> index c80af0889773..ffddd419c351 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>> init_rwsem(&reset_domain->sem); >>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>> + >>>>>> return reset_domain; >>>>>> } >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>> @@ -24,7 +24,18 @@ >>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>> #define __AMDGPU_RESET_H__ >>>>>> 
-#include "amdgpu.h" >>>>>> + >>>>>> +#include <linux/atomic.h> >>>>>> +#include <linux/mutex.h> >>>>>> +#include <linux/list.h> >>>>>> +#include <linux/kref.h> >>>>>> +#include <linux/rwsem.h> >>>>>> +#include <linux/workqueue.h> >>>>>> + >>>>>> +struct amdgpu_device; >>>>>> +struct amdgpu_job; >>>>>> +struct amdgpu_hive_info; >>>>>> + >>>>>> enum AMDGPU_RESET_FLAGS { >>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>> }; >>>>>> + >>>>>> +enum amd_reset_method { >>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>> + AMD_RESET_METHOD_MODE0, >>>>>> + AMD_RESET_METHOD_MODE1, >>>>>> + AMD_RESET_METHOD_MODE2, >>>>>> + AMD_RESET_METHOD_BACO, >>>>>> + AMD_RESET_METHOD_PCI, >>>>>> +}; >>>>>> + >>>>>> struct amdgpu_reset_context { >>>>>> enum amd_reset_method method; >>>>>> struct amdgpu_device *reset_req_dev; >>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>> unsigned long flags; >>>>>> }; >>>>>> +struct amdgpu_reset_control; >>>>>> + >>>>>> struct amdgpu_reset_handler { >>>>>> enum amd_reset_method reset_method; >>>>>> struct list_head handler_list; >>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>> XGMI_HIVE >>>>>> }; >>>>>> + >>>>>> +struct amdgpu_reset_work_struct { >>>>>> + struct delayed_work base; >>>>>> + struct list_head node; >>>>>> +}; >>>>>> + >>>>>> struct amdgpu_reset_domain { >>>>>> struct kref refcount; >>>>>> struct workqueue_struct *wq; >>>>>> enum amdgpu_reset_domain_type type; >>>>>> struct rw_semaphore sem; >>>>>> atomic_t in_gpu_reset; >>>>>> + >>>>>> + struct list_head pending_works; >>>>>> + struct mutex reset_lock; >>>>>> }; >>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom >>>>>> } >>>>>> static inline bool amdgpu_reset_domain_schedule(struct >>>>>> amdgpu_reset_domain *domain, >>>>>> - struct work_struct *work) >>>>>> + struct amdgpu_reset_work_struct *work) >>>>>> { >>>>>> - return 
queue_work(domain->wq, work); >>>>>> + mutex_lock(&domain->reset_lock); >>>>>> + >>>>>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>>>>> + mutex_unlock(&domain->reset_lock); >>>>>> + return false; >>>>>> + } >>>>>> + >>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>> + mutex_unlock(&domain->reset_lock); >>>>>> + >>>>>> + return true; >>>>>> +} >>>>>> + >>>>>> +static inline void amdgpu_reset_domain_del_pendning_work(struct >>>>>> amdgpu_reset_domain *domain, >>>>>> + struct amdgpu_reset_work_struct *work) >>>>>> +{ >>>>>> + mutex_lock(&domain->reset_lock); >>>>>> + list_del_init(&work->node); >>>>>> + mutex_unlock(&domain->reset_lock); >>>>>> +} >>>>>> + >>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>> amdgpu_reset_domain *domain) >>>>>> +{ >>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>> + >>>>>> + mutex_lock(&domain->reset_lock); >>>>>> + list_for_each_entry_safe(entry, tmp, &domain->pending_works, >>>>>> node) { >>>>>> + >>>>>> + list_del_init(&entry->node); >>>>>> + >>>>>> + /* Stop any other related pending resets */ >>>>>> + cancel_delayed_work(&entry->base); >>>>>> + } >>>>>> + >>>>>> + mutex_unlock(&domain->reset_lock); >>>>>> } >>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>> amdgpu_reset_domain *reset_domain); >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>> @@ -25,6 +25,7 @@ >>>>>> #define AMDGPU_VIRT_H >>>>>> #include "amdgv_sriovmsg.h" >>>>>> +#include "amdgpu_reset.h" >>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS is >>>>>> sr-iov ready */ >>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov is >>>>>> enabled on this GPU */ >>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>> uint32_t reg_val_offs; >>>>>> struct amdgpu_irq_src 
ack_irq; >>>>>> struct amdgpu_irq_src rcv_irq; >>>>>> - struct work_struct flr_work; >>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>> struct amdgpu_mm_table mm_table; >>>>>> const struct amdgpu_virt_ops *ops; >>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>> @@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct >>>>>> amdgpu_device *adev, >>>>>> static void xgpu_ai_mailbox_flr_work(struct work_struct *work) >>>>>> { >>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>> amdgpu_virt, flr_work); >>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>> amdgpu_virt, flr_work.base.work); >>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>> amdgpu_device, virt); >>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >>>>>> amdgpu_device *adev) >>>>>> return r; >>>>>> } >>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); >>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>> xgpu_ai_mailbox_flr_work); >>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>> return 0; >>>>>> } >>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>>>> amdgpu_device *adev) >>>>>> { >>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>> + >>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>> &adev->virt.flr_work); >>>>>> } >>>>>> static int xgpu_ai_request_init_data(struct amdgpu_device *adev) >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>> +++ 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>> @@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct >>>>>> amdgpu_device *adev, >>>>>> static void xgpu_nv_mailbox_flr_work(struct work_struct *work) >>>>>> { >>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>> amdgpu_virt, flr_work); >>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>> amdgpu_virt, flr_work.base.work); >>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>> amdgpu_device, virt); >>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>>>>> amdgpu_device *adev) >>>>>> return r; >>>>>> } >>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); >>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>> xgpu_nv_mailbox_flr_work); >>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>> return 0; >>>>>> } >>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>>>>> amdgpu_device *adev) >>>>>> { >>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>> + >>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>> &adev->virt.flr_work); >>>>>> } >>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>> @@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct >>>>>> amdgpu_device *adev, >>>>>> static void xgpu_vi_mailbox_flr_work(struct work_struct *work) >>>>>> { >>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>> amdgpu_virt, flr_work); >>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>> amdgpu_virt, flr_work.base.work); >>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>> amdgpu_device, 
virt); >>>>>> /* wait until RCV_MSG become 3 */ >>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>>> amdgpu_device *adev) >>>>>> return r; >>>>>> } >>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work); >>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>> xgpu_vi_mailbox_flr_work); >>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>> return 0; >>>>>> } >>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>>>>> amdgpu_device *adev) >>>>>> { >>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>> + >>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>> &adev->virt.flr_work); >>>>>> } >>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>> >>> > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-05 18:57 ` Andrey Grodzovsky @ 2022-05-05 19:49 ` Felix Kuehling 2022-05-05 21:47 ` Andrey Grodzovsky 0 siblings, 1 reply; 40+ messages in thread From: Felix Kuehling @ 2022-05-05 19:49 UTC (permalink / raw) To: Andrey Grodzovsky, Christian König, amd-gfx; +Cc: Bai Zoy, lijo.lazar Am 2022-05-05 um 14:57 schrieb Andrey Grodzovsky: > > On 2022-05-05 11:06, Christian König wrote: >> Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky: >>> >>> On 2022-05-05 09:23, Christian König wrote: >>>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky: >>>>> On 2022-05-05 06:09, Christian König wrote: >>>>> >>>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky: >>>>>>> Problem: >>>>>>> During hive reset caused by command timing out on a ring >>>>>>> extra resets are generated by triggered by KFD which is >>>>>>> unable to accesses registers on the resetting ASIC. >>>>>>> >>>>>>> Fix: Rework GPU reset to use a list of pending reset jobs >>>>>>> such that the first reset jobs that actaully resets the entire >>>>>>> reset domain will cancel all those pending redundant resets. >>>>>>> >>>>>>> This is in line with what we already do for redundant TDRs >>>>>>> in scheduler code. >>>>>> >>>>>> Mhm, why exactly do you need the extra linked list then? >>>>>> >>>>>> Let's talk about that on our call today. >>>>> >>>>> >>>>> Going to miss it as you know, and also this is the place to >>>>> discuss technical questions anyway so - >>>> >>>> Good point. >>>> >>>>> It's needed because those other resets are not time out handlers >>>>> that are governed by the scheduler >>>>> but rather external resets that are triggered by such clients as >>>>> KFD, RAS and sysfs. Scheduler has no >>>>> knowledge of them (and should not have) but they are serialized >>>>> into same wq as the TO handlers >>>>> from the scheduler. 
It just happens that TO triggered reset causes >>>>> in turn another reset (from KFD in >>>>> this case) and we want to prevent this second reset from taking >>>>> place just as we want to avoid multiple >>>>> TO resets to take place in scheduler code. >>>> >>>> Yeah, but why do you need multiple workers? >>>> >>>> You have a single worker for the GPU reset not triggered by the >>>> scheduler in you adev and cancel that at the end of the reset >>>> procedure. >>>> >>>> If anybody things it needs to trigger another reset while in reset >>>> (which is actually a small design bug separately) the reset will >>>> just be canceled in the same way we cancel the scheduler resets. >>>> >>>> Christian. >>> >>> >>> Had this in mind at first but then I realized that each client (RAS, >>> KFD and sysfs) will want to fill his own data for the work (see >>> amdgpu_device_gpu_recover) - for XGMI hive each will want to set his >>> own adev (which is fine if you set a work per adev as you suggest) >>> but also each client might want (they all put NULL there but in >>> theory in the future) also set his own bad job value and here you >>> might have a collision. >> >> Yeah, but that is intentional. See when we have a job that needs to >> be consumed by the reset handler and not overwritten or something. > > > I am not sure why this is a requirement, multiple clients can decide > concurrently to trigger a reset for some reason (possibly independent > reasons) hence they cannot share same work struct to pass to it their > data. If those concurrent clients could detect that a reset was already in progress, you wouldn't need the complexity of multiple work structs being scheduled. You could simply return without triggering another reset. I'd put the reset work struct into the reset_domain struct. That way you'd have exactly one worker for the reset domain. You could implement a lock-less scheme to decide whether you need to schedule a reset, e.g. 
using an atomic counter in the shared work struct that gets incremented when a client wants to trigger a reset (atomic_add_return). If that counter is exactly 1 after incrementing, you need to fill in the rest of the work struct and schedule the work. Otherwise, it's already scheduled (or another client is in the process of scheduling it) and you just return. When the worker finishes (after confirming a successful reset), it resets the counter to 0, so the next client requesting a reset will schedule the worker again. Regards, Felix > > >> >> >> Additional to that keep in mind that you can't allocate any memory >> before or during the GPU reset nor wait for the reset to complete (so >> you can't allocate anything on the stack either). > > > There is no dynamic allocation here, regarding stack allocations - we > do it all the time when we call functions, even during GPU resets, how > on stack allocation of work struct in amdgpu_device_gpu_recover is > different from any other local variable we allocate in any function we > call ? > > I am also not sure why it's not allowed to wait for reset to complete > ? Also, see in amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - > the caller expects the reset to complete before he returns. I can > probably work around it in RAS code by calling > atomic_set(&ras->in_recovery, 0) from some callback within actual > reset function but regarding sysfs it actually expects a result > returned indicating whether the call was successful or not. > > Andrey > > >> >> I don't think that concept you try here will work. >> >> Regards, >> Christian. >> >>> Also in general seems to me it's cleaner approach where this logic >>> (the work items) are held and handled in reset_domain and are not >>> split in each adev or any other entity. We might want in the future >>> to even move the scheduler handling into reset domain since reset >>> domain is supposed to be a generic things and not only or AMD. 
>>> >>> Andrey >>> >>> >>>> >>>>> >>>>> Andrey >>>>> >>>>> >>>>>> >>>>>> Regards, >>>>>> Christian. >>>>>> >>>>>>> >>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>> --- >>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>> +++++++++++++++++++++- >>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>> >>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>> @@ -109,6 +109,7 @@ >>>>>>> #include "amdgpu_fdinfo.h" >>>>>>> #include "amdgpu_mca.h" >>>>>>> #include "amdgpu_ras.h" >>>>>>> +#include "amdgpu_reset.h" >>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { >>>>>>> bool grbm_indexed; >>>>>>> }; >>>>>>> -enum amd_reset_method { >>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>> -}; >>>>>>> - >>>>>>> struct amdgpu_video_codec_info { >>>>>>> u32 codec_type; >>>>>>> u32 max_width; >>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>> +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct >>>>>>> amdgpu_device *adev, >>>>>>> } >>>>>>> tmp_vram_lost_counter = >>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>> + >>>>>>> + /* Drop all pending resets since we will reset now anyway */ >>>>>>> + tmp_adev = list_first_entry(device_list_handle, struct >>>>>>> amdgpu_device, >>>>>>> + reset_list); >>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>> + >>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>> /* Host driver will handle XGMI hive reset for SRIOV */ >>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct >>>>>>> amdgpu_device *adev, >>>>>>> } >>>>>>> struct amdgpu_recover_work_struct { >>>>>>> - struct work_struct base; >>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>> struct amdgpu_device *adev; >>>>>>> struct amdgpu_job *job; >>>>>>> int ret; >>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>>>> static void amdgpu_device_queue_gpu_recover_work(struct >>>>>>> work_struct *work) >>>>>>> { >>>>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>>>> container_of(work, struct amdgpu_recover_work_struct, base); >>>>>>> + struct amdgpu_recover_work_struct *recover_work = >>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>> base.base.work); >>>>>>> recover_work->ret = >>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>> recover_work->job); >>>>>>> } >>>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct >>>>>>> amdgpu_device *adev, >>>>>>> { >>>>>>> struct amdgpu_recover_work_struct work = {.adev = adev, >>>>>>> .job = job}; >>>>>>> - INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); >>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>> if (!amdgpu_reset_domain_schedule(adev->reset_domain, 
>>>>>>> &work.base)) >>>>>>> return -EAGAIN; >>>>>>> - flush_work(&work.base); >>>>>>> + flush_delayed_work(&work.base.base); >>>>>>> + >>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>> &work.base); >>>>>>> return work.ret; >>>>>>> } >>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>> init_rwsem(&reset_domain->sem); >>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>> + >>>>>>> return reset_domain; >>>>>>> } >>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>> @@ -24,7 +24,18 @@ >>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>> #define __AMDGPU_RESET_H__ >>>>>>> -#include "amdgpu.h" >>>>>>> + >>>>>>> +#include <linux/atomic.h> >>>>>>> +#include <linux/mutex.h> >>>>>>> +#include <linux/list.h> >>>>>>> +#include <linux/kref.h> >>>>>>> +#include <linux/rwsem.h> >>>>>>> +#include <linux/workqueue.h> >>>>>>> + >>>>>>> +struct amdgpu_device; >>>>>>> +struct amdgpu_job; >>>>>>> +struct amdgpu_hive_info; >>>>>>> + >>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>> }; >>>>>>> + >>>>>>> +enum amd_reset_method { >>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>> + 
AMD_RESET_METHOD_BACO, >>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>> +}; >>>>>>> + >>>>>>> struct amdgpu_reset_context { >>>>>>> enum amd_reset_method method; >>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>> unsigned long flags; >>>>>>> }; >>>>>>> +struct amdgpu_reset_control; >>>>>>> + >>>>>>> struct amdgpu_reset_handler { >>>>>>> enum amd_reset_method reset_method; >>>>>>> struct list_head handler_list; >>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>> XGMI_HIVE >>>>>>> }; >>>>>>> + >>>>>>> +struct amdgpu_reset_work_struct { >>>>>>> + struct delayed_work base; >>>>>>> + struct list_head node; >>>>>>> +}; >>>>>>> + >>>>>>> struct amdgpu_reset_domain { >>>>>>> struct kref refcount; >>>>>>> struct workqueue_struct *wq; >>>>>>> enum amdgpu_reset_domain_type type; >>>>>>> struct rw_semaphore sem; >>>>>>> atomic_t in_gpu_reset; >>>>>>> + >>>>>>> + struct list_head pending_works; >>>>>>> + struct mutex reset_lock; >>>>>>> }; >>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom >>>>>>> } >>>>>>> static inline bool amdgpu_reset_domain_schedule(struct >>>>>>> amdgpu_reset_domain *domain, >>>>>>> - struct work_struct *work) >>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>> { >>>>>>> - return queue_work(domain->wq, work); >>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>> + >>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>> + return false; >>>>>>> + } >>>>>>> + >>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>> + >>>>>>> + return true; >>>>>>> +} >>>>>>> + >>>>>>> +static inline void amdgpu_reset_domain_del_pendning_work(struct >>>>>>> amdgpu_reset_domain *domain, >>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>> +{ >>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>> + list_del_init(&work->node); 
>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>> +} >>>>>>> + >>>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>>> amdgpu_reset_domain *domain) >>>>>>> +{ >>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>> + >>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>> &domain->pending_works, node) { >>>>>>> + >>>>>>> + list_del_init(&entry->node); >>>>>>> + >>>>>>> + /* Stop any other related pending resets */ >>>>>>> + cancel_delayed_work(&entry->base); >>>>>>> + } >>>>>>> + >>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>> } >>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>> @@ -25,6 +25,7 @@ >>>>>>> #define AMDGPU_VIRT_H >>>>>>> #include "amdgv_sriovmsg.h" >>>>>>> +#include "amdgpu_reset.h" >>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS is >>>>>>> sr-iov ready */ >>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov is >>>>>>> enabled on this GPU */ >>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>> uint32_t reg_val_offs; >>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>> - struct work_struct flr_work; >>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>> struct amdgpu_mm_table mm_table; >>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>> 
xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>> static void xgpu_ai_mailbox_flr_work(struct work_struct *work) >>>>>>> { >>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>> amdgpu_virt, flr_work); >>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>> amdgpu_device, virt); >>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >>>>>>> amdgpu_device *adev) >>>>>>> return r; >>>>>>> } >>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); >>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>> return 0; >>>>>>> } >>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>>>>> amdgpu_device *adev) >>>>>>> { >>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>> + >>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>> &adev->virt.flr_work); >>>>>>> } >>>>>>> static int xgpu_ai_request_init_data(struct amdgpu_device >>>>>>> *adev) >>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>> static void xgpu_nv_mailbox_flr_work(struct work_struct *work) >>>>>>> { >>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>> amdgpu_virt, flr_work); >>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>> amdgpu_device, virt); >>>>>>> 
int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>>>>>> amdgpu_device *adev) >>>>>>> return r; >>>>>>> } >>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); >>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>> return 0; >>>>>>> } >>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>>>>>> amdgpu_device *adev) >>>>>>> { >>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>> + >>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>> &adev->virt.flr_work); >>>>>>> } >>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>> static void xgpu_vi_mailbox_flr_work(struct work_struct *work) >>>>>>> { >>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>> amdgpu_virt, flr_work); >>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>> amdgpu_device, virt); >>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>>>> amdgpu_device *adev) >>>>>>> return r; >>>>>>> } >>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work); >>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>> return 0; >>>>>>> } >>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct 
>>>>>>> amdgpu_device *adev) >>>>>>> { >>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>> + >>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>> &adev->virt.flr_work); >>>>>>> } >>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>> >>>> >> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-05 19:49 ` Felix Kuehling @ 2022-05-05 21:47 ` Andrey Grodzovsky 2022-05-06 5:41 ` Luben Tuikov 2022-05-06 6:02 ` Lazar, Lijo 0 siblings, 2 replies; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-05 21:47 UTC (permalink / raw) To: Felix Kuehling, Christian König, amd-gfx; +Cc: Bai Zoy, lijo.lazar On 2022-05-05 15:49, Felix Kuehling wrote: > > Am 2022-05-05 um 14:57 schrieb Andrey Grodzovsky: >> >> On 2022-05-05 11:06, Christian König wrote: >>> Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky: >>>> >>>> On 2022-05-05 09:23, Christian König wrote: >>>>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky: >>>>>> On 2022-05-05 06:09, Christian König wrote: >>>>>> >>>>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky: >>>>>>>> Problem: >>>>>>>> During hive reset caused by command timing out on a ring >>>>>>>> extra resets are generated by triggered by KFD which is >>>>>>>> unable to accesses registers on the resetting ASIC. >>>>>>>> >>>>>>>> Fix: Rework GPU reset to use a list of pending reset jobs >>>>>>>> such that the first reset jobs that actaully resets the entire >>>>>>>> reset domain will cancel all those pending redundant resets. >>>>>>>> >>>>>>>> This is in line with what we already do for redundant TDRs >>>>>>>> in scheduler code. >>>>>>> >>>>>>> Mhm, why exactly do you need the extra linked list then? >>>>>>> >>>>>>> Let's talk about that on our call today. >>>>>> >>>>>> >>>>>> Going to miss it as you know, and also this is the place to >>>>>> discuss technical questions anyway so - >>>>> >>>>> Good point. >>>>> >>>>>> It's needed because those other resets are not time out handlers >>>>>> that are governed by the scheduler >>>>>> but rather external resets that are triggered by such clients as >>>>>> KFD, RAS and sysfs. Scheduler has no >>>>>> knowledge of them (and should not have) but they are serialized >>>>>> into same wq as the TO handlers >>>>>> from the scheduler. 
It just happens that TO triggered reset >>>>>> causes in turn another reset (from KFD in >>>>>> this case) and we want to prevent this second reset from taking >>>>>> place just as we want to avoid multiple >>>>>> TO resets to take place in scheduler code. >>>>> >>>>> Yeah, but why do you need multiple workers? >>>>> >>>>> You have a single worker for the GPU reset not triggered by the >>>>> scheduler in you adev and cancel that at the end of the reset >>>>> procedure. >>>>> >>>>> If anybody things it needs to trigger another reset while in reset >>>>> (which is actually a small design bug separately) the reset will >>>>> just be canceled in the same way we cancel the scheduler resets. >>>>> >>>>> Christian. >>>> >>>> >>>> Had this in mind at first but then I realized that each client >>>> (RAS, KFD and sysfs) will want to fill his own data for the work >>>> (see amdgpu_device_gpu_recover) - for XGMI hive each will want to >>>> set his own adev (which is fine if you set a work per adev as you >>>> suggest) but also each client might want (they all put NULL there >>>> but in theory in the future) also set his own bad job value and >>>> here you might have a collision. >>> >>> Yeah, but that is intentional. See when we have a job that needs to >>> be consumed by the reset handler and not overwritten or something. >> >> >> I am not sure why this is a requirement, multiple clients can decide >> concurrently to trigger a reset for some reason (possibly independent >> reasons) hence they cannot share same work struct to pass to it their >> data. > > If those concurrent clients could detect that a reset was already in > progress, you wouldn't need the complexity of multiple work structs > being scheduled. You could simply return without triggering another > reset. 
In my view the main problem here with a single work struct either at reset domain level or even adev level is that in some cases we optimize resets and don't really perform ASIC HW reset (see amdgpu_job_timedout with soft recovery and skip_hw_reset in amdgpu_device_gpu_recover_imp for the case the bad job does get signaled just before we start HW reset and we just skip this). You can see that if many different reset sources share the same work struct what can happen is that the first to obtain the lock you describe below might opt out from full HW reset because his bad job did signal for example or because his hung IP block was able to recover through SW reset but in the meantime another reset source who needed an actual HW reset just silently returned and we end up with an unhandled reset request. True that today this happens only to job timeout reset sources that are handled from within the scheduler and won't use this single work struct but no one prevents a future case for this to happen and also, if we actually want to unify scheduler timeout handlers within reset domain (which seems to me the right design approach) we won't be able to use just one work struct for this reason anyway. Andrey > > I'd put the reset work struct into the reset_domain struct. That way > you'd have exactly one worker for the reset domain. You could > implement a lock-less scheme to decide whether you need to schedule a > reset, e.g. using an atomic counter in the shared work struct that > gets incremented when a client wants to trigger a reset > (atomic_add_return). If that counter is exactly 1 after incrementing, > you need to fill in the rest of the work struct and schedule the work. > Otherwise, it's already scheduled (or another client is in the process > of scheduling it) and you just return. When the worker finishes (after > confirming a successful reset), it resets the counter to 0, so the > next client requesting a reset will schedule the worker again. 
> > Regards, > Felix > > >> >> >>> >>> >>> Additional to that keep in mind that you can't allocate any memory >>> before or during the GPU reset nor wait for the reset to complete >>> (so you can't allocate anything on the stack either). >> >> >> There is no dynamic allocation here, regarding stack allocations - we >> do it all the time when we call functions, even during GPU resets, >> how on stack allocation of work struct in amdgpu_device_gpu_recover >> is different from any other local variable we allocate in any >> function we call ? >> >> I am also not sure why it's not allowed to wait for reset to complete >> ? Also, see in amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - >> the caller expects the reset to complete before he returns. I can >> probably work around it in RAS code by calling >> atomic_set(&ras->in_recovery, 0) from some callback within actual >> reset function but regarding sysfs it actually expects a result >> returned indicating whether the call was successful or not. >> >> Andrey >> >> >>> >>> I don't think that concept you try here will work. >>> >>> Regards, >>> Christian. >>> >>>> Also in general seems to me it's cleaner approach where this logic >>>> (the work items) are held and handled in reset_domain and are not >>>> split in each adev or any other entity. We might want in the future >>>> to even move the scheduler handling into reset domain since reset >>>> domain is supposed to be a generic things and not only or AMD. >>>> >>>> Andrey >>>> >>>> >>>>> >>>>>> >>>>>> Andrey >>>>>> >>>>>> >>>>>>> >>>>>>> Regards, >>>>>>> Christian. 
>>>>>>> >>>>>>>> >>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>> --- >>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>> +++++++++++++++++++++- >>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>> >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>> #include "amdgpu_mca.h" >>>>>>>> #include "amdgpu_ras.h" >>>>>>>> +#include "amdgpu_reset.h" >>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { >>>>>>>> bool grbm_indexed; >>>>>>>> }; >>>>>>>> -enum amd_reset_method { >>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>> -}; >>>>>>>> - >>>>>>>> struct amdgpu_video_codec_info { >>>>>>>> u32 codec_type; >>>>>>>> u32 max_width; >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>> @@ -5201,6 +5201,12 @@ int 
amdgpu_device_gpu_recover_imp(struct >>>>>>>> amdgpu_device *adev, >>>>>>>> } >>>>>>>> tmp_vram_lost_counter = >>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>> + >>>>>>>> + /* Drop all pending resets since we will reset now anyway */ >>>>>>>> + tmp_adev = list_first_entry(device_list_handle, struct >>>>>>>> amdgpu_device, >>>>>>>> + reset_list); >>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>> + >>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>> /* Host driver will handle XGMI hive reset for SRIOV */ >>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct >>>>>>>> amdgpu_device *adev, >>>>>>>> } >>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>> - struct work_struct base; >>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>> struct amdgpu_device *adev; >>>>>>>> struct amdgpu_job *job; >>>>>>>> int ret; >>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>>>>> static void amdgpu_device_queue_gpu_recover_work(struct >>>>>>>> work_struct *work) >>>>>>>> { >>>>>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base); >>>>>>>> + struct amdgpu_recover_work_struct *recover_work = >>>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>>> base.base.work); >>>>>>>> recover_work->ret = >>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>> recover_work->job); >>>>>>>> } >>>>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct >>>>>>>> amdgpu_device *adev, >>>>>>>> { >>>>>>>> struct amdgpu_recover_work_struct work = {.adev = adev, >>>>>>>> .job = job}; >>>>>>>> - INIT_WORK(&work.base, >>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>> if (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>> &work.base)) 
>>>>>>>> return -EAGAIN; >>>>>>>> - flush_work(&work.base); >>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>> + >>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>> &work.base); >>>>>>>> return work.ret; >>>>>>>> } >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>> + >>>>>>>> return reset_domain; >>>>>>>> } >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>> -#include "amdgpu.h" >>>>>>>> + >>>>>>>> +#include <linux/atomic.h> >>>>>>>> +#include <linux/mutex.h> >>>>>>>> +#include <linux/list.h> >>>>>>>> +#include <linux/kref.h> >>>>>>>> +#include <linux/rwsem.h> >>>>>>>> +#include <linux/workqueue.h> >>>>>>>> + >>>>>>>> +struct amdgpu_device; >>>>>>>> +struct amdgpu_job; >>>>>>>> +struct amdgpu_hive_info; >>>>>>>> + >>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>> }; >>>>>>>> + >>>>>>>> +enum amd_reset_method { >>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>> + 
AMD_RESET_METHOD_MODE2, >>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>> +}; >>>>>>>> + >>>>>>>> struct amdgpu_reset_context { >>>>>>>> enum amd_reset_method method; >>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>> unsigned long flags; >>>>>>>> }; >>>>>>>> +struct amdgpu_reset_control; >>>>>>>> + >>>>>>>> struct amdgpu_reset_handler { >>>>>>>> enum amd_reset_method reset_method; >>>>>>>> struct list_head handler_list; >>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>> XGMI_HIVE >>>>>>>> }; >>>>>>>> + >>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>> + struct delayed_work base; >>>>>>>> + struct list_head node; >>>>>>>> +}; >>>>>>>> + >>>>>>>> struct amdgpu_reset_domain { >>>>>>>> struct kref refcount; >>>>>>>> struct workqueue_struct *wq; >>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>> struct rw_semaphore sem; >>>>>>>> atomic_t in_gpu_reset; >>>>>>>> + >>>>>>>> + struct list_head pending_works; >>>>>>>> + struct mutex reset_lock; >>>>>>>> }; >>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom >>>>>>>> } >>>>>>>> static inline bool amdgpu_reset_domain_schedule(struct >>>>>>>> amdgpu_reset_domain *domain, >>>>>>>> - struct work_struct *work) >>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>> { >>>>>>>> - return queue_work(domain->wq, work); >>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>> + >>>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>> + return false; >>>>>>>> + } >>>>>>>> + >>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>> + >>>>>>>> + return true; >>>>>>>> +} >>>>>>>> + >>>>>>>> +static inline void >>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>> amdgpu_reset_domain *domain, >>>>>>>> + struct 
amdgpu_reset_work_struct *work) >>>>>>>> +{ >>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>> + list_del_init(&work->node); >>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>> +} >>>>>>>> + >>>>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>>>> amdgpu_reset_domain *domain) >>>>>>>> +{ >>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>> + >>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>> &domain->pending_works, node) { >>>>>>>> + >>>>>>>> + list_del_init(&entry->node); >>>>>>>> + >>>>>>>> + /* Stop any other related pending resets */ >>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>> + } >>>>>>>> + >>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>> } >>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>> #define AMDGPU_VIRT_H >>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>> +#include "amdgpu_reset.h" >>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS is >>>>>>>> sr-iov ready */ >>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov is >>>>>>>> enabled on this GPU */ >>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>> uint32_t reg_val_offs; >>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>> - struct work_struct flr_work; >>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>> index b81acf59870c..f3d1c2be9292 100644 
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>> static void xgpu_ai_mailbox_flr_work(struct work_struct *work) >>>>>>>> { >>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>> amdgpu_virt, flr_work); >>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>> amdgpu_device, virt); >>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >>>>>>>> amdgpu_device *adev) >>>>>>>> return r; >>>>>>>> } >>>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); >>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>> return 0; >>>>>>>> } >>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>>>>>> amdgpu_device *adev) >>>>>>>> { >>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>> + >>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>> &adev->virt.flr_work); >>>>>>>> } >>>>>>>> static int xgpu_ai_request_init_data(struct amdgpu_device >>>>>>>> *adev) >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>> static void xgpu_nv_mailbox_flr_work(struct work_struct *work) >>>>>>>> { >>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>> amdgpu_virt, flr_work); >>>>>>>> + 
struct amdgpu_virt *virt = container_of(work, struct >>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>> amdgpu_device, virt); >>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>>>>>>> amdgpu_device *adev) >>>>>>>> return r; >>>>>>>> } >>>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); >>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>> return 0; >>>>>>>> } >>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>>>>>>> amdgpu_device *adev) >>>>>>>> { >>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>> + >>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>> &adev->virt.flr_work); >>>>>>>> } >>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>> static void xgpu_vi_mailbox_flr_work(struct work_struct *work) >>>>>>>> { >>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>> amdgpu_virt, flr_work); >>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>> amdgpu_device, virt); >>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>>>>> amdgpu_device *adev) >>>>>>>> return r; >>>>>>>> } >>>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work); 
>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>> return 0; >>>>>>>> } >>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>>>>>>> amdgpu_device *adev) >>>>>>>> { >>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>> + >>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>> &adev->virt.flr_work); >>>>>>>> } >>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>> >>>>> >>> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-05 21:47 ` Andrey Grodzovsky @ 2022-05-06 5:41 ` Luben Tuikov 2022-05-06 6:02 ` Lazar, Lijo 1 sibling, 0 replies; 40+ messages in thread From: Luben Tuikov @ 2022-05-06 5:41 UTC (permalink / raw) To: Andrey Grodzovsky, Felix Kuehling, Christian König, amd-gfx Cc: Bai Zoy, lijo.lazar On 2022-05-05 17:47, Andrey Grodzovsky wrote: > > On 2022-05-05 15:49, Felix Kuehling wrote: >> >> Am 2022-05-05 um 14:57 schrieb Andrey Grodzovsky: >>> >>> On 2022-05-05 11:06, Christian König wrote: >>>> Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky: >>>>> >>>>> On 2022-05-05 09:23, Christian König wrote: >>>>>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky: >>>>>>> On 2022-05-05 06:09, Christian König wrote: >>>>>>> >>>>>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky: >>>>>>>>> Problem: >>>>>>>>> During hive reset caused by command timing out on a ring >>>>>>>>> extra resets are generated, triggered by KFD which is >>>>>>>>> unable to access registers on the resetting ASIC. >>>>>>>>> >>>>>>>>> Fix: Rework GPU reset to use a list of pending reset jobs >>>>>>>>> such that the first reset job that actually resets the entire >>>>>>>>> reset domain will cancel all those pending redundant resets. >>>>>>>>> >>>>>>>>> This is in line with what we already do for redundant TDRs >>>>>>>>> in scheduler code. >>>>>>>> >>>>>>>> Mhm, why exactly do you need the extra linked list then? >>>>>>>> >>>>>>>> Let's talk about that on our call today. >>>>>>> >>>>>>> >>>>>>> Going to miss it as you know, and also this is the place to >>>>>>> discuss technical questions anyway so - >>>>>> >>>>>> Good point. >>>>>> >>>>>>> It's needed because those other resets are not time out handlers >>>>>>> that are governed by the scheduler >>>>>>> but rather external resets that are triggered by such clients as >>>>>>> KFD, RAS and sysfs. 
Scheduler has no >>>>>>> knowledge of them (and should not have) but they are serialized >>>>>>> into same wq as the TO handlers >>>>>>> from the scheduler. It just happens that TO triggered reset >>>>>>> causes in turn another reset (from KFD in >>>>>>> this case) and we want to prevent this second reset from taking >>>>>>> place just as we want to avoid multiple >>>>>>> TO resets to take place in scheduler code. >>>>>> >>>>>> Yeah, but why do you need multiple workers? >>>>>> >>>>>> You have a single worker for the GPU reset not triggered by the >>>>>> scheduler in you adev and cancel that at the end of the reset >>>>>> procedure. >>>>>> >>>>>> If anybody things it needs to trigger another reset while in reset >>>>>> (which is actually a small design bug separately) the reset will >>>>>> just be canceled in the same way we cancel the scheduler resets. >>>>>> >>>>>> Christian. >>>>> >>>>> >>>>> Had this in mind at first but then I realized that each client >>>>> (RAS, KFD and sysfs) will want to fill his own data for the work >>>>> (see amdgpu_device_gpu_recover) - for XGMI hive each will want to >>>>> set his own adev (which is fine if you set a work per adev as you >>>>> suggest) but also each client might want (they all put NULL there >>>>> but in theory in the future) also set his own bad job value and >>>>> here you might have a collision. >>>> >>>> Yeah, but that is intentional. See when we have a job that needs to >>>> be consumed by the reset handler and not overwritten or something. >>> >>> >>> I am not sure why this is a requirement, multiple clients can decide >>> concurrently to trigger a reset for some reason (possibly independent >>> reasons) hence they cannot share same work struct to pass to it their >>> data. >> >> If those concurrent clients could detect that a reset was already in >> progress, you wouldn't need the complexity of multiple work structs >> being scheduled. You could simply return without triggering another >> reset. 
> > > In my view main problem here with single work struct either at reset > domain level or even adev level is that in some cases we optimize resets > and don't really perform ASIC HW reset (see amdgpu_job_timedout with > soft recovery and skip_hw_reset in amdgpu_device_gpu_recover_imp for the > case the bad job does get signaled just before we start HW reset and we > just skip this). You can see that if many different reset sources share > same work struct what can happen is that the first to obtain the lock > you describe bellow might opt out from full HW reset because his bad job The problem is this "opting out" of reset--meaning that the client didn't do the complete recovery work, and were too quick to call for a GPU reset. So they need to fix this locally in their code. Generally when a GPU reset is scheduled, it should proceed regardless, and there shouldn't be any opting out. > did signal for example or because his hunged IP block was able to > recover through SW reset but in the meantime another reset source who > needed an actual HW reset just silently returned and we end up with > unhandled reset request. True that today this happens only to job > timeout reset sources that are handled form within the scheduler and > won't use this single work struct but no one prevents a future case for > this to happen and also, if we actually want to unify scheduler time out > handlers within reset domain (which seems to me the right design > approach) we won't be able to use just one work struct for this reason > anyway. Felix idea is good, and here is another idea which I've implemented in open source when having to handle infinite number of wire events with finite number of type (5 types of event), in a finite memory. Instead of allocating a myriad of reset events when each client wants a GPU reset, you'd have only one instance of a "reset" event, say an instance of a linked list entry, call it R. Initially R is list_init(), so it points to itself. 
Anytime someone wants to do a reset, they do a list_move(R, event work list); (or list_move_tail() if you want the events chronologically ordered) and wake up a thread. If R is already on the list, meaning another client has already scheduled the reset, then no change occurs. (You could check if R is non-empty, and that check would tell you if R is scheduled to be processed, or you could just do the list_move() and up(processing thread), always, to mitigate races). Note that many clients can call list_move(R, event work list), and the result is the same--a reset is pending to occur, regardless if it had already occurred, or if it hadn't occurred and is about to. Then when the processing thread wakes up, it starts consuming "events" from that linked list, and removing them from it, and processing them. Obviously, access to the linked list needs to be guarded by a lock. If you have a set of different type of event, say "reset", "start", "stop", "load", etc., you can create an array of linked list event structs and index them by a macro whose value is the index into the array, and whose name is the type of the event. Seeing that it's just a reset event, you could just have a single instance of a linked list. Anyway, just an idea of an old old implementation of mine. > > Andrey > > >> >> I'd put the reset work struct into the reset_domain struct. That way >> you'd have exactly one worker for the reset domain. You could >> implement a lock-less scheme to decide whether you need to schedule a >> reset, e.g. using an atomic counter in the shared work struct that >> gets incremented when a client wants to trigger a reset >> (atomic_add_return). If that counter is exactly 1 after incrementing, >> you need to fill in the rest of the work struct and schedule the work. >> Otherwise, it's already scheduled (or another client is in the process >> of scheduling it) and you just return. 
When the worker finishes (after >> confirming a successful reset), it resets the counter to 0, so the >> next client requesting a reset will schedule the worker again. >> >> Regards, >> Felix >> >> >>> >>> >>>> >>>> >>>> Additional to that keep in mind that you can't allocate any memory >>>> before or during the GPU reset nor wait for the reset to complete >>>> (so you can't allocate anything on the stack either). >>> >>> >>> There is no dynamic allocation here, regarding stack allocations - we >>> do it all the time when we call functions, even during GPU resets, >>> how on stack allocation of work struct in amdgpu_device_gpu_recover >>> is different from any other local variable we allocate in any >>> function we call ? >>> >>> I am also not sure why it's not allowed to wait for reset to complete >>> ? Also, see in amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - >>> the caller expects the reset to complete before he returns. I can >>> probably work around it in RAS code by calling >>> atomic_set(&ras->in_recovery, 0) from some callback within actual >>> reset function but regarding sysfs it actually expects a result >>> returned indicating whether the call was successful or not. >>> >>> Andrey >>> >>> >>>> >>>> I don't think that concept you try here will work. >>>> >>>> Regards, >>>> Christian. >>>> >>>>> Also in general seems to me it's cleaner approach where this logic >>>>> (the work items) are held and handled in reset_domain and are not >>>>> split in each adev or any other entity. We might want in the future >>>>> to even move the scheduler handling into reset domain since reset >>>>> domain is supposed to be a generic things and not only or AMD. >>>>> >>>>> Andrey >>>>> >>>>> >>>>>> >>>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>> Regards, >>>>>>>> Christian. 
>>>>>>>> >>>>>>>>> >>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>> --- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>> +++++++++++++++++++++- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>> >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { >>>>>>>>> bool grbm_indexed; >>>>>>>>> }; >>>>>>>>> -enum amd_reset_method { >>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>> -}; >>>>>>>>> - >>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>> u32 codec_type; >>>>>>>>> u32 max_width; >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>> +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct >>>>>>>>> amdgpu_device *adev, >>>>>>>>> } >>>>>>>>> tmp_vram_lost_counter = >>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>> + >>>>>>>>> + /* Drop all pending resets since we will reset now anyway */ >>>>>>>>> + tmp_adev = list_first_entry(device_list_handle, struct >>>>>>>>> amdgpu_device, >>>>>>>>> + reset_list); >>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>> + >>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>> /* Host driver will handle XGMI hive reset for SRIOV */ >>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct >>>>>>>>> amdgpu_device *adev, >>>>>>>>> } >>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>> - struct work_struct base; >>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>> struct amdgpu_device *adev; >>>>>>>>> struct amdgpu_job *job; >>>>>>>>> int ret; >>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>>>>>> static void amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>> work_struct *work) >>>>>>>>> { >>>>>>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base); >>>>>>>>> + struct amdgpu_recover_work_struct *recover_work = >>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>>>> base.base.work); >>>>>>>>> recover_work->ret = >>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>> recover_work->job); >>>>>>>>> } >>>>>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct >>>>>>>>> amdgpu_device *adev, >>>>>>>>> { >>>>>>>>> struct amdgpu_recover_work_struct work = {.adev = adev, >>>>>>>>> .job = job}; >>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>> amdgpu_device_queue_gpu_recover_work); 
>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>> if (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>> &work.base)) >>>>>>>>> return -EAGAIN; >>>>>>>>> - flush_work(&work.base); >>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>> + >>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>> &work.base); >>>>>>>>> return work.ret; >>>>>>>>> } >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>> + >>>>>>>>> return reset_domain; >>>>>>>>> } >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>> -#include "amdgpu.h" >>>>>>>>> + >>>>>>>>> +#include <linux/atomic.h> >>>>>>>>> +#include <linux/mutex.h> >>>>>>>>> +#include <linux/list.h> >>>>>>>>> +#include <linux/kref.h> >>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>> + >>>>>>>>> +struct amdgpu_device; >>>>>>>>> +struct amdgpu_job; >>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>> + >>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>> }; >>>>>>>>> + >>>>>>>>> +enum 
amd_reset_method { >>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>> +}; >>>>>>>>> + >>>>>>>>> struct amdgpu_reset_context { >>>>>>>>> enum amd_reset_method method; >>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>> unsigned long flags; >>>>>>>>> }; >>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>> + >>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>> struct list_head handler_list; >>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>> XGMI_HIVE >>>>>>>>> }; >>>>>>>>> + >>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>> + struct delayed_work base; >>>>>>>>> + struct list_head node; >>>>>>>>> +}; >>>>>>>>> + >>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>> struct kref refcount; >>>>>>>>> struct workqueue_struct *wq; >>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>> struct rw_semaphore sem; >>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>> + >>>>>>>>> + struct list_head pending_works; >>>>>>>>> + struct mutex reset_lock; >>>>>>>>> }; >>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom >>>>>>>>> } >>>>>>>>> static inline bool amdgpu_reset_domain_schedule(struct >>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>> - struct work_struct *work) >>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>> { >>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>> + >>>>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>> + return false; >>>>>>>>> + } >>>>>>>>> + >>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>> + 
mutex_unlock(&domain->reset_lock); >>>>>>>>> + >>>>>>>>> + return true; >>>>>>>>> +} >>>>>>>>> + >>>>>>>>> +static inline void >>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>> +{ >>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>> + list_del_init(&work->node); >>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>> +} >>>>>>>>> + >>>>>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>> +{ >>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>> + >>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>> &domain->pending_works, node) { >>>>>>>>> + >>>>>>>>> + list_del_init(&entry->node); >>>>>>>>> + >>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>> + } >>>>>>>>> + >>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>> } >>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS is >>>>>>>>> sr-iov ready */ >>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov is >>>>>>>>> enabled on this GPU */ >>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>> uint32_t reg_val_offs; >>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>> - struct work_struct flr_work; >>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>> 
struct amdgpu_mm_table mm_table; >>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct work_struct *work) >>>>>>>>> { >>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>>> amdgpu_device, virt); >>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >>>>>>>>> amdgpu_device *adev) >>>>>>>>> return r; >>>>>>>>> } >>>>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); >>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>> return 0; >>>>>>>>> } >>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>>>>>>> amdgpu_device *adev) >>>>>>>>> { >>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>> + >>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>> &adev->virt.flr_work); >>>>>>>>> } >>>>>>>>> static int xgpu_ai_request_init_data(struct amdgpu_device >>>>>>>>> *adev) >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>> +++ 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct work_struct *work) >>>>>>>>> { >>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>>> amdgpu_device, virt); >>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>>>>>>>> amdgpu_device *adev) >>>>>>>>> return r; >>>>>>>>> } >>>>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); >>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>> return 0; >>>>>>>>> } >>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>>>>>>>> amdgpu_device *adev) >>>>>>>>> { >>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>> + >>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>> &adev->virt.flr_work); >>>>>>>>> } >>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct work_struct *work) >>>>>>>>> { >>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>> + struct amdgpu_virt *virt = container_of(work, 
struct >>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>>> amdgpu_device, virt); >>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>>>>>> amdgpu_device *adev) >>>>>>>>> return r; >>>>>>>>> } >>>>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work); >>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>> return 0; >>>>>>>>> } >>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>>>>>>>> amdgpu_device *adev) >>>>>>>>> { >>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>> + >>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>> &adev->virt.flr_work); >>>>>>>>> } >>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>> >>>>>> >>>> Regards, -- Luben ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-05 21:47 ` Andrey Grodzovsky 2022-05-06 5:41 ` Luben Tuikov @ 2022-05-06 6:02 ` Lazar, Lijo 2022-05-06 8:56 ` Christian König 1 sibling, 1 reply; 40+ messages in thread From: Lazar, Lijo @ 2022-05-06 6:02 UTC (permalink / raw) To: Andrey Grodzovsky, Felix Kuehling, Christian König, amd-gfx; +Cc: Bai Zoy On 5/6/2022 3:17 AM, Andrey Grodzovsky wrote: > > On 2022-05-05 15:49, Felix Kuehling wrote: >> >> Am 2022-05-05 um 14:57 schrieb Andrey Grodzovsky: >>> >>> On 2022-05-05 11:06, Christian König wrote: >>>> Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky: >>>>> >>>>> On 2022-05-05 09:23, Christian König wrote: >>>>>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky: >>>>>>> On 2022-05-05 06:09, Christian König wrote: >>>>>>> >>>>>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky: >>>>>>>>> Problem: >>>>>>>>> During hive reset caused by command timing out on a ring >>>>>>>>> extra resets are generated by triggered by KFD which is >>>>>>>>> unable to accesses registers on the resetting ASIC. >>>>>>>>> >>>>>>>>> Fix: Rework GPU reset to use a list of pending reset jobs >>>>>>>>> such that the first reset jobs that actaully resets the entire >>>>>>>>> reset domain will cancel all those pending redundant resets. >>>>>>>>> >>>>>>>>> This is in line with what we already do for redundant TDRs >>>>>>>>> in scheduler code. >>>>>>>> >>>>>>>> Mhm, why exactly do you need the extra linked list then? >>>>>>>> >>>>>>>> Let's talk about that on our call today. >>>>>>> >>>>>>> >>>>>>> Going to miss it as you know, and also this is the place to >>>>>>> discuss technical questions anyway so - >>>>>> >>>>>> Good point. >>>>>> >>>>>>> It's needed because those other resets are not time out handlers >>>>>>> that are governed by the scheduler >>>>>>> but rather external resets that are triggered by such clients as >>>>>>> KFD, RAS and sysfs. 
Scheduler has no >>>>>>> knowledge of them (and should not have) but they are serialized >>>>>>> into same wq as the TO handlers >>>>>>> from the scheduler. It just happens that TO triggered reset >>>>>>> causes in turn another reset (from KFD in >>>>>>> this case) and we want to prevent this second reset from taking >>>>>>> place just as we want to avoid multiple >>>>>>> TO resets to take place in scheduler code. >>>>>> >>>>>> Yeah, but why do you need multiple workers? >>>>>> >>>>>> You have a single worker for the GPU reset not triggered by the >>>>>> scheduler in you adev and cancel that at the end of the reset >>>>>> procedure. >>>>>> >>>>>> If anybody things it needs to trigger another reset while in reset >>>>>> (which is actually a small design bug separately) the reset will >>>>>> just be canceled in the same way we cancel the scheduler resets. >>>>>> >>>>>> Christian. >>>>> >>>>> >>>>> Had this in mind at first but then I realized that each client >>>>> (RAS, KFD and sysfs) will want to fill his own data for the work >>>>> (see amdgpu_device_gpu_recover) - for XGMI hive each will want to >>>>> set his own adev (which is fine if you set a work per adev as you >>>>> suggest) but also each client might want (they all put NULL there >>>>> but in theory in the future) also set his own bad job value and >>>>> here you might have a collision. >>>> >>>> Yeah, but that is intentional. See when we have a job that needs to >>>> be consumed by the reset handler and not overwritten or something. >>> >>> >>> I am not sure why this is a requirement, multiple clients can decide >>> concurrently to trigger a reset for some reason (possibly independent >>> reasons) hence they cannot share same work struct to pass to it their >>> data. >> >> If those concurrent clients could detect that a reset was already in >> progress, you wouldn't need the complexity of multiple work structs >> being scheduled. You could simply return without triggering another >> reset. 
> > In my view main problem here with single work struct either at reset > domain level or even adev level is that in some cases we optimize resets > and don't really perform ASIC HW reset (see amdgpu_job_timedout with > soft recovery and skip_hw_reset in amdgpu_device_gpu_recover_imp for the > case the bad job does get signaled just before we start HW reset and we > just skip this). You can see that if many different reset sources share > same work struct what can happen is that the first to obtain the lock > you describe below might opt out from full HW reset because his bad job > did signal for example or because his hung IP block was able to > recover through SW reset but in the meantime another reset source who > needed an actual HW reset just silently returned and we end up with > unhandled reset request. True that today this happens only to job > timeout reset sources that are handled from within the scheduler and > won't use this single work struct but no one prevents a future case for > this to happen and also, if we actually want to unify scheduler time out > handlers within reset domain (which seems to me the right design > approach) we won't be able to use just one work struct for this reason > anyway. > Just to add to this point - a reset domain is a co-operative domain. In addition to sharing a set of clients from various reset sources for one device, it also will have a set of devices like in XGMI hive. The job timeout on one device may not eventually result in a reset, but a RAS error happening on another device at the same time would need a reset. The second device's RAS error cannot return seeing that a reset work already started, or ignore the reset work given that another device has filled the reset data. When there is a reset domain, it should take care of the work scheduled and keeping it in device or any other level doesn't sound good. Thanks, Lijo > Andrey > > >> >> I'd put the reset work struct into the reset_domain struct. 
That way >> you'd have exactly one worker for the reset domain. You could >> implement a lock-less scheme to decide whether you need to schedule a >> reset, e.g. using an atomic counter in the shared work struct that >> gets incremented when a client wants to trigger a reset >> (atomic_add_return). If that counter is exactly 1 after incrementing, >> you need to fill in the rest of the work struct and schedule the work. >> Otherwise, it's already scheduled (or another client is in the process >> of scheduling it) and you just return. When the worker finishes (after >> confirming a successful reset), it resets the counter to 0, so the >> next client requesting a reset will schedule the worker again. >> >> Regards, >> Felix >> >> >>> >>> >>>> >>>> >>>> Additional to that keep in mind that you can't allocate any memory >>>> before or during the GPU reset nor wait for the reset to complete >>>> (so you can't allocate anything on the stack either). >>> >>> >>> There is no dynamic allocation here, regarding stack allocations - we >>> do it all the time when we call functions, even during GPU resets, >>> how on stack allocation of work struct in amdgpu_device_gpu_recover >>> is different from any other local variable we allocate in any >>> function we call ? >>> >>> I am also not sure why it's not allowed to wait for reset to complete >>> ? Also, see in amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - >>> the caller expects the reset to complete before he returns. I can >>> probably work around it in RAS code by calling >>> atomic_set(&ras->in_recovery, 0) from some callback within actual >>> reset function but regarding sysfs it actually expects a result >>> returned indicating whether the call was successful or not. >>> >>> Andrey >>> >>> >>>> >>>> I don't think that concept you try here will work. >>>> >>>> Regards, >>>> Christian. 
>>>> >>>>> Also in general seems to me it's cleaner approach where this logic >>>>> (the work items) are held and handled in reset_domain and are not >>>>> split in each adev or any other entity. We might want in the future >>>>> to even move the scheduler handling into reset domain since reset >>>>> domain is supposed to be a generic things and not only or AMD. >>>>> >>>>> Andrey >>>>> >>>>> >>>>>> >>>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>> Regards, >>>>>>>> Christian. >>>>>>>> >>>>>>>>> >>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>> --- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>> +++++++++++++++++++++- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>> >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { >>>>>>>>> bool grbm_indexed; >>>>>>>>> }; >>>>>>>>> -enum amd_reset_method { >>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>> - 
AMD_RESET_METHOD_MODE2, >>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>> -}; >>>>>>>>> - >>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>> u32 codec_type; >>>>>>>>> u32 max_width; >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>> @@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct >>>>>>>>> amdgpu_device *adev, >>>>>>>>> } >>>>>>>>> tmp_vram_lost_counter = >>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>> + >>>>>>>>> + /* Drop all pending resets since we will reset now anyway */ >>>>>>>>> + tmp_adev = list_first_entry(device_list_handle, struct >>>>>>>>> amdgpu_device, >>>>>>>>> + reset_list); >>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>> + >>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>> /* Host driver will handle XGMI hive reset for SRIOV */ >>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>> @@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct >>>>>>>>> amdgpu_device *adev, >>>>>>>>> } >>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>> - struct work_struct base; >>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>> struct amdgpu_device *adev; >>>>>>>>> struct amdgpu_job *job; >>>>>>>>> int ret; >>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>>>>>> static void amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>> work_struct *work) >>>>>>>>> { >>>>>>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base); >>>>>>>>> + struct amdgpu_recover_work_struct *recover_work = >>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>>>> base.base.work); >>>>>>>>> recover_work->ret = >>>>>>>>> 
amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>> recover_work->job); >>>>>>>>> } >>>>>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct >>>>>>>>> amdgpu_device *adev, >>>>>>>>> { >>>>>>>>> struct amdgpu_recover_work_struct work = {.adev = adev, >>>>>>>>> .job = job}; >>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>> if (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>> &work.base)) >>>>>>>>> return -EAGAIN; >>>>>>>>> - flush_work(&work.base); >>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>> + >>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>> &work.base); >>>>>>>>> return work.ret; >>>>>>>>> } >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>> + >>>>>>>>> return reset_domain; >>>>>>>>> } >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>> -#include "amdgpu.h" >>>>>>>>> + >>>>>>>>> +#include <linux/atomic.h> 
>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>> +#include <linux/list.h> >>>>>>>>> +#include <linux/kref.h> >>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>> + >>>>>>>>> +struct amdgpu_device; >>>>>>>>> +struct amdgpu_job; >>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>> + >>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>> }; >>>>>>>>> + >>>>>>>>> +enum amd_reset_method { >>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>> +}; >>>>>>>>> + >>>>>>>>> struct amdgpu_reset_context { >>>>>>>>> enum amd_reset_method method; >>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>> unsigned long flags; >>>>>>>>> }; >>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>> + >>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>> struct list_head handler_list; >>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>> XGMI_HIVE >>>>>>>>> }; >>>>>>>>> + >>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>> + struct delayed_work base; >>>>>>>>> + struct list_head node; >>>>>>>>> +}; >>>>>>>>> + >>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>> struct kref refcount; >>>>>>>>> struct workqueue_struct *wq; >>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>> struct rw_semaphore sem; >>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>> + >>>>>>>>> + struct list_head pending_works; >>>>>>>>> + struct mutex reset_lock; >>>>>>>>> }; >>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom >>>>>>>>> } >>>>>>>>> static inline bool amdgpu_reset_domain_schedule(struct >>>>>>>>> amdgpu_reset_domain 
*domain, >>>>>>>>> - struct work_struct *work) >>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>> { >>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>> + >>>>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>> + return false; >>>>>>>>> + } >>>>>>>>> + >>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>> + >>>>>>>>> + return true; >>>>>>>>> +} >>>>>>>>> + >>>>>>>>> +static inline void >>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>> +{ >>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>> + list_del_init(&work->node); >>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>> +} >>>>>>>>> + >>>>>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>> +{ >>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>> + >>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>> &domain->pending_works, node) { >>>>>>>>> + >>>>>>>>> + list_del_init(&entry->node); >>>>>>>>> + >>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>> + } >>>>>>>>> + >>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>> } >>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>> 
#define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS is >>>>>>>>> sr-iov ready */ >>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov is >>>>>>>>> enabled on this GPU */ >>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>> uint32_t reg_val_offs; >>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>> - struct work_struct flr_work; >>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct work_struct *work) >>>>>>>>> { >>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>>> amdgpu_device, virt); >>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >>>>>>>>> amdgpu_device *adev) >>>>>>>>> return r; >>>>>>>>> } >>>>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); >>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>> return 0; >>>>>>>>> } >>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>>>>>>> amdgpu_device *adev) >>>>>>>>> { >>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>> amdgpu_irq_put(adev, 
&adev->virt.rcv_irq, 0); >>>>>>>>> + >>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>> &adev->virt.flr_work); >>>>>>>>> } >>>>>>>>> static int xgpu_ai_request_init_data(struct amdgpu_device >>>>>>>>> *adev) >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct work_struct *work) >>>>>>>>> { >>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>>> amdgpu_device, virt); >>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>>>>>>>> amdgpu_device *adev) >>>>>>>>> return r; >>>>>>>>> } >>>>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); >>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>> return 0; >>>>>>>>> } >>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>>>>>>>> amdgpu_device *adev) >>>>>>>>> { >>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>> + >>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>> &adev->virt.flr_work); >>>>>>>>> } >>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 
100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct work_struct *work) >>>>>>>>> { >>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>>> amdgpu_device, virt); >>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>>>>>> amdgpu_device *adev) >>>>>>>>> return r; >>>>>>>>> } >>>>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work); >>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>> return 0; >>>>>>>>> } >>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>>>>>>>> amdgpu_device *adev) >>>>>>>>> { >>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>> + >>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>> &adev->virt.flr_work); >>>>>>>>> } >>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>> >>>>>> >>>> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-06 6:02 ` Lazar, Lijo @ 2022-05-06 8:56 ` Christian König 2022-05-10 16:00 ` Andrey Grodzovsky 0 siblings, 1 reply; 40+ messages in thread From: Christian König @ 2022-05-06 8:56 UTC (permalink / raw) To: Lazar, Lijo, Andrey Grodzovsky, Felix Kuehling, Christian König, amd-gfx Cc: Bai Zoy Am 06.05.22 um 08:02 schrieb Lazar, Lijo: > On 5/6/2022 3:17 AM, Andrey Grodzovsky wrote: >> >> On 2022-05-05 15:49, Felix Kuehling wrote: >>> >>> Am 2022-05-05 um 14:57 schrieb Andrey Grodzovsky: >>>> >>>> On 2022-05-05 11:06, Christian König wrote: >>>>> Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky: >>>>>> >>>>>> On 2022-05-05 09:23, Christian König wrote: >>>>>>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky: >>>>>>>> On 2022-05-05 06:09, Christian König wrote: >>>>>>>> >>>>>>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky: >>>>>>>>>> Problem: >>>>>>>>>> During hive reset caused by command timing out on a ring >>>>>>>>>> extra resets are generated by triggered by KFD which is >>>>>>>>>> unable to accesses registers on the resetting ASIC. >>>>>>>>>> >>>>>>>>>> Fix: Rework GPU reset to use a list of pending reset jobs >>>>>>>>>> such that the first reset jobs that actaully resets the entire >>>>>>>>>> reset domain will cancel all those pending redundant resets. >>>>>>>>>> >>>>>>>>>> This is in line with what we already do for redundant TDRs >>>>>>>>>> in scheduler code. >>>>>>>>> >>>>>>>>> Mhm, why exactly do you need the extra linked list then? >>>>>>>>> >>>>>>>>> Let's talk about that on our call today. >>>>>>>> >>>>>>>> >>>>>>>> Going to miss it as you know, and also this is the place to >>>>>>>> discuss technical questions anyway so - >>>>>>> >>>>>>> Good point. >>>>>>> >>>>>>>> It's needed because those other resets are not time out >>>>>>>> handlers that are governed by the scheduler >>>>>>>> but rather external resets that are triggered by such clients >>>>>>>> as KFD, RAS and sysfs. 
Scheduler has no >>>>>>>> knowledge of them (and should not have) but they are serialized >>>>>>>> into same wq as the TO handlers >>>>>>>> from the scheduler. It just happens that TO triggered reset >>>>>>>> causes in turn another reset (from KFD in >>>>>>>> this case) and we want to prevent this second reset from taking >>>>>>>> place just as we want to avoid multiple >>>>>>>> TO resets to take place in scheduler code. >>>>>>> >>>>>>> Yeah, but why do you need multiple workers? >>>>>>> >>>>>>> You have a single worker for the GPU reset not triggered by the >>>>>>> scheduler in you adev and cancel that at the end of the reset >>>>>>> procedure. >>>>>>> >>>>>>> If anybody things it needs to trigger another reset while in >>>>>>> reset (which is actually a small design bug separately) the >>>>>>> reset will just be canceled in the same way we cancel the >>>>>>> scheduler resets. >>>>>>> >>>>>>> Christian. >>>>>> >>>>>> >>>>>> Had this in mind at first but then I realized that each client >>>>>> (RAS, KFD and sysfs) will want to fill his own data for the work >>>>>> (see amdgpu_device_gpu_recover) - for XGMI hive each will want to >>>>>> set his own adev (which is fine if you set a work per adev as you >>>>>> suggest) but also each client might want (they all put NULL there >>>>>> but in theory in the future) also set his own bad job value and >>>>>> here you might have a collision. >>>>> >>>>> Yeah, but that is intentional. See when we have a job that needs >>>>> to be consumed by the reset handler and not overwritten or something. >>>> >>>> >>>> I am not sure why this is a requirement, multiple clients can >>>> decide concurrently to trigger a reset for some reason (possibly >>>> independent reasons) hence they cannot share same work struct to >>>> pass to it their data. >>> >>> If those concurrent clients could detect that a reset was already in >>> progress, you wouldn't need the complexity of multiple work structs >>> being scheduled. 
You could simply return without triggering another >>> reset. >> >> In my view main problem here with single work struct either at reset >> domain level or even adev level is that in some cases we optimize >> resets and don't really perform ASIC HW reset (see >> amdgpu_job_timedout with soft recovery and skip_hw_reset in >> amdgpu_device_gpu_recover_imp for the case the bad job does get >> signaled just before we start HW reset and we just skip this). That's one of the reasons why we should have multiple work items for job based reset and other reset sources. See the whole idea is the following: 1. We have one single queued work queue for each reset domain which makes sure that all reset requests execute in order. 2. We have one delayed work item for each scheduler which fires when a timeout on a scheduler occurs and eventually calls the reset procedure with the last running job. 3. We have one work item for each necessary hard reset. The delayed work item from the scheduler first tries a soft recovery and checks if a hard reset is really necessary. If it's not necessary and we can cancel the offending job we skip the hard reset. The hard reset work item doesn't do any of those checks and just does a reset no matter what. When we really do a reset, independent of whether it's triggered by a job or other source we cancel all sources at the end of the reset procedure. This makes sure that a) We only do one reset even when multiple sources fire at the same time and b) when any source bails out and only does a soft recovery we do a full reset anyway when necessary. That design was outlined multiple times now on the mailing list and looks totally clear to me. We should probably document that somewhere. Regards, Christian. 
>> You can see that if many different reset sources share same work >> struct what can happen is that the first to obtain the lock you >> describe bellow might opt out from full HW reset because his bad job >> did signal for example or because his hunged IP block was able to >> recover through SW reset but in the meantime another reset source who >> needed an actual HW reset just silently returned and we end up with >> unhandled reset request. True that today this happens only to job >> timeout reset sources that are handled form within the scheduler and >> won't use this single work struct but no one prevents a future case >> for this to happen and also, if we actually want to unify scheduler >> time out handlers within reset domain (which seems to me the right >> design approach) we won't be able to use just one work struct for >> this reason anyway. >> > > Just to add to this point - a reset domain is co-operative domain. In > addition to sharing a set of clients from various reset sources for > one device, it also will have a set of devices like in XGMI hive. The > job timeout on one device may not eventually result in result, but a > RAS error happening on another device at the same time would need a > reset. The second device's RAS error cannot return seeing that a > reset work already started, or ignore the reset work given that > another device has filled the reset data. > > When there is a reset domain, it should take care of the work > scheduled and keeping it in device or any other level doesn't sound good. > > Thanks, > Lijo > >> Andrey >> >> >>> >>> I'd put the reset work struct into the reset_domain struct. That way >>> you'd have exactly one worker for the reset domain. You could >>> implement a lock-less scheme to decide whether you need to schedule >>> a reset, e.g. using an atomic counter in the shared work struct that >>> gets incremented when a client wants to trigger a reset >>> (atomic_add_return). 
If that counter is exactly 1 after >>> incrementing, you need to fill in the rest of the work struct and >>> schedule the work. Otherwise, it's already scheduled (or another >>> client is in the process of scheduling it) and you just return. When >>> the worker finishes (after confirming a successful reset), it resets >>> the counter to 0, so the next client requesting a reset will >>> schedule the worker again. >>> >>> Regards, >>> Felix >>> >>> >>>> >>>> >>>>> >>>>> >>>>> Additional to that keep in mind that you can't allocate any memory >>>>> before or during the GPU reset nor wait for the reset to complete >>>>> (so you can't allocate anything on the stack either). >>>> >>>> >>>> There is no dynamic allocation here, regarding stack allocations - >>>> we do it all the time when we call functions, even during GPU >>>> resets, how on stack allocation of work struct in >>>> amdgpu_device_gpu_recover is different from any other local >>>> variable we allocate in any function we call ? >>>> >>>> I am also not sure why it's not allowed to wait for reset to >>>> complete ? Also, see in amdgpu_ras_do_recovery and gpu_recover_get >>>> (debugfs) - the caller expects the reset to complete before he >>>> returns. I can probably work around it in RAS code by calling >>>> atomic_set(&ras->in_recovery, 0) from some callback within actual >>>> reset function but regarding sysfs it actually expects a result >>>> returned indicating whether the call was successful or not. >>>> >>>> Andrey >>>> >>>> >>>>> >>>>> I don't think that concept you try here will work. >>>>> >>>>> Regards, >>>>> Christian. >>>>> >>>>>> Also in general seems to me it's cleaner approach where this >>>>>> logic (the work items) are held and handled in reset_domain and >>>>>> are not split in each adev or any other entity. We might want in >>>>>> the future to even move the scheduler handling into reset domain >>>>>> since reset domain is supposed to be a generic things and not >>>>>> only or AMD. 
>>>>>> >>>>>> Andrey >>>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>> Andrey >>>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> Regards, >>>>>>>>> Christian. >>>>>>>>> >>>>>>>>>> >>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>> --- >>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>>> >>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { >>>>>>>>>> bool grbm_indexed; >>>>>>>>>> }; >>>>>>>>>> -enum amd_reset_method { >>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>> -}; >>>>>>>>>> - >>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>> u32 codec_type; >>>>>>>>>> u32 max_width; >>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>> 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>> } >>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>> + >>>>>>>>>> + /* Drop all pending resets since we will reset now >>>>>>>>>> anyway */ >>>>>>>>>> + tmp_adev = list_first_entry(device_list_handle, struct >>>>>>>>>> amdgpu_device, >>>>>>>>>> + reset_list); >>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>> + >>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>> /* Host driver will handle XGMI hive reset for SRIOV */ >>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>> } >>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>> - struct work_struct base; >>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>> int ret; >>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>>>>>>> static void amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>> work_struct *work) >>>>>>>>>> { >>>>>>>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base); >>>>>>>>>> + struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>>>>> base.base.work); >>>>>>>>>> recover_work->ret = >>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>> recover_work->job); >>>>>>>>>> } >>>>>>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct >>>>>>>>>> amdgpu_device *adev, >>>>>>>>>> { >>>>>>>>>> struct amdgpu_recover_work_struct work = 
{.adev = adev, >>>>>>>>>> .job = job}; >>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>> if (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>> &work.base)) >>>>>>>>>> return -EAGAIN; >>>>>>>>>> - flush_work(&work.base); >>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>> + >>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>> &work.base); >>>>>>>>>> return work.ret; >>>>>>>>>> } >>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>> + >>>>>>>>>> return reset_domain; >>>>>>>>>> } >>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>> + >>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>> +#include <linux/list.h> >>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>> + >>>>>>>>>> +struct 
amdgpu_device; >>>>>>>>>> +struct amdgpu_job; >>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>> + >>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>> }; >>>>>>>>>> + >>>>>>>>>> +enum amd_reset_method { >>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>> +}; >>>>>>>>>> + >>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>> enum amd_reset_method method; >>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>> unsigned long flags; >>>>>>>>>> }; >>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>> + >>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>> struct list_head handler_list; >>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>> XGMI_HIVE >>>>>>>>>> }; >>>>>>>>>> + >>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>> + struct delayed_work base; >>>>>>>>>> + struct list_head node; >>>>>>>>>> +}; >>>>>>>>>> + >>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>> struct kref refcount; >>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>> + >>>>>>>>>> + struct list_head pending_works; >>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>> }; >>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom >>>>>>>>>> } >>>>>>>>>> static inline bool amdgpu_reset_domain_schedule(struct >>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>> - struct work_struct *work) >>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>> { >>>>>>>>>> - return queue_work(domain->wq, work); 
>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>> + >>>>>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>> + return false; >>>>>>>>>> + } >>>>>>>>>> + >>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>> + >>>>>>>>>> + return true; >>>>>>>>>> +} >>>>>>>>>> + >>>>>>>>>> +static inline void >>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>> +{ >>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>> +} >>>>>>>>>> + >>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>> +{ >>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>> + >>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>> + >>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>> + >>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>> + } >>>>>>>>>> + >>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>> } >>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS >>>>>>>>>> is sr-iov ready */ >>>>>>>>>> #define 
AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov >>>>>>>>>> is enabled on this GPU */ >>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct work_struct >>>>>>>>>> *work) >>>>>>>>>> { >>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>>>> amdgpu_device, virt); >>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >>>>>>>>>> amdgpu_device *adev) >>>>>>>>>> return r; >>>>>>>>>> } >>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>> return 0; >>>>>>>>>> } >>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>>>>>>>> amdgpu_device *adev) >>>>>>>>>> { >>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>> + >>>>>>>>>> + 
amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>> } >>>>>>>>>> static int xgpu_ai_request_init_data(struct amdgpu_device >>>>>>>>>> *adev) >>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct work_struct >>>>>>>>>> *work) >>>>>>>>>> { >>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>>>> amdgpu_device, virt); >>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>>>>>>>>> amdgpu_device *adev) >>>>>>>>>> return r; >>>>>>>>>> } >>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>> return 0; >>>>>>>>>> } >>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>>>>>>>>> amdgpu_device *adev) >>>>>>>>>> { >>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>> + >>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>> } >>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>> index 
7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct work_struct >>>>>>>>>> *work) >>>>>>>>>> { >>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>>>> amdgpu_device, virt); >>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>>>>>>> amdgpu_device *adev) >>>>>>>>>> return r; >>>>>>>>>> } >>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>> return 0; >>>>>>>>>> } >>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>>>>>>>>> amdgpu_device *adev) >>>>>>>>>> { >>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>> + >>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>> } >>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>>> >>>>>>> >>>>> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-06 8:56 ` Christian König @ 2022-05-10 16:00 ` Andrey Grodzovsky 2022-05-10 16:17 ` Christian König 0 siblings, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-10 16:00 UTC (permalink / raw) To: Christian König, Lazar, Lijo, Felix Kuehling, Christian König, amd-gfx Cc: Bai Zoy On 2022-05-06 04:56, Christian König wrote: > Am 06.05.22 um 08:02 schrieb Lazar, Lijo: >> On 5/6/2022 3:17 AM, Andrey Grodzovsky wrote: >>> >>> On 2022-05-05 15:49, Felix Kuehling wrote: >>>> >>>> Am 2022-05-05 um 14:57 schrieb Andrey Grodzovsky: >>>>> >>>>> On 2022-05-05 11:06, Christian König wrote: >>>>>> Am 05.05.22 um 15:54 schrieb Andrey Grodzovsky: >>>>>>> >>>>>>> On 2022-05-05 09:23, Christian König wrote: >>>>>>>> Am 05.05.22 um 15:15 schrieb Andrey Grodzovsky: >>>>>>>>> On 2022-05-05 06:09, Christian König wrote: >>>>>>>>> >>>>>>>>>> Am 04.05.22 um 18:18 schrieb Andrey Grodzovsky: >>>>>>>>>>> Problem: >>>>>>>>>>> During hive reset caused by command timing out on a ring >>>>>>>>>>> extra resets are generated by triggered by KFD which is >>>>>>>>>>> unable to accesses registers on the resetting ASIC. >>>>>>>>>>> >>>>>>>>>>> Fix: Rework GPU reset to use a list of pending reset jobs >>>>>>>>>>> such that the first reset jobs that actaully resets the entire >>>>>>>>>>> reset domain will cancel all those pending redundant resets. >>>>>>>>>>> >>>>>>>>>>> This is in line with what we already do for redundant TDRs >>>>>>>>>>> in scheduler code. >>>>>>>>>> >>>>>>>>>> Mhm, why exactly do you need the extra linked list then? >>>>>>>>>> >>>>>>>>>> Let's talk about that on our call today. >>>>>>>>> >>>>>>>>> >>>>>>>>> Going to miss it as you know, and also this is the place to >>>>>>>>> discuss technical questions anyway so - >>>>>>>> >>>>>>>> Good point. 
>>>>>>>> >>>>>>>>> It's needed because those other resets are not time out >>>>>>>>> handlers that are governed by the scheduler >>>>>>>>> but rather external resets that are triggered by such clients >>>>>>>>> as KFD, RAS and sysfs. Scheduler has no >>>>>>>>> knowledge of them (and should not have) but they are >>>>>>>>> serialized into same wq as the TO handlers >>>>>>>>> from the scheduler. It just happens that TO triggered reset >>>>>>>>> causes in turn another reset (from KFD in >>>>>>>>> this case) and we want to prevent this second reset from >>>>>>>>> taking place just as we want to avoid multiple >>>>>>>>> TO resets to take place in scheduler code. >>>>>>>> >>>>>>>> Yeah, but why do you need multiple workers? >>>>>>>> >>>>>>>> You have a single worker for the GPU reset not triggered by the >>>>>>>> scheduler in you adev and cancel that at the end of the reset >>>>>>>> procedure. >>>>>>>> >>>>>>>> If anybody things it needs to trigger another reset while in >>>>>>>> reset (which is actually a small design bug separately) the >>>>>>>> reset will just be canceled in the same way we cancel the >>>>>>>> scheduler resets. >>>>>>>> >>>>>>>> Christian. >>>>>>> >>>>>>> >>>>>>> Had this in mind at first but then I realized that each client >>>>>>> (RAS, KFD and sysfs) will want to fill his own data for the work >>>>>>> (see amdgpu_device_gpu_recover) - for XGMI hive each will want >>>>>>> to set his own adev (which is fine if you set a work per adev as >>>>>>> you suggest) but also each client might want (they all put NULL >>>>>>> there but in theory in the future) also set his own bad job >>>>>>> value and here you might have a collision. >>>>>> >>>>>> Yeah, but that is intentional. See when we have a job that needs >>>>>> to be consumed by the reset handler and not overwritten or >>>>>> something. 
>>>>> >>>>> >>>>> I am not sure why this is a requirement, multiple clients can >>>>> decide concurrently to trigger a reset for some reason (possibly >>>>> independent reasons) hence they cannot share same work struct to >>>>> pass to it their data. >>>> >>>> If those concurrent clients could detect that a reset was already >>>> in progress, you wouldn't need the complexity of multiple work >>>> structs being scheduled. You could simply return without triggering >>>> another reset. >>> >>> >>> In my view main problem here with single work struct either at reset >>> domain level or even adev level is that in some cases we optimize >>> resets and don't really perform ASIC HW reset (see >>> amdgpu_job_timedout with soft recovery and skip_hw_reset in >>> amdgpu_device_gpu_recover_imp for the case the bad job does get >>> signaled just before we start HW reset and we just skip this). > > That's one of the reasons why we should have multiple work items for > job based reset and other reset sources. > > See the whole idea is the following: > 1. We have one single queued work queue for each reset domain which > makes sure that all reset requests execute in order. > 2. We have one delayed work item for each scheduler which fires when a > timeout on a scheduler occurs and eventually calls the reset procedure > with the last running job. > 3. We have one work item for each necessary hard reset. > > The delayed work item from the scheduler first tries a soft recovery > and checks if a hard reset is really necessary. If it's not necessary > and we can cancel the offending job we skip the hard reset. > > The hard reset work item doesn't do any of those checks and just does > a reset no matter what. > > When we really do a reset, independent if its triggered by a job or > other source we cancel all sources at the end of the reset procedure. 
> > This makes sure that a) We only do one reset even when multiple > sources fire at the same time and b) when any source bails out and > only does a soft recovery we do a full reset anyway when necessary. > > That design was outlined multiple times now on the mailing list and > looks totally clear to me. We should probably document that somewhere. If you look at the patch what you described above is exactly what is happening - since scheduler's delayed work is different from any non scheduler delayed work the SW reset which might take place from scheduler's reset will not have any impact on any non scheduler delayed work and will not cancel them. In case the scheduler actually reaches the point of HW reset it will cancel out all pending reset works from any other sources on the same reset domain. Non scheduler reset will always proceed to do full HW reset and will cancel any other pending resets. The only difference is I chose to do the canceling right BEFORE the HW reset and not AFTER. I did this because I see a possible race where a new reset request is being generated exactly after we finished the HW reset but before we canceled out all pending resets - in such case you would not want to cancel this 'border line new' reset request. Andrey 
True that today this happens only to >>> job timeout reset sources that are handled form within the scheduler >>> and won't use this single work struct but no one prevents a future >>> case for this to happen and also, if we actually want to unify >>> scheduler time out handlers within reset domain (which seems to me >>> the right design approach) we won't be able to use just one work >>> struct for this reason anyway. >>> >> >> Just to add to this point - a reset domain is co-operative domain. In >> addition to sharing a set of clients from various reset sources for >> one device, it also will have a set of devices like in XGMI hive. The >> job timeout on one device may not eventually result in result, but a >> RAS error happening on another device at the same time would need a >> reset. The second device's RAS error cannot return seeing that a >> reset work already started, or ignore the reset work given that >> another device has filled the reset data. >> >> When there is a reset domain, it should take care of the work >> scheduled and keeping it in device or any other level doesn't sound >> good. >> >> Thanks, >> Lijo >> >>> Andrey >>> >>> >>>> >>>> I'd put the reset work struct into the reset_domain struct. That >>>> way you'd have exactly one worker for the reset domain. You could >>>> implement a lock-less scheme to decide whether you need to schedule >>>> a reset, e.g. using an atomic counter in the shared work struct >>>> that gets incremented when a client wants to trigger a reset >>>> (atomic_add_return). If that counter is exactly 1 after >>>> incrementing, you need to fill in the rest of the work struct and >>>> schedule the work. Otherwise, it's already scheduled (or another >>>> client is in the process of scheduling it) and you just return. >>>> When the worker finishes (after confirming a successful reset), it >>>> resets the counter to 0, so the next client requesting a reset will >>>> schedule the worker again. 
>>>> >>>> Regards, >>>> Felix >>>> >>>> >>>>> >>>>> >>>>>> >>>>>> >>>>>> Additional to that keep in mind that you can't allocate any >>>>>> memory before or during the GPU reset nor wait for the reset to >>>>>> complete (so you can't allocate anything on the stack either). >>>>> >>>>> >>>>> There is no dynamic allocation here, regarding stack allocations - >>>>> we do it all the time when we call functions, even during GPU >>>>> resets, how on stack allocation of work struct in >>>>> amdgpu_device_gpu_recover is different from any other local >>>>> variable we allocate in any function we call ? >>>>> >>>>> I am also not sure why it's not allowed to wait for reset to >>>>> complete ? Also, see in amdgpu_ras_do_recovery and gpu_recover_get >>>>> (debugfs) - the caller expects the reset to complete before he >>>>> returns. I can probably work around it in RAS code by calling >>>>> atomic_set(&ras->in_recovery, 0) from some callback within actual >>>>> reset function but regarding sysfs it actually expects a result >>>>> returned indicating whether the call was successful or not. >>>>> >>>>> Andrey >>>>> >>>>> >>>>>> >>>>>> I don't think that concept you try here will work. >>>>>> >>>>>> Regards, >>>>>> Christian. >>>>>> >>>>>>> Also in general seems to me it's cleaner approach where this >>>>>>> logic (the work items) are held and handled in reset_domain and >>>>>>> are not split in each adev or any other entity. We might want in >>>>>>> the future to even move the scheduler handling into reset domain >>>>>>> since reset domain is supposed to be a generic things and not >>>>>>> only or AMD. >>>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> Andrey >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> Regards, >>>>>>>>>> Christian. 
>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>> --- >>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>>>> >>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { >>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>> }; >>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>> -}; >>>>>>>>>>> - >>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>> u32 codec_type; >>>>>>>>>>> u32 max_width; >>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>> --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>> } >>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>> + >>>>>>>>>>> + /* Drop all pending resets since we will reset now >>>>>>>>>>> anyway */ >>>>>>>>>>> + tmp_adev = list_first_entry(device_list_handle, struct >>>>>>>>>>> amdgpu_device, >>>>>>>>>>> + reset_list); >>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>> + >>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>> /* Host driver will handle XGMI hive reset for SRIOV */ >>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>> } >>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>> - struct work_struct base; >>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>> int ret; >>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>>>>>>>> static void amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>> work_struct *work) >>>>>>>>>>> { >>>>>>>>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base); >>>>>>>>>>> + struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>>>>>> base.base.work); >>>>>>>>>>> recover_work->ret = >>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>> recover_work->job); >>>>>>>>>>> } >>>>>>>>>>> @@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct >>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>> { >>>>>>>>>>> struct amdgpu_recover_work_struct work = {.adev = >>>>>>>>>>> adev, .job = job}; >>>>>>>>>>> - 
INIT_WORK(&work.base, >>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>> if >>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base)) >>>>>>>>>>> return -EAGAIN; >>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>> + >>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>> &work.base); >>>>>>>>>>> return work.ret; >>>>>>>>>>> } >>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>> + >>>>>>>>>>> return reset_domain; >>>>>>>>>>> } >>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>> + >>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>> + >>>>>>>>>>> +struct 
amdgpu_device; >>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>> + >>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>> }; >>>>>>>>>>> + >>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>> +}; >>>>>>>>>>> + >>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>> unsigned long flags; >>>>>>>>>>> }; >>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>> + >>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>> XGMI_HIVE >>>>>>>>>>> }; >>>>>>>>>>> + >>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>> + struct list_head node; >>>>>>>>>>> +}; >>>>>>>>>>> + >>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>> struct kref refcount; >>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>> + >>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>> }; >>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom >>>>>>>>>>> } >>>>>>>>>>> static inline bool amdgpu_reset_domain_schedule(struct >>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>> + struct amdgpu_reset_work_struct *work) 
>>>>>>>>>>> { >>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>> + >>>>>>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>> + return false; >>>>>>>>>>> + } >>>>>>>>>>> + >>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>> + >>>>>>>>>>> + return true; >>>>>>>>>>> +} >>>>>>>>>>> + >>>>>>>>>>> +static inline void >>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>> +{ >>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>> +} >>>>>>>>>>> + >>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>> +{ >>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>> + >>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>> + >>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>> + >>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>> + } >>>>>>>>>>> + >>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>> } >>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>> +#include "amdgpu_reset.h" 
>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS >>>>>>>>>>> is sr-iov ready */ >>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov >>>>>>>>>>> is enabled on this GPU */ >>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct work_struct >>>>>>>>>>> *work) >>>>>>>>>>> { >>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>>>>> amdgpu_device, virt); >>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>> return r; >>>>>>>>>>> } >>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>> return 0; >>>>>>>>>>> } >>>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>>>>>>>>> amdgpu_device *adev) 
>>>>>>>>>>> { >>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>> + >>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>> } >>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct work_struct >>>>>>>>>>> *work) >>>>>>>>>>> { >>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>>>>> amdgpu_device, virt); >>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>> return r; >>>>>>>>>>> } >>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>> return 0; >>>>>>>>>>> } >>>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>> { >>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>> + >>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>> } 
>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct work_struct >>>>>>>>>>> *work) >>>>>>>>>>> { >>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, struct >>>>>>>>>>> amdgpu_device, virt); >>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>> return r; >>>>>>>>>>> } >>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>> return 0; >>>>>>>>>>> } >>>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>> { >>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>> + >>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>> } >>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>>>> >>>>>>>> >>>>>> > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-10 16:00 ` Andrey Grodzovsky @ 2022-05-10 16:17 ` Christian König 2022-05-10 17:01 ` Andrey Grodzovsky 0 siblings, 1 reply; 40+ messages in thread From: Christian König @ 2022-05-10 16:17 UTC (permalink / raw) To: Andrey Grodzovsky, Christian König, Lazar, Lijo, Felix Kuehling, amd-gfx Cc: Bai Zoy Am 10.05.22 um 18:00 schrieb Andrey Grodzovsky: > [SNIP] >> That's one of the reasons why we should have multiple work items for >> job based reset and other reset sources. >> >> See the whole idea is the following: >> 1. We have one single queued work queue for each reset domain which >> makes sure that all reset requests execute in order. >> 2. We have one delayed work item for each scheduler which fires when >> a timeout on a scheduler occurs and eventually calls the reset >> procedure with the last running job. >> 3. We have one work item for each necessary hard reset. >> >> The delayed work item from the scheduler first tries a soft recovery >> and checks if a hard reset is really necessary. If it's not necessary >> and we can cancel the offending job we skip the hard reset. >> >> The hard reset work item doesn't do any of those checks and just does >> a reset no matter what. >> >> When we really do a reset, independent if its triggered by a job or >> other source we cancel all sources at the end of the reset procedure. >> >> This makes sure that a) We only do one reset even when multiple >> sources fire at the same time and b) when any source bails out and >> only does a soft recovery we do a full reset anyway when necessary. >> >> That design was outlined multiple times now on the mailing list and >> looks totally clear to me. We should probably document that somewhere. 
> > > If you look at the patch what you described above is exactly what is > happening - since scheduler's delayed work is different from any non > scheduler delayed work the SW reset which might take place from > scheduler's reset > will not have any impact on any non scheduler delayed work and will > not cancel them. In case the scheduler actually reaches the point of > HW reset it will cancel out all pending reset works from any other > sources on the same > reset domain. Non scheduler reset will always proceed to do full HW > reset and will cancel any other pending resets. Ok, but why you then need that linked list? The number of reset sources should be static and not in any way dynamic. See using the linked list sounds like you only wanted to cancel the reset sources raised so far which would not be correct as far as I can see. > > The only difference is I chose to do the canceling right BEFORE the HW > reset and not AFTER. I did this because I see a possible race where a > new reset request is being generated exactly after we finished the HW > reset but before we canceled out all pending resets - in such case you > would not want to cancel this 'border line new' reset request. Why not? Any new reset request directly after a hardware reset is most likely just falsely generated by the reset itself. Ideally I would cancel all sources after the reset, but before starting any new work. Regards, Christian. > > > Andrey > > >> >> Regards, >> Christian. >> >>>> You can see that if many different reset sources share same work >>>> struct what can happen is that the first to obtain the lock you >>>> describe below might opt out from full HW reset because his bad >>>> job did signal for example or because his hung IP block was able >>>> to recover through SW reset but in the meantime another reset >>>> source who needed an actual HW reset just silently returned and we >>>> end up with unhandled reset request. 
True that today this happens >>>> only to job timeout reset sources that are handled form within the >>>> scheduler and won't use this single work struct but no one prevents >>>> a future case for this to happen and also, if we actually want to >>>> unify scheduler time out handlers within reset domain (which seems >>>> to me the right design approach) we won't be able to use just one >>>> work struct for this reason anyway. >>>> >>> >>> Just to add to this point - a reset domain is co-operative domain. >>> In addition to sharing a set of clients from various reset sources >>> for one device, it also will have a set of devices like in XGMI >>> hive. The job timeout on one device may not eventually result in >>> result, but a RAS error happening on another device at the same time >>> would need a reset. The second device's RAS error cannot return >>> seeing that a reset work already started, or ignore the reset work >>> given that another device has filled the reset data. >>> >>> When there is a reset domain, it should take care of the work >>> scheduled and keeping it in device or any other level doesn't sound >>> good. >>> >>> Thanks, >>> Lijo >>> >>>> Andrey >>>> >>>> >>>>> >>>>> I'd put the reset work struct into the reset_domain struct. That >>>>> way you'd have exactly one worker for the reset domain. You could >>>>> implement a lock-less scheme to decide whether you need to >>>>> schedule a reset, e.g. using an atomic counter in the shared work >>>>> struct that gets incremented when a client wants to trigger a >>>>> reset (atomic_add_return). If that counter is exactly 1 after >>>>> incrementing, you need to fill in the rest of the work struct and >>>>> schedule the work. Otherwise, it's already scheduled (or another >>>>> client is in the process of scheduling it) and you just return. 
>>>>> When the worker finishes (after confirming a successful reset), it >>>>> resets the counter to 0, so the next client requesting a reset >>>>> will schedule the worker again. >>>>> >>>>> Regards, >>>>> Felix >>>>> >>>>> >>>>>> >>>>>> >>>>>>> >>>>>>> >>>>>>> Additional to that keep in mind that you can't allocate any >>>>>>> memory before or during the GPU reset nor wait for the reset to >>>>>>> complete (so you can't allocate anything on the stack either). >>>>>> >>>>>> >>>>>> There is no dynamic allocation here, regarding stack allocations >>>>>> - we do it all the time when we call functions, even during GPU >>>>>> resets, how on stack allocation of work struct in >>>>>> amdgpu_device_gpu_recover is different from any other local >>>>>> variable we allocate in any function we call ? >>>>>> >>>>>> I am also not sure why it's not allowed to wait for reset to >>>>>> complete ? Also, see in amdgpu_ras_do_recovery and >>>>>> gpu_recover_get (debugfs) - the caller expects the reset to >>>>>> complete before he returns. I can probably work around it in RAS >>>>>> code by calling atomic_set(&ras->in_recovery, 0) from some >>>>>> callback within actual reset function but regarding sysfs it >>>>>> actually expects a result returned indicating whether the call >>>>>> was successful or not. >>>>>> >>>>>> Andrey >>>>>> >>>>>> >>>>>>> >>>>>>> I don't think that concept you try here will work. >>>>>>> >>>>>>> Regards, >>>>>>> Christian. >>>>>>> >>>>>>>> Also in general seems to me it's cleaner approach where this >>>>>>>> logic (the work items) are held and handled in reset_domain and >>>>>>>> are not split in each adev or any other entity. We might want >>>>>>>> in the future to even move the scheduler handling into reset >>>>>>>> domain since reset domain is supposed to be a generic things >>>>>>>> and not only or AMD. 
>>>>>>>> >>>>>>>> Andrey >>>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> Andrey >>>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Regards, >>>>>>>>>>> Christian. >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>> --- >>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>>>>> >>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { >>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>> }; >>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>> -}; >>>>>>>>>>>> - >>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>> u32 
max_width; >>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>>> } >>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>> + >>>>>>>>>>>> + /* Drop all pending resets since we will reset now >>>>>>>>>>>> anyway */ >>>>>>>>>>>> + tmp_adev = list_first_entry(device_list_handle, struct >>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>> + reset_list); >>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>> + >>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>> /* Host driver will handle XGMI hive reset for SRIOV */ >>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>>> } >>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>> int ret; >>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>>>>>>>>> static void amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>> work_struct *work) >>>>>>>>>>>> { >>>>>>>>>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base); >>>>>>>>>>>> + struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>>>>>>> base.base.work); >>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>> recover_work->job); 
>>>>>>>>>>>> } >>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev, >>>>>>>>>>>> { >>>>>>>>>>>> struct amdgpu_recover_work_struct work = {.adev = >>>>>>>>>>>> adev, .job = job}; >>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>> if >>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>> &work.base)) >>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>> + >>>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>> &work.base); >>>>>>>>>>>> return work.ret; >>>>>>>>>>>> } >>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>> + >>>>>>>>>>>> return reset_domain; >>>>>>>>>>>> } >>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>> -#include 
"amdgpu.h" >>>>>>>>>>>> + >>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>> + >>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>> + >>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>> }; >>>>>>>>>>>> + >>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>> +}; >>>>>>>>>>>> + >>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>> }; >>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>> + >>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>> }; >>>>>>>>>>>> + >>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>> +}; >>>>>>>>>>>> + >>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>> + >>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>> }; 
>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom >>>>>>>>>>>> } >>>>>>>>>>>> static inline bool amdgpu_reset_domain_schedule(struct >>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>> + struct amdgpu_reset_work_struct >>>>>>>>>>>> *work) >>>>>>>>>>>> { >>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>> + >>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>> + return false; >>>>>>>>>>>> + } >>>>>>>>>>>> + >>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>> + >>>>>>>>>>>> + return true; >>>>>>>>>>>> +} >>>>>>>>>>>> + >>>>>>>>>>>> +static inline void >>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>> +{ >>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>> +} >>>>>>>>>>>> + >>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>> +{ >>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>> + >>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>> + >>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>> + >>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>> + } >>>>>>>>>>>> + >>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>> } >>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>> diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS >>>>>>>>>>>> is sr-iov ready */ >>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov >>>>>>>>>>>> is enabled on this GPU */ >>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct work_struct >>>>>>>>>>>> *work) >>>>>>>>>>>> { >>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>> @@ -380,7 +380,8 @@ int 
xgpu_ai_mailbox_get_irq(struct >>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>> return r; >>>>>>>>>>>> } >>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); >>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>> return 0; >>>>>>>>>>>> } >>>>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>> { >>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>> + >>>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>> } >>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct work_struct >>>>>>>>>>>> *work) >>>>>>>>>>>> { >>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>> return r; >>>>>>>>>>>> } >>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); >>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>> return 0; >>>>>>>>>>>> } >>>>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>> { >>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>> + >>>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>> } >>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct work_struct >>>>>>>>>>>> *work) >>>>>>>>>>>> { >>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>> return r; >>>>>>>>>>>> } >>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work); >>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>> return 0; >>>>>>>>>>>> } >>>>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>> { >>>>>>>>>>>> 
amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>> + >>>>>>>>>>>> + amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>> } >>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>>>>> >>>>>>>>> >>>>>>> >> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-10 16:17 ` Christian König @ 2022-05-10 17:01 ` Andrey Grodzovsky 2022-05-10 17:19 ` Christian König 0 siblings, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-10 17:01 UTC (permalink / raw) To: Christian König, Christian König, Lazar, Lijo, Felix Kuehling, amd-gfx Cc: Bai Zoy On 2022-05-10 12:17, Christian König wrote: > Am 10.05.22 um 18:00 schrieb Andrey Grodzovsky: >> [SNIP] >>> That's one of the reasons why we should have multiple work items for >>> job based reset and other reset sources. >>> >>> See the whole idea is the following: >>> 1. We have one single queued work queue for each reset domain which >>> makes sure that all reset requests execute in order. >>> 2. We have one delayed work item for each scheduler which fires when >>> a timeout on a scheduler occurs and eventually calls the reset >>> procedure with the last running job. >>> 3. We have one work item for each necessary hard reset. >>> >>> The delayed work item from the scheduler first tries a soft recovery >>> and checks if a hard reset is really necessary. If it's not >>> necessary and we can cancel the offending job we skip the hard reset. >>> >>> The hard reset work item doesn't do any of those checks and just >>> does a reset no matter what. >>> >>> When we really do a reset, independent if its triggered by a job or >>> other source we cancel all sources at the end of the reset procedure. >>> >>> This makes sure that a) We only do one reset even when multiple >>> sources fire at the same time and b) when any source bails out and >>> only does a soft recovery we do a full reset anyway when necessary. >>> >>> That design was outlined multiple times now on the mailing list and >>> looks totally clear to me. We should probably document that somewhere. 
>> >> >> If you look at the patch what you described above is exactly what is >> happening - since scheduler's delayed work is different from any non >> scheduler delayed work the SW reset which might take place from >> scheduler's reset >> will not have any impact on any non scheduler delayed work and will >> not cancel them. In case the scheduler actually reaches the point of >> HW reset it will cancel out all pending reset works from any other >> sources on the same >> reset domain. Non scheduler reset will always proceed to do full HW >> reset and will cancel any other pending resets. > > Ok, but why you then need that linked list? The number of reset > sources should be static and not in any way dynamic. So array reset_src[i] holds a pointer to pending delayed work from source i or NULL if no pedning work ? What if same source triggers multiple reset requests such as multiple RAS errors at once , don't set the delayed work pointer in the arr[RAS_index] if it's already not NULL ? > > See using the linked list sounds like you only wanted to cancel the > reset sources raised so far which would not be correct as far as I can > see. Not clear about this one ? We do want to cancel those reset sources that were raised so far because we just did a HW reset which should fix them anyway ? Those who not raised reset request so far their respective array index will have a NULL ptr. Andrey > >> >> The only difference is I chose to do the canceling right BEFORE the >> HW reset and not AFTER. I did this because I see a possible race >> where a new reset request is being generated exactly after we >> finished the HW reset but before we canceled out all pending resets - >> in such case you wold not want to cancel this 'border line new' reset >> request. > > Why not? Any new reset request directly after a hardware reset is most > likely just falsely generated by the reset itself. > > Ideally I would cancel all sources after the reset, but before > starting any new work. 
> > Regards, > Christian. > >> >> >> Andrey >> >> >>> >>> Regards, >>> Christian. >>> >>>>> You can see that if many different reset sources share same work >>>>> struct what can happen is that the first to obtain the lock you >>>>> describe bellow might opt out from full HW reset because his bad >>>>> job did signal for example or because his hunged IP block was able >>>>> to recover through SW reset but in the meantime another reset >>>>> source who needed an actual HW reset just silently returned and we >>>>> end up with unhandled reset request. True that today this happens >>>>> only to job timeout reset sources that are handled form within the >>>>> scheduler and won't use this single work struct but no one >>>>> prevents a future case for this to happen and also, if we actually >>>>> want to unify scheduler time out handlers within reset domain >>>>> (which seems to me the right design approach) we won't be able to >>>>> use just one work struct for this reason anyway. >>>>> >>>> >>>> Just to add to this point - a reset domain is co-operative domain. >>>> In addition to sharing a set of clients from various reset sources >>>> for one device, it also will have a set of devices like in XGMI >>>> hive. The job timeout on one device may not eventually result in >>>> result, but a RAS error happening on another device at the same >>>> time would need a reset. The second device's RAS error cannot >>>> return seeing that a reset work already started, or ignore the >>>> reset work given that another device has filled the reset data. >>>> >>>> When there is a reset domain, it should take care of the work >>>> scheduled and keeping it in device or any other level doesn't sound >>>> good. >>>> >>>> Thanks, >>>> Lijo >>>> >>>>> Andrey >>>>> >>>>> >>>>>> >>>>>> I'd put the reset work struct into the reset_domain struct. That >>>>>> way you'd have exactly one worker for the reset domain. 
You could >>>>>> implement a lock-less scheme to decide whether you need to >>>>>> schedule a reset, e.g. using an atomic counter in the shared work >>>>>> struct that gets incremented when a client wants to trigger a >>>>>> reset (atomic_add_return). If that counter is exactly 1 after >>>>>> incrementing, you need to fill in the rest of the work struct and >>>>>> schedule the work. Otherwise, it's already scheduled (or another >>>>>> client is in the process of scheduling it) and you just return. >>>>>> When the worker finishes (after confirming a successful reset), >>>>>> it resets the counter to 0, so the next client requesting a reset >>>>>> will schedule the worker again. >>>>>> >>>>>> Regards, >>>>>> Felix >>>>>> >>>>>> >>>>>>> >>>>>>> >>>>>>>> >>>>>>>> >>>>>>>> Additional to that keep in mind that you can't allocate any >>>>>>>> memory before or during the GPU reset nor wait for the reset to >>>>>>>> complete (so you can't allocate anything on the stack either). >>>>>>> >>>>>>> >>>>>>> There is no dynamic allocation here, regarding stack allocations >>>>>>> - we do it all the time when we call functions, even during GPU >>>>>>> resets, how on stack allocation of work struct in >>>>>>> amdgpu_device_gpu_recover is different from any other local >>>>>>> variable we allocate in any function we call ? >>>>>>> >>>>>>> I am also not sure why it's not allowed to wait for reset to >>>>>>> complete ? Also, see in amdgpu_ras_do_recovery and >>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to >>>>>>> complete before he returns. I can probably work around it in RAS >>>>>>> code by calling atomic_set(&ras->in_recovery, 0) from some >>>>>>> callback within actual reset function but regarding sysfs it >>>>>>> actually expects a result returned indicating whether the call >>>>>>> was successful or not. >>>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>> I don't think that concept you try here will work. >>>>>>>> >>>>>>>> Regards, >>>>>>>> Christian. 
>>>>>>>> >>>>>>>>> Also in general seems to me it's cleaner approach where this >>>>>>>>> logic (the work items) are held and handled in reset_domain >>>>>>>>> and are not split in each adev or any other entity. We might >>>>>>>>> want in the future to even move the scheduler handling into >>>>>>>>> reset domain since reset domain is supposed to be a generic >>>>>>>>> things and not only or AMD. >>>>>>>>> >>>>>>>>> Andrey >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Andrey >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Regards, >>>>>>>>>>>> Christian. >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>> --- >>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry { >>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>> }; 
>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>> -}; >>>>>>>>>>>>> - >>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>>>> } >>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>> + >>>>>>>>>>>>> + /* Drop all pending resets since we will reset now >>>>>>>>>>>>> anyway */ >>>>>>>>>>>>> + tmp_adev = list_first_entry(device_list_handle, >>>>>>>>>>>>> struct amdgpu_device, >>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>> + >>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>> /* Host driver will handle XGMI hive reset for SRIOV */ >>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>>>> } >>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>> int ret; >>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>>>>>>>>>> static void >>>>>>>>>>>>> 
amdgpu_device_queue_gpu_recover_work(struct work_struct >>>>>>>>>>>>> *work) >>>>>>>>>>>>> { >>>>>>>>>>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base); >>>>>>>>>>>>> + struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>>>>>>>> base.base.work); >>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>> } >>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev, >>>>>>>>>>>>> { >>>>>>>>>>>>> struct amdgpu_recover_work_struct work = {.adev = >>>>>>>>>>>>> adev, .job = job}; >>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>> if >>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>> + >>>>>>>>>>>>> + >>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>> &work.base); >>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>> } >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>> + 
INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>> + >>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>> } >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>> + >>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>> + >>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>> + >>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>> }; >>>>>>>>>>>>> + >>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>> +}; >>>>>>>>>>>>> + >>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>> }; >>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>> + >>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>> struct list_head 
handler_list; >>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>> }; >>>>>>>>>>>>> + >>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>> +}; >>>>>>>>>>>>> + >>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>> + >>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>> }; >>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom >>>>>>>>>>>>> } >>>>>>>>>>>>> static inline bool amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>> + struct amdgpu_reset_work_struct >>>>>>>>>>>>> *work) >>>>>>>>>>>>> { >>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>> + >>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>> + return false; >>>>>>>>>>>>> + } >>>>>>>>>>>>> + >>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>> + >>>>>>>>>>>>> + return true; >>>>>>>>>>>>> +} >>>>>>>>>>>>> + >>>>>>>>>>>>> +static inline void >>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>> +{ >>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>> +} >>>>>>>>>>>>> + >>>>>>>>>>>>> +static inline void 
amdgpu_reset_pending_list(struct >>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>> +{ >>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>> + >>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>> + >>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>> + >>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>> + } >>>>>>>>>>>>> + >>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>> } >>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* >>>>>>>>>>>>> vBIOS is sr-iov ready */ >>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov >>>>>>>>>>>>> is enabled on this GPU */ >>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>> { >>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>> return r; >>>>>>>>>>>>> } >>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>> return 0; >>>>>>>>>>>>> } >>>>>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>> { >>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>> + >>>>>>>>>>>>> + >>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>> } >>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>> 
xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>> { >>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>> return r; >>>>>>>>>>>>> } >>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>> return 0; >>>>>>>>>>>>> } >>>>>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>> { >>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>> + >>>>>>>>>>>>> + >>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>> } >>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>> { >>>>>>>>>>>>> - struct amdgpu_virt *virt = 
container_of(work, struct >>>>>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>> return r; >>>>>>>>>>>>> } >>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>> return 0; >>>>>>>>>>>>> } >>>>>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>> { >>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>> + >>>>>>>>>>>>> + >>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>> } >>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>>>>>> >>>>>>>>>> >>>>>>>> >>> > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-10 17:01 ` Andrey Grodzovsky @ 2022-05-10 17:19 ` Christian König 2022-05-10 18:53 ` Andrey Grodzovsky 0 siblings, 1 reply; 40+ messages in thread From: Christian König @ 2022-05-10 17:19 UTC (permalink / raw) To: Andrey Grodzovsky, Christian König, Lazar, Lijo, Felix Kuehling, amd-gfx Cc: Bai Zoy Am 10.05.22 um 19:01 schrieb Andrey Grodzovsky: > > On 2022-05-10 12:17, Christian König wrote: >> Am 10.05.22 um 18:00 schrieb Andrey Grodzovsky: >>> [SNIP] >>>> That's one of the reasons why we should have multiple work items >>>> for job based reset and other reset sources. >>>> >>>> See the whole idea is the following: >>>> 1. We have one single queued work queue for each reset domain which >>>> makes sure that all reset requests execute in order. >>>> 2. We have one delayed work item for each scheduler which fires >>>> when a timeout on a scheduler occurs and eventually calls the reset >>>> procedure with the last running job. >>>> 3. We have one work item for each necessary hard reset. >>>> >>>> The delayed work item from the scheduler first tries a soft >>>> recovery and checks if a hard reset is really necessary. If it's >>>> not necessary and we can cancel the offending job we skip the hard >>>> reset. >>>> >>>> The hard reset work item doesn't do any of those checks and just >>>> does a reset no matter what. >>>> >>>> When we really do a reset, independent if its triggered by a job or >>>> other source we cancel all sources at the end of the reset procedure. >>>> >>>> This makes sure that a) We only do one reset even when multiple >>>> sources fire at the same time and b) when any source bails out and >>>> only does a soft recovery we do a full reset anyway when necessary. >>>> >>>> That design was outlined multiple times now on the mailing list and >>>> looks totally clear to me. We should probably document that somewhere. 
>>> >>> >>> If you look at the patch what you described above is exactly what is >>> happening - since scheduler's delayed work is different from any non >>> scheduler delayed work the SW reset which might take place from >>> scheduler's reset >>> will not have any impact on any non scheduler delayed work and will >>> not cancel them. In case the scheduler actually reaches the point of >>> HW reset it will cancel out all pending reset works from any other >>> sources on the same >>> reset domain. Non scheduler reset will always proceed to do full HW >>> reset and will cancel any other pending resets. >> >> Ok, but why do you then need that linked list? The number of reset >> sources should be static and not in any way dynamic. > > > So array reset_src[i] holds a pointer to pending delayed work from > source i or NULL if no pending work ? > What if same source triggers multiple reset requests such as multiple > RAS errors at once, don't set the delayed work pointer in the > arr[RAS_index] if it's already not NULL ? > >> >> See using the linked list sounds like you only wanted to cancel the >> reset sources raised so far which would not be correct as far as I >> can see. > > > Not clear about this one ? We do want to cancel those reset sources > that were raised so far because we just did a HW reset which should > fix them anyway ? Those who have not raised a reset request so far - their > respective array index will have a NULL ptr. And exactly that's what I want to prevent. See you don't care if a reset source has fired once, twice, ten times or never. You just cancel all of them! That's why I want to come to a static list of reset sources. E.g. in the reset code (either before or after the reset, that's debatable) you do something like this: for (i = 0; i < num_ring; ++i) cancel_delayed_work(ring[i]->scheduler....) 
You don't really need to track which reset source has fired and which hasn't, because that would be racy again. Instead just bluntly reset all possible sources. Christian. > > Andrey > > >> >>> >>> The only difference is I chose to do the canceling right BEFORE the >>> HW reset and not AFTER. I did this because I see a possible race >>> where a new reset request is being generated exactly after we >>> finished the HW reset but before we canceled out all pending resets >>> - in such case you would not want to cancel this 'border line new' >>> reset request. >> >> Why not? Any new reset request directly after a hardware reset is >> most likely just falsely generated by the reset itself. >> >> Ideally I would cancel all sources after the reset, but before >> starting any new work. >> >> Regards, >> Christian. >> >>> >>> >>> Andrey >>> >>> >>>> >>>> Regards, >>>> Christian. >>>> >>>>>> You can see that if many different reset sources share same work >>>>>> struct what can happen is that the first to obtain the lock you >>>>>> describe below might opt out from full HW reset because his bad >>>>>> job did signal for example or because his hung IP block was >>>>>> able to recover through SW reset but in the meantime another >>>>>> reset source who needed an actual HW reset just silently returned >>>>>> and we end up with an unhandled reset request. True that today this >>>>>> happens only to job timeout reset sources that are handled from >>>>>> within the scheduler and won't use this single work struct but no >>>>>> one prevents a future case for this to happen and also, if we >>>>>> actually want to unify scheduler time out handlers within reset >>>>>> domain (which seems to me the right design approach) we won't be >>>>>> able to use just one work struct for this reason anyway. >>>>>> >>>>> >>>>> Just to add to this point - a reset domain is co-operative domain. 
>>>>> In addition to sharing a set of clients from various reset sources >>>>> for one device, it also will have a set of devices like in XGMI >>>>> hive. The job timeout on one device may not eventually result in >>>>> a reset, but a RAS error happening on another device at the same >>>>> time would need a reset. The second device's RAS error cannot >>>>> return seeing that a reset work already started, or ignore the >>>>> reset work given that another device has filled the reset data. >>>>> >>>>> When there is a reset domain, it should take care of the work >>>>> scheduled and keeping it in device or any other level doesn't >>>>> sound good. >>>>> >>>>> Thanks, >>>>> Lijo >>>>> >>>>>> Andrey >>>>>> >>>>>> >>>>>>> >>>>>>> I'd put the reset work struct into the reset_domain struct. That >>>>>>> way you'd have exactly one worker for the reset domain. You >>>>>>> could implement a lock-less scheme to decide whether you need to >>>>>>> schedule a reset, e.g. using an atomic counter in the shared >>>>>>> work struct that gets incremented when a client wants to trigger >>>>>>> a reset (atomic_add_return). If that counter is exactly 1 after >>>>>>> incrementing, you need to fill in the rest of the work struct >>>>>>> and schedule the work. Otherwise, it's already scheduled (or >>>>>>> another client is in the process of scheduling it) and you just >>>>>>> return. When the worker finishes (after confirming a successful >>>>>>> reset), it resets the counter to 0, so the next client >>>>>>> requesting a reset will schedule the worker again. >>>>>>> >>>>>>> Regards, >>>>>>> Felix >>>>>>> >>>>>>> >>>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>> Additional to that keep in mind that you can't allocate any >>>>>>>>> memory before or during the GPU reset nor wait for the reset >>>>>>>>> to complete (so you can't allocate anything on the stack either). 
>>>>>>>> >>>>>>>> >>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>> allocations - we do it all the time when we call functions, >>>>>>>> even during GPU resets, how on stack allocation of work struct >>>>>>>> in amdgpu_device_gpu_recover is different from any other local >>>>>>>> variable we allocate in any function we call ? >>>>>>>> >>>>>>>> I am also not sure why it's not allowed to wait for reset to >>>>>>>> complete ? Also, see in amdgpu_ras_do_recovery and >>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to >>>>>>>> complete before he returns. I can probably work around it in >>>>>>>> RAS code by calling atomic_set(&ras->in_recovery, 0) from some >>>>>>>> callback within actual reset function but regarding sysfs it >>>>>>>> actually expects a result returned indicating whether the call >>>>>>>> was successful or not. >>>>>>>> >>>>>>>> Andrey >>>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> I don't think that concept you try here will work. >>>>>>>>> >>>>>>>>> Regards, >>>>>>>>> Christian. >>>>>>>>> >>>>>>>>>> Also in general seems to me it's cleaner approach where this >>>>>>>>>> logic (the work items) are held and handled in reset_domain >>>>>>>>>> and are not split in each adev or any other entity. We might >>>>>>>>>> want in the future to even move the scheduler handling into >>>>>>>>>> reset domain since reset domain is supposed to be a generic >>>>>>>>>> things and not only or AMD. >>>>>>>>>> >>>>>>>>>> Andrey >>>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Andrey >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Regards, >>>>>>>>>>>>> Christian. 
>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>> --- >>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>>>>>>> >>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>> }; >>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>> -}; >>>>>>>>>>>>>> - >>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>> diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>>>>> } >>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>> + >>>>>>>>>>>>>> + /* Drop all pending resets since we will reset now >>>>>>>>>>>>>> anyway */ >>>>>>>>>>>>>> + tmp_adev = list_first_entry(device_list_handle, >>>>>>>>>>>>>> struct amdgpu_device, >>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>> + >>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset for >>>>>>>>>>>>>> SRIOV */ >>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>>>>> } >>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>>>>>>>>>>> static void >>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct work_struct >>>>>>>>>>>>>> *work) >>>>>>>>>>>>>> { >>>>>>>>>>>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, base); >>>>>>>>>>>>>> + struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>>>>>>>>> base.base.work); >>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>> 
amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>> } >>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev, >>>>>>>>>>>>>> { >>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = {.adev = >>>>>>>>>>>>>> adev, .job = job}; >>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>> if >>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>> + >>>>>>>>>>>>>> + >>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>> } >>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>> + >>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>> } >>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>> 
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>> + >>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>> + >>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>> + >>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>> }; >>>>>>>>>>>>>> + >>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>> +}; >>>>>>>>>>>>>> + >>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>> }; >>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>> + >>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>> }; >>>>>>>>>>>>>> + >>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>> +}; >>>>>>>>>>>>>> + >>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>> struct kref 
refcount; >>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>> + >>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>> }; >>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain >>>>>>>>>>>>>> *dom >>>>>>>>>>>>>> } >>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct amdgpu_reset_domain >>>>>>>>>>>>>> *domain, >>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>> + struct amdgpu_reset_work_struct >>>>>>>>>>>>>> *work) >>>>>>>>>>>>>> { >>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>> + >>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>> + } >>>>>>>>>>>>>> + >>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>> + >>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>> +} >>>>>>>>>>>>>> + >>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>> +{ >>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>> +} >>>>>>>>>>>>>> + >>>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>> +{ >>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>> + >>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>> &domain->pending_works, 
node) { >>>>>>>>>>>>>> + >>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>> + >>>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>> + } >>>>>>>>>>>>>> + >>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>> } >>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* >>>>>>>>>>>>>> vBIOS is sr-iov ready */ >>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov >>>>>>>>>>>>>> is enabled on this GPU */ >>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>> static void 
xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>> { >>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>> return r; >>>>>>>>>>>>>> } >>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>> } >>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>> { >>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>> + >>>>>>>>>>>>>> + >>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>> } >>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>> { >>>>>>>>>>>>>> - struct amdgpu_virt *virt = 
container_of(work, struct >>>>>>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>>>> amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>> return r; >>>>>>>>>>>>>> } >>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>> } >>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>> { >>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>> + >>>>>>>>>>>>>> + >>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>> } >>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>> { >>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>>>> amdgpu_virt, flr_work); >>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, struct >>>>>>>>>>>>>> amdgpu_virt, 
flr_work.base.work); >>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>> return r; >>>>>>>>>>>>>> } >>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>> } >>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>> { >>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>> + >>>>>>>>>>>>>> + >>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>> } >>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>> >>>> >> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-10 17:19 ` Christian König @ 2022-05-10 18:53 ` Andrey Grodzovsky 2022-05-11 7:38 ` Christian König 0 siblings, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-10 18:53 UTC (permalink / raw) To: Christian König, Christian König, Lazar, Lijo, Felix Kuehling, amd-gfx Cc: Bai Zoy On 2022-05-10 13:19, Christian König wrote: > Am 10.05.22 um 19:01 schrieb Andrey Grodzovsky: >> >> On 2022-05-10 12:17, Christian König wrote: >>> Am 10.05.22 um 18:00 schrieb Andrey Grodzovsky: >>>> [SNIP] >>>>> That's one of the reasons why we should have multiple work items >>>>> for job based reset and other reset sources. >>>>> >>>>> See the whole idea is the following: >>>>> 1. We have one single queued work queue for each reset domain >>>>> which makes sure that all reset requests execute in order. >>>>> 2. We have one delayed work item for each scheduler which fires >>>>> when a timeout on a scheduler occurs and eventually calls the >>>>> reset procedure with the last running job. >>>>> 3. We have one work item for each necessary hard reset. >>>>> >>>>> The delayed work item from the scheduler first tries a soft >>>>> recovery and checks if a hard reset is really necessary. If it's >>>>> not necessary and we can cancel the offending job we skip the hard >>>>> reset. >>>>> >>>>> The hard reset work item doesn't do any of those checks and just >>>>> does a reset no matter what. >>>>> >>>>> When we really do a reset, independent if its triggered by a job >>>>> or other source we cancel all sources at the end of the reset >>>>> procedure. >>>>> >>>>> This makes sure that a) We only do one reset even when multiple >>>>> sources fire at the same time and b) when any source bails out and >>>>> only does a soft recovery we do a full reset anyway when necessary. >>>>> >>>>> That design was outlined multiple times now on the mailing list >>>>> and looks totally clear to me. 
We should probably document that >>>>> somewhere. >>>> >>>> >>>> If you look at the patch what you described above is exactly what >>>> is happening - since scheduler's delayed work is different from any >>>> non scheduler delayed work the SW reset which might take place from >>>> scheduler's reset >>>> will not have any impact on any non scheduler delayed work and will >>>> not cancel them. In case the scheduler actually reaches the point >>>> of HW reset it will cancel out all pending reset works from any >>>> other sources on the same >>>> reset domain. Non scheduler reset will always proceed to do full HW >>>> reset and will cancel any other pending resets. >>> >>> Ok, but why you then need that linked list? The number of reset >>> sources should be static and not in any way dynamic. >> >> >> So array reset_src[i] holds a pointer to pending delayed work from >> source i or NULL if no pedning work ? >> What if same source triggers multiple reset requests such as multiple >> RAS errors at once , don't set the delayed work pointer in the >> arr[RAS_index] if it's already not NULL ? >> >>> >>> See using the linked list sounds like you only wanted to cancel the >>> reset sources raised so far which would not be correct as far as I >>> can see. >> >> >> Not clear about this one ? We do want to cancel those reset sources >> that were raised so far because we just did a HW reset which should >> fix them anyway ? Those who not raised reset request so far their >> respective array index will have a NULL ptr. > > And exactly that's what I want to prevent. See you don't care if a > reset source has fired once, twice, ten times or never. You just > cancel all of them! > > That's why I want to come to a static list of reset sources. > > E.g. in the reset code (either before or after the reset, that's > debatable) you do something like this: > > for (i = 0; i < num_ring; ++i) > cancel_delayed_work(ring[i]->scheduler....) 
> cancel_work(adev->ras_work); > cancel_work(adev->iofault_work); > cancel_work(adev->debugfs_work); > ... > > You don't really need to track which reset source has fired and which > hasn't, because that would be racy again. Instead just bluntly reset > all possible sources. > > Christian. I don't say we care if it fired once or twice (I need to add a fix to only insert a reset work into the pending reset list if it's not already there), the point of using a list (or array) to which you add and from which you remove is that the logic of this is encapsulated within reset domain. In your way we need to be aware of who exactly schedules reset work and explicitly cancel them, this also means that for any new source added in the future you will need to remember to add it to the cancellation list which you showed above. In the current way all this is done automatically within reset_domain code and it's agnostic to the specific driver and its specific list of reset sources. Also in case we would want to generalize reset_domain to other GPU drivers (which was a plan as far as I remember) this explicit mention of each reset work for cancellation is again less suitable in my opinion. Andrey > >> >> Andrey >> >> >>> >>>> >>>> The only difference is I chose to do the canceling right BEFORE the >>>> HW reset and not AFTER. I did this because I see a possible race >>>> where a new reset request is being generated exactly after we >>>> finished the HW reset but before we canceled out all pending resets >>>> - in such case you would not want to cancel this 'border line new' >>>> reset request. >>> >>> Why not? Any new reset request directly after a hardware reset is >>> most likely just falsely generated by the reset itself. >>> >>> Ideally I would cancel all sources after the reset, but before >>> starting any new work. >>> >>> Regards, >>> Christian. >>> >>>> >>>> >>>> Andrey >>>> >>>> >>>>> >>>>> Regards, >>>>> Christian.
>>>>> >>>>>>> You can see that if many different reset sources share same work >>>>>>> struct what can happen is that the first to obtain the lock you >>>>>>> describe bellow might opt out from full HW reset because his bad >>>>>>> job did signal for example or because his hunged IP block was >>>>>>> able to recover through SW reset but in the meantime another >>>>>>> reset source who needed an actual HW reset just silently >>>>>>> returned and we end up with unhandled reset request. True that >>>>>>> today this happens only to job timeout reset sources that are >>>>>>> handled form within the scheduler and won't use this single work >>>>>>> struct but no one prevents a future case for this to happen and >>>>>>> also, if we actually want to unify scheduler time out handlers >>>>>>> within reset domain (which seems to me the right design >>>>>>> approach) we won't be able to use just one work struct for this >>>>>>> reason anyway. >>>>>>> >>>>>> >>>>>> Just to add to this point - a reset domain is co-operative >>>>>> domain. In addition to sharing a set of clients from various >>>>>> reset sources for one device, it also will have a set of devices >>>>>> like in XGMI hive. The job timeout on one device may not >>>>>> eventually result in result, but a RAS error happening on another >>>>>> device at the same time would need a reset. The second device's >>>>>> RAS error cannot return seeing that a reset work already >>>>>> started, or ignore the reset work given that another device has >>>>>> filled the reset data. >>>>>> >>>>>> When there is a reset domain, it should take care of the work >>>>>> scheduled and keeping it in device or any other level doesn't >>>>>> sound good. >>>>>> >>>>>> Thanks, >>>>>> Lijo >>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>> I'd put the reset work struct into the reset_domain struct. >>>>>>>> That way you'd have exactly one worker for the reset domain. 
>>>>>>>> You could implement a lock-less scheme to decide whether you >>>>>>>> need to schedule a reset, e.g. using an atomic counter in the >>>>>>>> shared work struct that gets incremented when a client wants to >>>>>>>> trigger a reset (atomic_add_return). If that counter is exactly >>>>>>>> 1 after incrementing, you need to fill in the rest of the work >>>>>>>> struct and schedule the work. Otherwise, it's already scheduled >>>>>>>> (or another client is in the process of scheduling it) and you >>>>>>>> just return. When the worker finishes (after confirming a >>>>>>>> successful reset), it resets the counter to 0, so the next >>>>>>>> client requesting a reset will schedule the worker again. >>>>>>>> >>>>>>>> Regards, >>>>>>>> Felix >>>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> Additional to that keep in mind that you can't allocate any >>>>>>>>>> memory before or during the GPU reset nor wait for the reset >>>>>>>>>> to complete (so you can't allocate anything on the stack >>>>>>>>>> either). >>>>>>>>> >>>>>>>>> >>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>> allocations - we do it all the time when we call functions, >>>>>>>>> even during GPU resets, how on stack allocation of work struct >>>>>>>>> in amdgpu_device_gpu_recover is different from any other local >>>>>>>>> variable we allocate in any function we call ? >>>>>>>>> >>>>>>>>> I am also not sure why it's not allowed to wait for reset to >>>>>>>>> complete ? Also, see in amdgpu_ras_do_recovery and >>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to >>>>>>>>> complete before he returns. I can probably work around it in >>>>>>>>> RAS code by calling atomic_set(&ras->in_recovery, 0) from some >>>>>>>>> callback within actual reset function but regarding sysfs it >>>>>>>>> actually expects a result returned indicating whether the call >>>>>>>>> was successful or not. 
>>>>>>>>> >>>>>>>>> Andrey >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>> >>>>>>>>>> Regards, >>>>>>>>>> Christian. >>>>>>>>>> >>>>>>>>>>> Also in general seems to me it's cleaner approach where this >>>>>>>>>>> logic (the work items) are held and handled in reset_domain >>>>>>>>>>> and are not split in each adev or any other entity. We might >>>>>>>>>>> want in the future to even move the scheduler handling into >>>>>>>>>>> reset domain since reset domain is supposed to be a generic >>>>>>>>>>> things and not only or AMD. >>>>>>>>>>> >>>>>>>>>>> Andrey >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Andrey >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>> 
#include "amdgpu_mca.h" >>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>> - >>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> + /* Drop all pending resets since we will reset now >>>>>>>>>>>>>>> anyway */ >>>>>>>>>>>>>>> + tmp_adev = list_first_entry(device_list_handle, >>>>>>>>>>>>>>> struct amdgpu_device, >>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset for >>>>>>>>>>>>>>> SRIOV */ >>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>>>>>> } 
>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct work_struct >>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>> { >>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>>>>>>>>>> base); >>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>>>>>>>>>> base.base.work); >>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev, >>>>>>>>>>>>>>> { >>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = {.adev = >>>>>>>>>>>>>>> adev, .job = job}; >>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>> if >>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>> 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>> + 
AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain >>>>>>>>>>>>>>> *dom >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct amdgpu_reset_domain >>>>>>>>>>>>>>> *domain, >>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct >>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>> { >>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, 0)) { >>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); 
>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 
<< 0) /* >>>>>>>>>>>>>>> vBIOS is sr-iov ready */ >>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* >>>>>>>>>>>>>>> sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>> { >>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct >>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>> return 0; 
>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>> { >>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>> { >>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct >>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct 
>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>> { >>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>> { >>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>> { >>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>> 
amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> + >>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>> >>>>> >>> > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-10 18:53 ` Andrey Grodzovsky @ 2022-05-11 7:38 ` Christian König 2022-05-11 13:43 ` Andrey Grodzovsky 0 siblings, 1 reply; 40+ messages in thread From: Christian König @ 2022-05-11 7:38 UTC (permalink / raw) To: Andrey Grodzovsky, Christian König, Lazar, Lijo, Felix Kuehling, amd-gfx Cc: Bai Zoy Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: > > On 2022-05-10 13:19, Christian König wrote: >> Am 10.05.22 um 19:01 schrieb Andrey Grodzovsky: >>> >>> On 2022-05-10 12:17, Christian König wrote: >>>> Am 10.05.22 um 18:00 schrieb Andrey Grodzovsky: >>>>> [SNIP] >>>>>> That's one of the reasons why we should have multiple work items >>>>>> for job based reset and other reset sources. >>>>>> >>>>>> See the whole idea is the following: >>>>>> 1. We have one single queued work queue for each reset domain >>>>>> which makes sure that all reset requests execute in order. >>>>>> 2. We have one delayed work item for each scheduler which fires >>>>>> when a timeout on a scheduler occurs and eventually calls the >>>>>> reset procedure with the last running job. >>>>>> 3. We have one work item for each necessary hard reset. >>>>>> >>>>>> The delayed work item from the scheduler first tries a soft >>>>>> recovery and checks if a hard reset is really necessary. If it's >>>>>> not necessary and we can cancel the offending job we skip the >>>>>> hard reset. >>>>>> >>>>>> The hard reset work item doesn't do any of those checks and just >>>>>> does a reset no matter what. >>>>>> >>>>>> When we really do a reset, independent if its triggered by a job >>>>>> or other source we cancel all sources at the end of the reset >>>>>> procedure. >>>>>> >>>>>> This makes sure that a) We only do one reset even when multiple >>>>>> sources fire at the same time and b) when any source bails out >>>>>> and only does a soft recovery we do a full reset anyway when >>>>>> necessary. 
>>>>>> >>>>>> That design was outlined multiple times now on the mailing list >>>>>> and looks totally clear to me. We should probably document that >>>>>> somewhere. >>>>> >>>>> >>>>> If you look at the patch what you described above is exactly what >>>>> is happening - since scheduler's delayed work is different from >>>>> any non scheduler delayed work the SW reset which might take place >>>>> from scheduler's reset >>>>> will not have any impact on any non scheduler delayed work and >>>>> will not cancel them. In case the scheduler actually reaches the >>>>> point of HW reset it will cancel out all pending reset works from >>>>> any other sources on the same >>>>> reset domain. Non scheduler reset will always proceed to do full >>>>> HW reset and will cancel any other pending resets. >>>> >>>> Ok, but why you then need that linked list? The number of reset >>>> sources should be static and not in any way dynamic. >>> >>> >>> So array reset_src[i] holds a pointer to pending delayed work from >>> source i or NULL if no pending work ? >>> What if same source triggers multiple reset requests such as >>> multiple RAS errors at once , don't set the delayed work pointer in >>> the arr[RAS_index] if it's already not NULL ? >>> >>>> >>>> See using the linked list sounds like you only wanted to cancel the >>>> reset sources raised so far which would not be correct as far as I >>>> can see. >>> >>> >>> Not clear about this one ? We do want to cancel those reset sources >>> that were raised so far because we just did a HW reset which should >>> fix them anyway ? Those who not raised reset request so far their >>> respective array index will have a NULL ptr. >> >> And exactly that's what I want to prevent. See you don't care if a >> reset source has fired once, twice, ten times or never. You just >> cancel all of them! >> >> That's why I want to come to a static list of reset sources. >> >> E.g. 
in the reset code (either before or after the reset, that's >> debatable) you do something like this: >> >> for (i = 0; i < num_ring; ++i) >> cancel_delayed_work(ring[i]->scheduler....) >> cancel_work(adev->ras_work); >> cancel_work(adev->iofault_work); >> cancel_work(adev->debugfs_work); >> ... >> >> You don't really need to track which reset source has fired and which >> hasn't, because that would be racy again. Instead just bluntly reset >> all possible sources. >> >> Christian. > > > I don't say we care if it fired once or twice (I need to add a fix to > only insert reset work to pending reset list if it's not already > there), the point of using list (or array) to which you add and from > which you remove is that the logic of this is encapsulated within > reset domain. In your way we need to be aware who exactly schedules > reset work and explicitly cancel them, this also means that for any > new source added in the future you will need to remember to add it I don't think that this is a valid argument. Additionally to the schedulers we probably just need less than a handful of reset sources, most likely even just one or two is enough. The only justification I can see of having additional separate reset sources would be if somebody wants to know if a specific source has been handled or not (e.g. call flush_work() or work_pending()). Like in the case of a reset triggered through debugfs. > to the cancellation list which you showed above. In current way all > this done automatically within reset_domain code and it's agnostic to > specific driver and it's specific list of reset sources. Also in case > we would want to generalize reset_domain to other GPU drivers (which was > a plan as far as i remember) this explicit mention of each reset works > for cancellation is again less suitable in my opinion. Well we could put the work item for the scheduler independent reset source into the reset domain as well. 
But I'm not sure those additional reset sources should be part of any common handling, that is largely amdgpu specific. Christian. > > Andrey > > >> >>> >>> Andrey >>> >>> >>>> >>>>> >>>>> The only difference is I chose to do the canceling right BEFORE >>>>> the HW reset and not AFTER. I did this because I see a possible >>>>> race where a new reset request is being generated exactly after we >>>>> finished the HW reset but before we canceled out all pending >>>>> resets - in such case you wold not want to cancel this 'border >>>>> line new' reset request. >>>> >>>> Why not? Any new reset request directly after a hardware reset is >>>> most likely just falsely generated by the reset itself. >>>> >>>> Ideally I would cancel all sources after the reset, but before >>>> starting any new work. >>>> >>>> Regards, >>>> Christian. >>>> >>>>> >>>>> >>>>> Andrey >>>>> >>>>> >>>>>> >>>>>> Regards, >>>>>> Christian. >>>>>> >>>>>>>> You can see that if many different reset sources share same >>>>>>>> work struct what can happen is that the first to obtain the >>>>>>>> lock you describe bellow might opt out from full HW reset >>>>>>>> because his bad job did signal for example or because his >>>>>>>> hunged IP block was able to recover through SW reset but in the >>>>>>>> meantime another reset source who needed an actual HW reset >>>>>>>> just silently returned and we end up with unhandled reset >>>>>>>> request. True that today this happens only to job timeout reset >>>>>>>> sources that are handled form within the scheduler and won't >>>>>>>> use this single work struct but no one prevents a future case >>>>>>>> for this to happen and also, if we actually want to unify >>>>>>>> scheduler time out handlers within reset domain (which seems to >>>>>>>> me the right design approach) we won't be able to use just one >>>>>>>> work struct for this reason anyway. >>>>>>>> >>>>>>> >>>>>>> Just to add to this point - a reset domain is co-operative >>>>>>> domain. 
In addition to sharing a set of clients from various >>>>>>> reset sources for one device, it also will have a set of devices >>>>>>> like in XGMI hive. The job timeout on one device may not >>>>>>> eventually result in result, but a RAS error happening on >>>>>>> another device at the same time would need a reset. The second >>>>>>> device's RAS error cannot return seeing that a reset work >>>>>>> already started, or ignore the reset work given that another >>>>>>> device has filled the reset data. >>>>>>> >>>>>>> When there is a reset domain, it should take care of the work >>>>>>> scheduled and keeping it in device or any other level doesn't >>>>>>> sound good. >>>>>>> >>>>>>> Thanks, >>>>>>> Lijo >>>>>>> >>>>>>>> Andrey >>>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> I'd put the reset work struct into the reset_domain struct. >>>>>>>>> That way you'd have exactly one worker for the reset domain. >>>>>>>>> You could implement a lock-less scheme to decide whether you >>>>>>>>> need to schedule a reset, e.g. using an atomic counter in the >>>>>>>>> shared work struct that gets incremented when a client wants >>>>>>>>> to trigger a reset (atomic_add_return). If that counter is >>>>>>>>> exactly 1 after incrementing, you need to fill in the rest of >>>>>>>>> the work struct and schedule the work. Otherwise, it's already >>>>>>>>> scheduled (or another client is in the process of scheduling >>>>>>>>> it) and you just return. When the worker finishes (after >>>>>>>>> confirming a successful reset), it resets the counter to 0, so >>>>>>>>> the next client requesting a reset will schedule the worker >>>>>>>>> again. >>>>>>>>> >>>>>>>>> Regards, >>>>>>>>> Felix >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Additional to that keep in mind that you can't allocate any >>>>>>>>>>> memory before or during the GPU reset nor wait for the reset >>>>>>>>>>> to complete (so you can't allocate anything on the stack >>>>>>>>>>> either). 
>>>>>>>>>> >>>>>>>>>> >>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>> allocations - we do it all the time when we call functions, >>>>>>>>>> even during GPU resets, how on stack allocation of work >>>>>>>>>> struct in amdgpu_device_gpu_recover is different from any >>>>>>>>>> other local variable we allocate in any function we call ? >>>>>>>>>> >>>>>>>>>> I am also not sure why it's not allowed to wait for reset to >>>>>>>>>> complete ? Also, see in amdgpu_ras_do_recovery and >>>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to >>>>>>>>>> complete before he returns. I can probably work around it in >>>>>>>>>> RAS code by calling atomic_set(&ras->in_recovery, 0) from >>>>>>>>>> some callback within actual reset function but regarding >>>>>>>>>> sysfs it actually expects a result returned indicating >>>>>>>>>> whether the call was successful or not. >>>>>>>>>> >>>>>>>>>> Andrey >>>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>> >>>>>>>>>>> Regards, >>>>>>>>>>> Christian. >>>>>>>>>>> >>>>>>>>>>>> Also in general seems to me it's cleaner approach where >>>>>>>>>>>> this logic (the work items) are held and handled in >>>>>>>>>>>> reset_domain and are not split in each adev or any other >>>>>>>>>>>> entity. We might want in the future to even move the >>>>>>>>>>>> scheduler handling into reset domain since reset domain is >>>>>>>>>>>> supposed to be a generic things and not only or AMD. >>>>>>>>>>>> >>>>>>>>>>>> Andrey >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>> Christian. 
>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>> 
u32 codec_type; >>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> + /* Drop all pending resets since we will reset now >>>>>>>>>>>>>>>> anyway */ >>>>>>>>>>>>>>>> + tmp_adev = list_first_entry(device_list_handle, >>>>>>>>>>>>>>>> struct amdgpu_device, >>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset for >>>>>>>>>>>>>>>> SRIOV */ >>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct work_struct >>>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>>>>>>>>>>> base); >>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct *recover_work = 
>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>>>>>>>>>>> base.base.work); >>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev, >>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = {.adev = >>>>>>>>>>>>>>>> adev, .job = job}; >>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> diff --git 
>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>> @@ -76,12 
+100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct amdgpu_reset_domain >>>>>>>>>>>>>>>> *domain, >>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>> + struct >>>>>>>>>>>>>>>> amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, >>>>>>>>>>>>>>>> 0)) { >>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>> + 
list_del_init(&work->node); >>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* >>>>>>>>>>>>>>>> vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* >>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; 
>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct >>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> static int 
xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct >>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>> 
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct >>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>> >>>>>> >>>> >> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-11 7:38 ` Christian König @ 2022-05-11 13:43 ` Andrey Grodzovsky 2022-05-11 13:58 ` Christian König 0 siblings, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-11 13:43 UTC (permalink / raw) To: Christian König, Christian König, Lazar, Lijo, Felix Kuehling, amd-gfx Cc: Bai Zoy On 2022-05-11 03:38, Christian König wrote: > Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >> >> On 2022-05-10 13:19, Christian König wrote: >>> Am 10.05.22 um 19:01 schrieb Andrey Grodzovsky: >>>> >>>> On 2022-05-10 12:17, Christian König wrote: >>>>> Am 10.05.22 um 18:00 schrieb Andrey Grodzovsky: >>>>>> [SNIP] >>>>>>> That's one of the reasons why we should have multiple work items >>>>>>> for job based reset and other reset sources. >>>>>>> >>>>>>> See the whole idea is the following: >>>>>>> 1. We have one single queued work queue for each reset domain >>>>>>> which makes sure that all reset requests execute in order. >>>>>>> 2. We have one delayed work item for each scheduler which fires >>>>>>> when a timeout on a scheduler occurs and eventually calls the >>>>>>> reset procedure with the last running job. >>>>>>> 3. We have one work item for each necessary hard reset. >>>>>>> >>>>>>> The delayed work item from the scheduler first tries a soft >>>>>>> recovery and checks if a hard reset is really necessary. If it's >>>>>>> not necessary and we can cancel the offending job we skip the >>>>>>> hard reset. >>>>>>> >>>>>>> The hard reset work item doesn't do any of those checks and just >>>>>>> does a reset no matter what. >>>>>>> >>>>>>> When we really do a reset, independent if its triggered by a job >>>>>>> or other source we cancel all sources at the end of the reset >>>>>>> procedure. 
>>>>>>> >>>>>>> This makes sure that a) We only do one reset even when multiple >>>>>>> sources fire at the same time and b) when any source bails out >>>>>>> and only does a soft recovery we do a full reset anyway when >>>>>>> necessary. >>>>>>> >>>>>>> That design was outlined multiple times now on the mailing list >>>>>>> and looks totally clear to me. We should probably document that >>>>>>> somewhere. >>>>>> >>>>>> >>>>>> If you look at the patch what you described above is exactly what >>>>>> is happening - since scheduler's delayed work is different from >>>>>> any non scheduler delayed work the SW reset which might take >>>>>> place from scheduler's reset >>>>>> will not have any impact on any non scheduler delayed work and >>>>>> will not cancel them. In case the scheduler actually reaches the >>>>>> point of HW reset it will cancel out all pending reset works from >>>>>> any other sources on the same >>>>>> reset domain. Non scheduler reset will always proceed to do full >>>>>> HW reset and will cancel any other pending resets. >>>>> >>>>> Ok, but why you then need that linked list? The number of reset >>>>> sources should be static and not in any way dynamic. >>>> >>>> >>>> So array reset_src[i] holds a pointer to pending delayed work from >>>> source i or NULL if no pending work ? >>>> What if same source triggers multiple reset requests such as >>>> multiple RAS errors at once, don't set the delayed work pointer in >>>> the arr[RAS_index] if it's already not NULL ? >>>> >>>>> >>>>> See using the linked list sounds like you only wanted to cancel >>>>> the reset sources raised so far which would not be correct as far >>>>> as I can see. >>>> >>>> >>>> Not clear about this one ? We do want to cancel those reset sources >>>> that were raised so far because we just did a HW reset which should >>>> fix them anyway ? Those who have not raised a reset request so far will >>>> have a NULL ptr in their respective array index. 
>>> >>> And exactly that's what I want to prevent. See you don't care if a >>> reset source has fired once, twice, ten times or never. You just >>> cancel all of them! >>> >>> That's why I want to come to a static list of reset sources. >>> >>> E.g. in the reset code (either before or after the reset, that's >>> debatable) you do something like this: >>> >>> for (i = 0; i < num_ring; ++i) >>> cancel_delayed_work(ring[i]->scheduler....) >>> cancel_work(adev->ras_work); >>> cancel_work(adev->iofault_work); >>> cancel_work(adev->debugfs_work); >>> ... >>> >>> You don't really need to track which reset source has fired and >>> which hasn't, because that would be racy again. Instead just bluntly >>> reset all possible sources. >>> >>> Christian. >> >> >> I don't say we care if it fired once or twice (I need to add a fix to >> only insert reset work to pending reset list if it's not already >> there), the point of using list (or array) to which you add and from >> which you remove is that the logic of this is encapsulated within >> reset domain. In your way we need to be aware who exactly schedules >> reset work and explicitly cancel them, this also means that for any >> new source added in the future you will need to remember to add him > > I don't think that this is a valid argument. Additionally to the > schedulers we probably just need less than a handful of reset sources, > most likely even just one or two is enough. > > The only justification I can see of having additional separate reset > sources would be if somebody wants to know if a specific source has > been handled or not (e.g. call flush_work() or work_pending()). Like > in the case of a reset triggered through debugfs. This is indeed one reason, another is as we said before that if you share 'reset source' (meaning a delayed work) with another client (i.e. RAS and KFD) it means you make assumption that the other client always proceeds with the reset exactly the same way as you expect. 
So today we have this only in scheduler vs non scheduler reset happening - non scheduler reset clients assume the reset is always fully executed in HW while scheduler based reset makes shortcuts and does not always do a HW reset, hence they cannot share a 'reset source' (delayed work). Yes, we can always add this in the future if and when such a problem arises, but no one will remember this then and a new bug will be introduced and will take time to find and resolve. > >> to the cancellation list which you showed above. In current way all >> this done automatically within reset_domain code and it's agnostic to >> specific driver and it's specific list of reset sources. Also in case >> we would want to generalize reset_domain to other GPU drivers (which was >> a plan as far as i remember) this explicit mention of each reset >> works for cancellation is again less suitable in my opinion. > > Well we could put the work item for the scheduler independent reset > source into the reset domain as well. But I'm not sure those > additional reset sources should be part of any common handling, that > is largely amdgpu specific. So it's for sure more than one source for the reasons described above; also note that for the scheduler we already cancel the delayed work in drm_sched_stop, so cancelling it again in amdgpu code is kind of superfluous. Andrey > > Christian. > >> >> Andrey >> >> >>> >>>> >>>> Andrey >>>> >>>> >>>>> >>>>>> >>>>>> The only difference is I chose to do the canceling right BEFORE >>>>>> the HW reset and not AFTER. I did this because I see a possible >>>>>> race where a new reset request is being generated exactly after >>>>>> we finished the HW reset but before we canceled out all pending >>>>>> resets - in such case you would not want to cancel this 'border >>>>>> line new' reset request. >>>>> >>>>> Why not? Any new reset request directly after a hardware reset is >>>>> most likely just falsely generated by the reset itself. 
>>>>> >>>>> Ideally I would cancel all sources after the reset, but before >>>>> starting any new work. >>>>> >>>>> Regards, >>>>> Christian. >>>>> >>>>>> >>>>>> >>>>>> Andrey >>>>>> >>>>>> >>>>>>> >>>>>>> Regards, >>>>>>> Christian. >>>>>>> >>>>>>>>> You can see that if many different reset sources share same >>>>>>>>> work struct what can happen is that the first to obtain the >>>>>>>>> lock you describe below might opt out from full HW reset >>>>>>>>> because his bad job did signal for example or because his >>>>>>>>> hung IP block was able to recover through SW reset but in >>>>>>>>> the meantime another reset source who needed an actual HW >>>>>>>>> reset just silently returned and we end up with unhandled >>>>>>>>> reset request. True that today this happens only to job >>>>>>>>> timeout reset sources that are handled from within the >>>>>>>>> scheduler and won't use this single work struct but no one >>>>>>>>> prevents a future case for this to happen and also, if we >>>>>>>>> actually want to unify scheduler time out handlers within >>>>>>>>> reset domain (which seems to me the right design approach) we >>>>>>>>> won't be able to use just one work struct for this reason anyway. >>>>>>>>> >>>>>>>> >>>>>>>> Just to add to this point - a reset domain is co-operative >>>>>>>> domain. In addition to sharing a set of clients from various >>>>>>>> reset sources for one device, it also will have a set of >>>>>>>> devices like in XGMI hive. The job timeout on one device may >>>>>>>> not eventually result in a reset, but a RAS error happening on >>>>>>>> another device at the same time would need a reset. The second >>>>>>>> device's RAS error cannot return seeing that a reset work >>>>>>>> already started, or ignore the reset work given that another >>>>>>>> device has filled the reset data. >>>>>>>> >>>>>>>> When there is a reset domain, it should take care of the work >>>>>>>> scheduled and keeping it in device or any other level doesn't >>>>>>>> sound good. 
>>>>>>>> >>>>>>>> Thanks, >>>>>>>> Lijo >>>>>>>> >>>>>>>>> Andrey >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> I'd put the reset work struct into the reset_domain struct. >>>>>>>>>> That way you'd have exactly one worker for the reset domain. >>>>>>>>>> You could implement a lock-less scheme to decide whether you >>>>>>>>>> need to schedule a reset, e.g. using an atomic counter in the >>>>>>>>>> shared work struct that gets incremented when a client wants >>>>>>>>>> to trigger a reset (atomic_add_return). If that counter is >>>>>>>>>> exactly 1 after incrementing, you need to fill in the rest of >>>>>>>>>> the work struct and schedule the work. Otherwise, it's >>>>>>>>>> already scheduled (or another client is in the process of >>>>>>>>>> scheduling it) and you just return. When the worker finishes >>>>>>>>>> (after confirming a successful reset), it resets the counter >>>>>>>>>> to 0, so the next client requesting a reset will schedule the >>>>>>>>>> worker again. >>>>>>>>>> >>>>>>>>>> Regards, >>>>>>>>>> Felix >>>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Additional to that keep in mind that you can't allocate any >>>>>>>>>>>> memory before or during the GPU reset nor wait for the >>>>>>>>>>>> reset to complete (so you can't allocate anything on the >>>>>>>>>>>> stack either). >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>> allocations - we do it all the time when we call functions, >>>>>>>>>>> even during GPU resets, how on stack allocation of work >>>>>>>>>>> struct in amdgpu_device_gpu_recover is different from any >>>>>>>>>>> other local variable we allocate in any function we call ? >>>>>>>>>>> >>>>>>>>>>> I am also not sure why it's not allowed to wait for reset to >>>>>>>>>>> complete ? Also, see in amdgpu_ras_do_recovery and >>>>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to >>>>>>>>>>> complete before he returns. 
I can probably work around it in >>>>>>>>>>> RAS code by calling atomic_set(&ras->in_recovery, 0) from >>>>>>>>>>> some callback within actual reset function but regarding >>>>>>>>>>> sysfs it actually expects a result returned indicating >>>>>>>>>>> whether the call was successful or not. >>>>>>>>>>> >>>>>>>>>>> Andrey >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>> >>>>>>>>>>>> Regards, >>>>>>>>>>>> Christian. >>>>>>>>>>>> >>>>>>>>>>>>> Also in general seems to me it's cleaner approach where >>>>>>>>>>>>> this logic (the work items) are held and handled in >>>>>>>>>>>>> reset_domain and are not split in each adev or any other >>>>>>>>>>>>> entity. We might want in the future to even move the >>>>>>>>>>>>> scheduler handling into reset domain since reset domain is >>>>>>>>>>>>> supposed to be a generic things and not only or AMD. >>>>>>>>>>>>> >>>>>>>>>>>>> Andrey >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>> Christian. 
>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>> struct 
amdgpu_video_codec_info { >>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will reset >>>>>>>>>>>>>>>>> now anyway */ >>>>>>>>>>>>>>>>> + tmp_adev = list_first_entry(device_list_handle, >>>>>>>>>>>>>>>>> struct amdgpu_device, >>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset for >>>>>>>>>>>>>>>>> SRIOV */ >>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>>>>>>>>> container_of(work, struct 
amdgpu_recover_work_struct, >>>>>>>>>>>>>>>>> base); >>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct *recover_work = >>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_recover_work_struct, >>>>>>>>>>>>>>>>> base.base.work); >>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = {.adev >>>>>>>>>>>>>>>>> = adev, .job = job}; >>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); 
>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>> +struct 
amdgpu_reset_control; >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>> + struct >>>>>>>>>>>>>>>>> amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, >>>>>>>>>>>>>>>>> 0)) { >>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> +static inline void 
>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* >>>>>>>>>>>>>>>>> vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* >>>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { 
>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>> { 
>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>> { 
>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct >>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 
0); >>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>> >>>>>>> >>>>> >>> > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-11 13:43 ` Andrey Grodzovsky @ 2022-05-11 13:58 ` Christian König 2022-05-11 15:20 ` Lazar, Lijo 0 siblings, 1 reply; 40+ messages in thread From: Christian König @ 2022-05-11 13:58 UTC (permalink / raw) To: Andrey Grodzovsky, Christian König, Lazar, Lijo, Felix Kuehling, amd-gfx Cc: Bai Zoy Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: > On 2022-05-11 03:38, Christian König wrote: >> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>> [SNIP] >>>> E.g. in the reset code (either before or after the reset, that's >>>> debatable) you do something like this: >>>> >>>> for (i = 0; i < num_ring; ++i) >>>> cancel_delayed_work(ring[i]->scheduler....) >>>> cancel_work(adev->ras_work); >>>> cancel_work(adev->iofault_work); >>>> cancel_work(adev->debugfs_work); >>>> ... >>>> >>>> You don't really need to track which reset source has fired and >>>> which hasn't, because that would be racy again. Instead just >>>> bluntly reset all possible sources. >>>> >>>> Christian. >>> >>> >>> I don't say we care if it fired once or twice (I need to add a fix >>> to only insert reset work to pending reset list if it's not already >>> there), the point of using list (or array) to which you add and from >>> which you remove is that the logic of this is encapsulated within >>> reset domain. In your way we need to be aware who exactly schedules >>> reset work and explicitly cancel them, this also means that for any >>> new source added in the future you will need to remember to add him >> >> I don't think that this is a valid argument. Additionally to the >> schedulers we probably just need less than a handful of reset >> sources, most likely even just one or two is enough. >> >> The only justification I can see of having additional separate reset >> sources would be if somebody wants to know if a specific source has >> been handled or not (e.g. call flush_work() or work_pending()). 
Like >> in the case of a reset triggered through debugfs. > > > This is indeed one reason, another is as we said before that if you > share 'reset source' (meaning a delayed work) with another client > (i.e. RAS and KFD) it means you make assumption that the other client > always proceeds with the > reset exactly the same way as you expect. So today we have this only > in scheduler vs non scheduler reset happening - non scheduler reset > clients assume the reset is always fully executed in HW while > scheduler based reset makes shortcuts and not always does HW reset > hence they cannot share 'reset source' (delayed work). Yes, we can > always add this in the future if and when such problem will arise but > no one will remember this then and a new bug will be introduced and > will take time to find and resolve. Mhm, so your main concern is that we forget to correctly handle the new reset sources? How about we do it like this then: struct amdgpu_reset_domain { .... union { struct { struct work_item debugfs; struct work_item ras; .... }; struct work_item array[] } reset_sources; } Not 100% sure if that works, but something like that should do the trick. My main concern is that I don't want to allocate the work items on the stack and dynamic allocation (e.g. kmalloc) is usually not possible either. Additional to that putting/removing work items from a list, array or other container is a very common source for race conditions. Regards, Christian. > >>> to the cancellation list which you showed above. In current way all >>> this done automatically within reset_domain code and it's agnostic >>> to specific driver and it's specific list of reset sources. Also in >>> case we would want to generalize reset_domain to other GPU drivers >>> (which was >>> a plan as far as i remember) this explicit mention of each reset >>> works for cancellation is again less suitable in my opinion. 
>> >> Well we could put the work item for the scheduler independent reset >> source into the reset domain as well. But I'm not sure those >> additional reset sources should be part of any common handling, that >> is largely amdgpu specific. > > > So it's for sure more then one source for the reasons described above, > also note that for scheduler we already cancel delayed work in > drm_sched_stop so calling them again in amdgpu code kind of superfluous. > > Andrey > > >> >> Christian. >> >>> >>> Andrey >>> >>> >>>> >>>>> >>>>> Andrey >>>>> >>>>> >>>>>> >>>>>>> >>>>>>> The only difference is I chose to do the canceling right BEFORE >>>>>>> the HW reset and not AFTER. I did this because I see a possible >>>>>>> race where a new reset request is being generated exactly after >>>>>>> we finished the HW reset but before we canceled out all pending >>>>>>> resets - in such case you wold not want to cancel this 'border >>>>>>> line new' reset request. >>>>>> >>>>>> Why not? Any new reset request directly after a hardware reset is >>>>>> most likely just falsely generated by the reset itself. >>>>>> >>>>>> Ideally I would cancel all sources after the reset, but before >>>>>> starting any new work. >>>>>> >>>>>> Regards, >>>>>> Christian. >>>>>> >>>>>>> >>>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>> Regards, >>>>>>>> Christian. >>>>>>>> >>>>>>>>>> You can see that if many different reset sources share same >>>>>>>>>> work struct what can happen is that the first to obtain the >>>>>>>>>> lock you describe bellow might opt out from full HW reset >>>>>>>>>> because his bad job did signal for example or because his >>>>>>>>>> hunged IP block was able to recover through SW reset but in >>>>>>>>>> the meantime another reset source who needed an actual HW >>>>>>>>>> reset just silently returned and we end up with unhandled >>>>>>>>>> reset request. 
True that today this happens only to job >>>>>>>>>> timeout reset sources that are handled form within the >>>>>>>>>> scheduler and won't use this single work struct but no one >>>>>>>>>> prevents a future case for this to happen and also, if we >>>>>>>>>> actually want to unify scheduler time out handlers within >>>>>>>>>> reset domain (which seems to me the right design approach) we >>>>>>>>>> won't be able to use just one work struct for this reason >>>>>>>>>> anyway. >>>>>>>>>> >>>>>>>>> >>>>>>>>> Just to add to this point - a reset domain is co-operative >>>>>>>>> domain. In addition to sharing a set of clients from various >>>>>>>>> reset sources for one device, it also will have a set of >>>>>>>>> devices like in XGMI hive. The job timeout on one device may >>>>>>>>> not eventually result in result, but a RAS error happening on >>>>>>>>> another device at the same time would need a reset. The second >>>>>>>>> device's RAS error cannot return seeing that a reset work >>>>>>>>> already started, or ignore the reset work given that another >>>>>>>>> device has filled the reset data. >>>>>>>>> >>>>>>>>> When there is a reset domain, it should take care of the work >>>>>>>>> scheduled and keeping it in device or any other level doesn't >>>>>>>>> sound good. >>>>>>>>> >>>>>>>>> Thanks, >>>>>>>>> Lijo >>>>>>>>> >>>>>>>>>> Andrey >>>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> I'd put the reset work struct into the reset_domain struct. >>>>>>>>>>> That way you'd have exactly one worker for the reset domain. >>>>>>>>>>> You could implement a lock-less scheme to decide whether you >>>>>>>>>>> need to schedule a reset, e.g. using an atomic counter in >>>>>>>>>>> the shared work struct that gets incremented when a client >>>>>>>>>>> wants to trigger a reset (atomic_add_return). If that >>>>>>>>>>> counter is exactly 1 after incrementing, you need to fill in >>>>>>>>>>> the rest of the work struct and schedule the work. 
>>>>>>>>>>> Otherwise, it's already scheduled (or another client is in >>>>>>>>>>> the process of scheduling it) and you just return. When the >>>>>>>>>>> worker finishes (after confirming a successful reset), it >>>>>>>>>>> resets the counter to 0, so the next client requesting a >>>>>>>>>>> reset will schedule the worker again. >>>>>>>>>>> >>>>>>>>>>> Regards, >>>>>>>>>>> Felix >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Additional to that keep in mind that you can't allocate >>>>>>>>>>>>> any memory before or during the GPU reset nor wait for the >>>>>>>>>>>>> reset to complete (so you can't allocate anything on the >>>>>>>>>>>>> stack either). >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>> allocations - we do it all the time when we call functions, >>>>>>>>>>>> even during GPU resets, how on stack allocation of work >>>>>>>>>>>> struct in amdgpu_device_gpu_recover is different from any >>>>>>>>>>>> other local variable we allocate in any function we call ? >>>>>>>>>>>> >>>>>>>>>>>> I am also not sure why it's not allowed to wait for reset >>>>>>>>>>>> to complete ? Also, see in amdgpu_ras_do_recovery and >>>>>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to >>>>>>>>>>>> complete before he returns. I can probably work around it >>>>>>>>>>>> in RAS code by calling atomic_set(&ras->in_recovery, 0) >>>>>>>>>>>> from some callback within actual reset function but >>>>>>>>>>>> regarding sysfs it actually expects a result returned >>>>>>>>>>>> indicating whether the call was successful or not. >>>>>>>>>>>> >>>>>>>>>>>> Andrey >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>> >>>>>>>>>>>>> Regards, >>>>>>>>>>>>> Christian. 
>>>>>>>>>>>>> >>>>>>>>>>>>>> Also in general seems to me it's cleaner approach where >>>>>>>>>>>>>> this logic (the work items) are held and handled in >>>>>>>>>>>>>> reset_domain and are not split in each adev or any other >>>>>>>>>>>>>> entity. We might want in the future to even move the >>>>>>>>>>>>>> scheduler handling into reset domain since reset domain >>>>>>>>>>>>>> is supposed to be a generic things and not only or AMD. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>> 
#include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device >>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will reset >>>>>>>>>>>>>>>>>> now anyway */ >>>>>>>>>>>>>>>>>> + tmp_adev = list_first_entry(device_list_handle, >>>>>>>>>>>>>>>>>> struct amdgpu_device, >>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset for >>>>>>>>>>>>>>>>>> SRIOV */ >>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>> 
@@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device >>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct *recover_work >>>>>>>>>>>>>>>>>> = container_of(work, struct >>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct *recover_work >>>>>>>>>>>>>>>>>> = container_of(work, struct >>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = {.adev >>>>>>>>>>>>>>>>>> = adev, .job = job}; >>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> 
enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>> 
amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>> + struct >>>>>>>>>>>>>>>>>> amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, >>>>>>>>>>>>>>>>>> 0)) { >>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct >>>>>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); 
>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* >>>>>>>>>>>>>>>>>> vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* >>>>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>> work_struct *work) 
>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct 
>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>> >>>>>> >>>> >> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-11 13:58 ` Christian König @ 2022-05-11 15:20 ` Lazar, Lijo 2022-05-11 15:35 ` Andrey Grodzovsky 0 siblings, 1 reply; 40+ messages in thread From: Lazar, Lijo @ 2022-05-11 15:20 UTC (permalink / raw) To: Christian König, Andrey Grodzovsky, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy On 5/11/2022 7:28 PM, Christian König wrote: > Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >> On 2022-05-11 03:38, Christian König wrote: >>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>> [SNIP] >>>>> E.g. in the reset code (either before or after the reset, that's >>>>> debatable) you do something like this: >>>>> >>>>> for (i = 0; i < num_ring; ++i) >>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>> cancel_work(adev->ras_work); >>>>> cancel_work(adev->iofault_work); >>>>> cancel_work(adev->debugfs_work); >>>>> ... >>>>> >>>>> You don't really need to track which reset source has fired and >>>>> which hasn't, because that would be racy again. Instead just >>>>> bluntly reset all possible sources. >>>>> >>>>> Christian. >>>> >>>> >>>> I don't say we care if it fired once or twice (I need to add a fix >>>> to only insert reset work to pending reset list if it's not already >>>> there), the point of using list (or array) to which you add and from >>>> which you remove is that the logic of this is encapsulated within >>>> reset domain. In your way we need to be aware who exactly schedules >>>> reset work and explicitly cancel them, this also means that for any >>>> new source added in the future you will need to remember to add him >>> >>> I don't think that this is a valid argument. Additionally to the >>> schedulers we probably just need less than a handful of reset >>> sources, most likely even just one or two is enough. 
>>> >>> The only justification I can see of having additional separate reset >>> sources would be if somebody wants to know if a specific source has >>> been handled or not (e.g. call flush_work() or work_pending()). Like >>> in the case of a reset triggered through debugfs. >> >> >> This is indeed one reason, another is as we said before that if you >> share 'reset source' (meaning a delayed work) with another client >> (i.e. RAS and KFD) it means you make assumption that the other client >> always proceeds with the >> reset exactly the same way as you expect. So today we have this only >> in scheduler vs non scheduler reset happening - non scheduler reset >> clients assume the reset is always fully executed in HW while >> scheduler based reset makes shortcuts and not always does HW reset >> hence they cannot share 'reset source' (delayed work). Yes, we can >> always add this in the future if and when such problem will arise but >> no one will remember this then and a new bug will be introduced and >> will take time to find and resolve. > > Mhm, so your main concern is that we forget to correctly handle the new > reset sources? > > How about we do it like this then: > > struct amdgpu_reset_domain { > .... > union { > struct { > struct work_item debugfs; > struct work_item ras; > .... > }; > struct work_item array[] > } reset_sources; > } > If it's only about static array, enum amdgpu_reset_soruce { AMDGPU_RESET_SRC_RAS, AMDGPU_RESET_SRC_ABC, ..... AMDGPU_RESET_SRC_XYZ, AMDGPU_RESET_SRC_MAX }; struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for each work item Thanks, Lijo > Not 100% sure if that works, but something like that should do the trick. > > My main concern is that I don't want to allocate the work items on the > stack and dynamic allocation (e.g. kmalloc) is usually not possible either. > > Additional to that putting/removing work items from a list, array or > other container is a very common source for race conditions. 
> > Regards, > Christian. > >> >>>> to the cancellation list which you showed above. In current way all >>>> this done automatically within reset_domain code and it's agnostic >>>> to specific driver and it's specific list of reset sources. Also in >>>> case we would want to generalize reset_domain to other GPU drivers >>>> (which was >>>> a plan as far as i remember) this explicit mention of each reset >>>> works for cancellation is again less suitable in my opinion. >>> >>> Well we could put the work item for the scheduler independent reset >>> source into the reset domain as well. But I'm not sure those >>> additional reset sources should be part of any common handling, that >>> is largely amdgpu specific. >> >> >> So it's for sure more then one source for the reasons described above, >> also note that for scheduler we already cancel delayed work in >> drm_sched_stop so calling them again in amdgpu code kind of superfluous. >> >> Andrey >> >> >>> >>> Christian. >>> >>>> >>>> Andrey >>>> >>>> >>>>> >>>>>> >>>>>> Andrey >>>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>> The only difference is I chose to do the canceling right BEFORE >>>>>>>> the HW reset and not AFTER. I did this because I see a possible >>>>>>>> race where a new reset request is being generated exactly after >>>>>>>> we finished the HW reset but before we canceled out all pending >>>>>>>> resets - in such case you wold not want to cancel this 'border >>>>>>>> line new' reset request. >>>>>>> >>>>>>> Why not? Any new reset request directly after a hardware reset is >>>>>>> most likely just falsely generated by the reset itself. >>>>>>> >>>>>>> Ideally I would cancel all sources after the reset, but before >>>>>>> starting any new work. >>>>>>> >>>>>>> Regards, >>>>>>> Christian. >>>>>>> >>>>>>>> >>>>>>>> >>>>>>>> Andrey >>>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> Regards, >>>>>>>>> Christian. 
>>>>>>>>> >>>>>>>>>>> You can see that if many different reset sources share same >>>>>>>>>>> work struct what can happen is that the first to obtain the >>>>>>>>>>> lock you describe bellow might opt out from full HW reset >>>>>>>>>>> because his bad job did signal for example or because his >>>>>>>>>>> hunged IP block was able to recover through SW reset but in >>>>>>>>>>> the meantime another reset source who needed an actual HW >>>>>>>>>>> reset just silently returned and we end up with unhandled >>>>>>>>>>> reset request. True that today this happens only to job >>>>>>>>>>> timeout reset sources that are handled form within the >>>>>>>>>>> scheduler and won't use this single work struct but no one >>>>>>>>>>> prevents a future case for this to happen and also, if we >>>>>>>>>>> actually want to unify scheduler time out handlers within >>>>>>>>>>> reset domain (which seems to me the right design approach) we >>>>>>>>>>> won't be able to use just one work struct for this reason >>>>>>>>>>> anyway. >>>>>>>>>>> >>>>>>>>>> >>>>>>>>>> Just to add to this point - a reset domain is co-operative >>>>>>>>>> domain. In addition to sharing a set of clients from various >>>>>>>>>> reset sources for one device, it also will have a set of >>>>>>>>>> devices like in XGMI hive. The job timeout on one device may >>>>>>>>>> not eventually result in result, but a RAS error happening on >>>>>>>>>> another device at the same time would need a reset. The second >>>>>>>>>> device's RAS error cannot return seeing that a reset work >>>>>>>>>> already started, or ignore the reset work given that another >>>>>>>>>> device has filled the reset data. >>>>>>>>>> >>>>>>>>>> When there is a reset domain, it should take care of the work >>>>>>>>>> scheduled and keeping it in device or any other level doesn't >>>>>>>>>> sound good. 
>>>>>>>>>> >>>>>>>>>> Thanks, >>>>>>>>>> Lijo >>>>>>>>>> >>>>>>>>>>> Andrey >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> I'd put the reset work struct into the reset_domain struct. >>>>>>>>>>>> That way you'd have exactly one worker for the reset domain. >>>>>>>>>>>> You could implement a lock-less scheme to decide whether you >>>>>>>>>>>> need to schedule a reset, e.g. using an atomic counter in >>>>>>>>>>>> the shared work struct that gets incremented when a client >>>>>>>>>>>> wants to trigger a reset (atomic_add_return). If that >>>>>>>>>>>> counter is exactly 1 after incrementing, you need to fill in >>>>>>>>>>>> the rest of the work struct and schedule the work. >>>>>>>>>>>> Otherwise, it's already scheduled (or another client is in >>>>>>>>>>>> the process of scheduling it) and you just return. When the >>>>>>>>>>>> worker finishes (after confirming a successful reset), it >>>>>>>>>>>> resets the counter to 0, so the next client requesting a >>>>>>>>>>>> reset will schedule the worker again. >>>>>>>>>>>> >>>>>>>>>>>> Regards, >>>>>>>>>>>> Felix >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> Additional to that keep in mind that you can't allocate >>>>>>>>>>>>>> any memory before or during the GPU reset nor wait for the >>>>>>>>>>>>>> reset to complete (so you can't allocate anything on the >>>>>>>>>>>>>> stack either). >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>> allocations - we do it all the time when we call functions, >>>>>>>>>>>>> even during GPU resets, how on stack allocation of work >>>>>>>>>>>>> struct in amdgpu_device_gpu_recover is different from any >>>>>>>>>>>>> other local variable we allocate in any function we call ? >>>>>>>>>>>>> >>>>>>>>>>>>> I am also not sure why it's not allowed to wait for reset >>>>>>>>>>>>> to complete ? 
Also, see in amdgpu_ras_do_recovery and >>>>>>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset to >>>>>>>>>>>>> complete before he returns. I can probably work around it >>>>>>>>>>>>> in RAS code by calling atomic_set(&ras->in_recovery, 0) >>>>>>>>>>>>> from some callback within actual reset function but >>>>>>>>>>>>> regarding sysfs it actually expects a result returned >>>>>>>>>>>>> indicating whether the call was successful or not. >>>>>>>>>>>>> >>>>>>>>>>>>> Andrey >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>> >>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach where >>>>>>>>>>>>>>> this logic (the work items) are held and handled in >>>>>>>>>>>>>>> reset_domain and are not split in each adev or any other >>>>>>>>>>>>>>> entity. We might want in the future to even move the >>>>>>>>>>>>>>> scheduler handling into reset domain since reset domain >>>>>>>>>>>>>>> is supposed to be a generic things and not only or AMD. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>> Christian. 
>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>> - 
AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device >>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will reset >>>>>>>>>>>>>>>>>>> now anyway */ >>>>>>>>>>>>>>>>>>> + tmp_adev = list_first_entry(device_list_handle, >>>>>>>>>>>>>>>>>>> struct amdgpu_device, >>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset for >>>>>>>>>>>>>>>>>>> SRIOV */ >>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device >>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>> 
amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct *recover_work >>>>>>>>>>>>>>>>>>> = container_of(work, struct >>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct *recover_work >>>>>>>>>>>>>>>>>>> = container_of(work, struct >>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = {.adev >>>>>>>>>>>>>>>>>>> = adev, .job = job}; >>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>> + 
AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>> + struct >>>>>>>>>>>>>>>>>>> amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); 
>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, &work->base, >>>>>>>>>>>>>>>>>>> 0)) { >>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct >>>>>>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> +static inline void amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>> index 
239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* >>>>>>>>>>>>>>>>>>> vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* >>>>>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>>>>>> struct amdgpu_device, 
virt); >>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev 
= container_of(virt, >>>>>>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); 
>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = container_of(virt, >>>>>>>>>>>>>>>>>>> struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>> >>>>>>> >>>>> >>> > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-11 15:20 ` Lazar, Lijo @ 2022-05-11 15:35 ` Andrey Grodzovsky 2022-05-11 15:37 ` Lazar, Lijo 2022-05-11 15:39 ` Christian König 0 siblings, 2 replies; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-11 15:35 UTC (permalink / raw) To: Lazar, Lijo, Christian König, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy On 2022-05-11 11:20, Lazar, Lijo wrote: > > > On 5/11/2022 7:28 PM, Christian König wrote: >> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >>> On 2022-05-11 03:38, Christian König wrote: >>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>>> [SNIP] >>>>>> E.g. in the reset code (either before or after the reset, that's >>>>>> debatable) you do something like this: >>>>>> >>>>>> for (i = 0; i < num_ring; ++i) >>>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>>> cancel_work(adev->ras_work); >>>>>> cancel_work(adev->iofault_work); >>>>>> cancel_work(adev->debugfs_work); >>>>>> ... >>>>>> >>>>>> You don't really need to track which reset source has fired and >>>>>> which hasn't, because that would be racy again. Instead just >>>>>> bluntly reset all possible sources. >>>>>> >>>>>> Christian. >>>>> >>>>> >>>>> I don't say we care if it fired once or twice (I need to add a fix >>>>> to only insert reset work to pending reset list if it's not >>>>> already there), the point of using list (or array) to which you >>>>> add and from which you remove is that the logic of this is >>>>> encapsulated within reset domain. In your way we need to be aware >>>>> who exactly schedules reset work and explicitly cancel them, this >>>>> also means that for any new source added in the future you will >>>>> need to remember to add him >>>> >>>> I don't think that this is a valid argument. Additionally to the >>>> schedulers we probably just need less than a handful of reset >>>> sources, most likely even just one or two is enough. 
>>>> >>>> The only justification I can see of having additional separate >>>> reset sources would be if somebody wants to know if a specific >>>> source has been handled or not (e.g. call flush_work() or >>>> work_pending()). Like in the case of a reset triggered through >>>> debugfs. >>> >>> >>> This is indeed one reason, another is as we said before that if you >>> share 'reset source' (meaning a delayed work) with another client >>> (i.e. RAS and KFD) it means you make the assumption that the other >>> client always proceeds with the >>> reset exactly the same way as you expect. So today we have this only >>> in scheduler vs non scheduler reset happening - non scheduler reset >>> clients assume the reset is always fully executed in HW while >>> scheduler based reset makes shortcuts and does not always do a HW reset >>> hence they cannot share 'reset source' (delayed work). Yes, we can >>> always add this in the future if and when such a problem arises >>> but no one will remember this then and a new bug will be introduced >>> and will take time to find and resolve. >> >> Mhm, so your main concern is that we forget to correctly handle the >> new reset sources? >> >> How about we do it like this then: >> >> struct amdgpu_reset_domain { >> .... >> union { >> struct { >> struct work_item debugfs; >> struct work_item ras; >> .... >> }; >> struct work_item array[] >> } reset_sources; >> } >> > > If it's only about a static array, > > enum amdgpu_reset_soruce { > > AMDGPU_RESET_SRC_RAS, > AMDGPU_RESET_SRC_ABC, > ..... > AMDGPU_RESET_SRC_XYZ, > AMDGPU_RESET_SRC_MAX > > }; > > struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for > each work item > > > Thanks, > Lijo It's possible, though it makes it harder to generalize reset_domain later for other drivers. 
But still one caveat: look at amdgpu_recover_work_struct and its usage in amdgpu_device_gpu_recover and in gpu_recover_get. At least for debugfs I need to return back the result of the GPU reset and so I cannot store actual work items in the array mentioned above but rather pointers to work_item because I need a way to get back the return value from gpu_recover like I do now in amdgpu_device_gpu_recover. Andrey > >> Not 100% sure if that works, but something like that should do the >> trick. >> >> My main concern is that I don't want to allocate the work items on >> the stack and dynamic allocation (e.g. kmalloc) is usually not >> possible either. >> >> Additional to that, putting/removing work items from a list, array or >> other container is a very common source of race conditions. >> >> Regards, >> Christian. >> >>> >>>>> to the cancellation list which you showed above. In the current way >>>>> all this is done automatically within reset_domain code and it's >>>>> agnostic to the specific driver and its specific list of reset >>>>> sources. Also in case we would want to generalize reset_domain to >>>>> other GPU drivers (which was >>>>> a plan as far as I remember) this explicit mention of each reset >>>>> work for cancellation is again less suitable in my opinion. >>>> >>>> Well we could put the work item for the scheduler independent reset >>>> source into the reset domain as well. But I'm not sure those >>>> additional reset sources should be part of any common handling, >>>> that is largely amdgpu specific. >>> >>> >>> So it's for sure more than one source for the reasons described >>> above; also note that for the scheduler we already cancel the delayed work >>> in drm_sched_stop so calling them again in amdgpu code is kind of >>> superfluous. >>> >>> Andrey >>> >>> >>>> >>>> Christian. 
>>>> >>>>> >>>>> Andrey >>>>> >>>>> >>>>>> >>>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see a >>>>>>>>> possible race where a new reset request is being generated >>>>>>>>> exactly after we finished the HW reset but before we canceled >>>>>>>>> out all pending resets - in such case you wold not want to >>>>>>>>> cancel this 'border line new' reset request. >>>>>>>> >>>>>>>> Why not? Any new reset request directly after a hardware reset >>>>>>>> is most likely just falsely generated by the reset itself. >>>>>>>> >>>>>>>> Ideally I would cancel all sources after the reset, but before >>>>>>>> starting any new work. >>>>>>>> >>>>>>>> Regards, >>>>>>>> Christian. >>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>> Andrey >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> Regards, >>>>>>>>>> Christian. >>>>>>>>>> >>>>>>>>>>>> You can see that if many different reset sources share same >>>>>>>>>>>> work struct what can happen is that the first to obtain the >>>>>>>>>>>> lock you describe bellow might opt out from full HW reset >>>>>>>>>>>> because his bad job did signal for example or because his >>>>>>>>>>>> hunged IP block was able to recover through SW reset but in >>>>>>>>>>>> the meantime another reset source who needed an actual HW >>>>>>>>>>>> reset just silently returned and we end up with unhandled >>>>>>>>>>>> reset request. True that today this happens only to job >>>>>>>>>>>> timeout reset sources that are handled form within the >>>>>>>>>>>> scheduler and won't use this single work struct but no one >>>>>>>>>>>> prevents a future case for this to happen and also, if we >>>>>>>>>>>> actually want to unify scheduler time out handlers within >>>>>>>>>>>> reset domain (which seems to me the right design approach) >>>>>>>>>>>> we won't be able to use just one work struct for this >>>>>>>>>>>> reason anyway. 
>>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Just to add to this point - a reset domain is co-operative >>>>>>>>>>> domain. In addition to sharing a set of clients from various >>>>>>>>>>> reset sources for one device, it also will have a set of >>>>>>>>>>> devices like in XGMI hive. The job timeout on one device may >>>>>>>>>>> not eventually result in result, but a RAS error happening >>>>>>>>>>> on another device at the same time would need a reset. The >>>>>>>>>>> second device's RAS error cannot return seeing that a reset >>>>>>>>>>> work already started, or ignore the reset work given that >>>>>>>>>>> another device has filled the reset data. >>>>>>>>>>> >>>>>>>>>>> When there is a reset domain, it should take care of the >>>>>>>>>>> work scheduled and keeping it in device or any other level >>>>>>>>>>> doesn't sound good. >>>>>>>>>>> >>>>>>>>>>> Thanks, >>>>>>>>>>> Lijo >>>>>>>>>>> >>>>>>>>>>>> Andrey >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>> struct. That way you'd have exactly one worker for the >>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to >>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using an >>>>>>>>>>>>> atomic counter in the shared work struct that gets >>>>>>>>>>>>> incremented when a client wants to trigger a reset >>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after >>>>>>>>>>>>> incrementing, you need to fill in the rest of the work >>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already >>>>>>>>>>>>> scheduled (or another client is in the process of >>>>>>>>>>>>> scheduling it) and you just return. When the worker >>>>>>>>>>>>> finishes (after confirming a successful reset), it resets >>>>>>>>>>>>> the counter to 0, so the next client requesting a reset >>>>>>>>>>>>> will schedule the worker again. 
>>>>>>>>>>>>> >>>>>>>>>>>>> Regards, >>>>>>>>>>>>> Felix >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Additional to that keep in mind that you can't allocate >>>>>>>>>>>>>>> any memory before or during the GPU reset nor wait for >>>>>>>>>>>>>>> the reset to complete (so you can't allocate anything on >>>>>>>>>>>>>>> the stack either). >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover is >>>>>>>>>>>>>> different from any other local variable we allocate in >>>>>>>>>>>>>> any function we call ? >>>>>>>>>>>>>> >>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for reset >>>>>>>>>>>>>> to complete ? Also, see in amdgpu_ras_do_recovery and >>>>>>>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset >>>>>>>>>>>>>> to complete before he returns. I can probably work around >>>>>>>>>>>>>> it in RAS code by calling atomic_set(&ras->in_recovery, >>>>>>>>>>>>>> 0) from some callback within actual reset function but >>>>>>>>>>>>>> regarding sysfs it actually expects a result returned >>>>>>>>>>>>>> indicating whether the call was successful or not. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach where >>>>>>>>>>>>>>>> this logic (the work items) are held and handled in >>>>>>>>>>>>>>>> reset_domain and are not split in each adev or any >>>>>>>>>>>>>>>> other entity. 
We might want in the future to even move >>>>>>>>>>>>>>>> the scheduler handling into reset domain since reset >>>>>>>>>>>>>>>> domain is supposed to be a generic things and not only >>>>>>>>>>>>>>>> or AMD. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>>> @@ 
-509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device >>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will reset >>>>>>>>>>>>>>>>>>>> now anyway */ >>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset >>>>>>>>>>>>>>>>>>>> for SRIOV */ >>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device >>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>> + 
flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> +struct 
amdgpu_device; >>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> + struct list_head 
pending_works; >>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct >>>>>>>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>> + 
list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) >>>>>>>>>>>>>>>>>>>> /* vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* >>>>>>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>> index 
b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>> diff --git 
a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>> } 
>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>> 
amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>> >>>>>>>> >>>>>> >>>> >> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-11 15:35 ` Andrey Grodzovsky @ 2022-05-11 15:37 ` Lazar, Lijo 2022-05-11 15:43 ` Andrey Grodzovsky 2022-05-11 15:39 ` Christian König 1 sibling, 1 reply; 40+ messages in thread From: Lazar, Lijo @ 2022-05-11 15:37 UTC (permalink / raw) To: Andrey Grodzovsky, Christian König, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy On 5/11/2022 9:05 PM, Andrey Grodzovsky wrote: > > On 2022-05-11 11:20, Lazar, Lijo wrote: >> >> >> On 5/11/2022 7:28 PM, Christian König wrote: >>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >>>> On 2022-05-11 03:38, Christian König wrote: >>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>>>> [SNIP] >>>>>>> E.g. in the reset code (either before or after the reset, that's >>>>>>> debatable) you do something like this: >>>>>>> >>>>>>> for (i = 0; i < num_ring; ++i) >>>>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>>>> cancel_work(adev->ras_work); >>>>>>> cancel_work(adev->iofault_work); >>>>>>> cancel_work(adev->debugfs_work); >>>>>>> ... >>>>>>> >>>>>>> You don't really need to track which reset source has fired and >>>>>>> which hasn't, because that would be racy again. Instead just >>>>>>> bluntly reset all possible sources. >>>>>>> >>>>>>> Christian. >>>>>> >>>>>> >>>>>> I don't say we care if it fired once or twice (I need to add a fix >>>>>> to only insert reset work to pending reset list if it's not >>>>>> already there), the point of using list (or array) to which you >>>>>> add and from which you remove is that the logic of this is >>>>>> encapsulated within reset domain. In your way we need to be aware >>>>>> who exactly schedules reset work and explicitly cancel them, this >>>>>> also means that for any new source added in the future you will >>>>>> need to remember to add him >>>>> >>>>> I don't think that this is a valid argument. 
Additionally to the >>>>> schedulers we probably just need less than a handful of reset >>>>> sources, most likely even just one or two is enough. >>>>> >>>>> The only justification I can see of having additional separate >>>>> reset sources would be if somebody wants to know if a specific >>>>> source has been handled or not (e.g. call flush_work() or >>>>> work_pending()). Like in the case of a reset triggered through >>>>> debugfs. >>>> >>>> >>>> This is indeed one reason, another is as we said before that if you >>>> share 'reset source' (meaning a delayed work) with another client >>>> (i.e. RAS and KFD) it means you make assumption that the other >>>> client always proceeds with the >>>> reset exactly the same way as you expect. So today we have this only >>>> in scheduler vs non scheduler reset happening - non scheduler reset >>>> clients assume the reset is always fully executed in HW while >>>> scheduler based reset makes shortcuts and not always does HW reset >>>> hence they cannot share 'reset source' (delayed work). Yes, we can >>>> always add this in the future if and when such problem will arise >>>> but no one will remember this then and a new bug will be introduced >>>> and will take time to find and resolve. >>> >>> Mhm, so your main concern is that we forget to correctly handle the >>> new reset sources? >>> >>> How about we do it like this then: >>> >>> struct amdgpu_reset_domain { >>> .... >>> union { >>> struct { >>> struct work_item debugfs; >>> struct work_item ras; >>> .... >>> }; >>> struct work_item array[] >>> } reset_sources; >>> } >>> >> >> If it's only about static array, >> >> enum amdgpu_reset_soruce { >> >> AMDGPU_RESET_SRC_RAS, >> AMDGPU_RESET_SRC_ABC, >> ..... >> AMDGPU_RESET_SRC_XYZ, >> AMDGPU_RESET_SRC_MAX >> >> }; >> >> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for >> each work item >> >> >> Thanks, >> Lijo > > > It's possible though it makes harder to generalize reset_domain later > for other drivers. 
The current reset domain queue design is not good for a hierarchichal reset within amdgpu itself :) Thanks, Lijo > But still one caveat, look at amdgpu_recover_work_struct and it's usage > in amdgpu_device_gpu_recover and in gpu_recover_get, > At least for debugfs i need to return back the result of GPU reset and > so I cannot store actual work items in the array mentioned above > but rather pointers to work_item because i need a way to get back the > return value from gpu_recover like I do now in amdgpu_device_gpu_recover. > > Andrey > > >> >>> Not 100% sure if that works, but something like that should do the >>> trick. >>> >>> My main concern is that I don't want to allocate the work items on >>> the stack and dynamic allocation (e.g. kmalloc) is usually not >>> possible either. >>> >>> Additional to that putting/removing work items from a list, array or >>> other container is a very common source for race conditions. >>> >>> Regards, >>> Christian. >>> >>>> >>>>>> to the cancellation list which you showed above. In current way >>>>>> all this done automatically within reset_domain code and it's >>>>>> agnostic to specific driver and it's specific list of reset >>>>>> sources. Also in case we would want to generalize reset_domain to >>>>>> other GPU drivers (which was >>>>>> a plan as far as i remember) this explicit mention of each reset >>>>>> works for cancellation is again less suitable in my opinion. >>>>> >>>>> Well we could put the work item for the scheduler independent reset >>>>> source into the reset domain as well. But I'm not sure those >>>>> additional reset sources should be part of any common handling, >>>>> that is largely amdgpu specific. >>>> >>>> >>>> So it's for sure more then one source for the reasons described >>>> above, also note that for scheduler we already cancel delayed work >>>> in drm_sched_stop so calling them again in amdgpu code kind of >>>> superfluous. >>>> >>>> Andrey >>>> >>>> >>>>> >>>>> Christian. 
>>>>> >>>>>> >>>>>> Andrey >>>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>> Andrey >>>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see a >>>>>>>>>> possible race where a new reset request is being generated >>>>>>>>>> exactly after we finished the HW reset but before we canceled >>>>>>>>>> out all pending resets - in such case you wold not want to >>>>>>>>>> cancel this 'border line new' reset request. >>>>>>>>> >>>>>>>>> Why not? Any new reset request directly after a hardware reset >>>>>>>>> is most likely just falsely generated by the reset itself. >>>>>>>>> >>>>>>>>> Ideally I would cancel all sources after the reset, but before >>>>>>>>> starting any new work. >>>>>>>>> >>>>>>>>> Regards, >>>>>>>>> Christian. >>>>>>>>> >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> Andrey >>>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Regards, >>>>>>>>>>> Christian. >>>>>>>>>>> >>>>>>>>>>>>> You can see that if many different reset sources share same >>>>>>>>>>>>> work struct what can happen is that the first to obtain the >>>>>>>>>>>>> lock you describe bellow might opt out from full HW reset >>>>>>>>>>>>> because his bad job did signal for example or because his >>>>>>>>>>>>> hunged IP block was able to recover through SW reset but in >>>>>>>>>>>>> the meantime another reset source who needed an actual HW >>>>>>>>>>>>> reset just silently returned and we end up with unhandled >>>>>>>>>>>>> reset request. 
True that today this happens only to job >>>>>>>>>>>>> timeout reset sources that are handled form within the >>>>>>>>>>>>> scheduler and won't use this single work struct but no one >>>>>>>>>>>>> prevents a future case for this to happen and also, if we >>>>>>>>>>>>> actually want to unify scheduler time out handlers within >>>>>>>>>>>>> reset domain (which seems to me the right design approach) >>>>>>>>>>>>> we won't be able to use just one work struct for this >>>>>>>>>>>>> reason anyway. >>>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Just to add to this point - a reset domain is co-operative >>>>>>>>>>>> domain. In addition to sharing a set of clients from various >>>>>>>>>>>> reset sources for one device, it also will have a set of >>>>>>>>>>>> devices like in XGMI hive. The job timeout on one device may >>>>>>>>>>>> not eventually result in result, but a RAS error happening >>>>>>>>>>>> on another device at the same time would need a reset. The >>>>>>>>>>>> second device's RAS error cannot return seeing that a reset >>>>>>>>>>>> work already started, or ignore the reset work given that >>>>>>>>>>>> another device has filled the reset data. >>>>>>>>>>>> >>>>>>>>>>>> When there is a reset domain, it should take care of the >>>>>>>>>>>> work scheduled and keeping it in device or any other level >>>>>>>>>>>> doesn't sound good. >>>>>>>>>>>> >>>>>>>>>>>> Thanks, >>>>>>>>>>>> Lijo >>>>>>>>>>>> >>>>>>>>>>>>> Andrey >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the >>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to >>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using an >>>>>>>>>>>>>> atomic counter in the shared work struct that gets >>>>>>>>>>>>>> incremented when a client wants to trigger a reset >>>>>>>>>>>>>> (atomic_add_return). 
If that counter is exactly 1 after >>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work >>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already >>>>>>>>>>>>>> scheduled (or another client is in the process of >>>>>>>>>>>>>> scheduling it) and you just return. When the worker >>>>>>>>>>>>>> finishes (after confirming a successful reset), it resets >>>>>>>>>>>>>> the counter to 0, so the next client requesting a reset >>>>>>>>>>>>>> will schedule the worker again. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>> Felix >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Additional to that keep in mind that you can't allocate >>>>>>>>>>>>>>>> any memory before or during the GPU reset nor wait for >>>>>>>>>>>>>>>> the reset to complete (so you can't allocate anything on >>>>>>>>>>>>>>>> the stack either). >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover is >>>>>>>>>>>>>>> different from any other local variable we allocate in >>>>>>>>>>>>>>> any function we call ? >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for reset >>>>>>>>>>>>>>> to complete ? Also, see in amdgpu_ras_do_recovery and >>>>>>>>>>>>>>> gpu_recover_get (debugfs) - the caller expects the reset >>>>>>>>>>>>>>> to complete before he returns. I can probably work around >>>>>>>>>>>>>>> it in RAS code by calling atomic_set(&ras->in_recovery, >>>>>>>>>>>>>>> 0) from some callback within actual reset function but >>>>>>>>>>>>>>> regarding sysfs it actually expects a result returned >>>>>>>>>>>>>>> indicating whether the call was successful or not. 
>>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach where >>>>>>>>>>>>>>>>> this logic (the work items) are held and handled in >>>>>>>>>>>>>>>>> reset_domain and are not split in each adev or any >>>>>>>>>>>>>>>>> other entity. We might want in the future to even move >>>>>>>>>>>>>>>>> the scheduler handling into reset domain since reset >>>>>>>>>>>>>>>>> domain is supposed to be a generic things and not only >>>>>>>>>>>>>>>>> or AMD. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>> 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will 
reset >>>>>>>>>>>>>>>>>>>>> now anyway */ >>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset >>>>>>>>>>>>>>>>>>>>> for SRIOV */ >>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> struct 
amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>> --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; 
>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>> 
amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct >>>>>>>>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) >>>>>>>>>>>>>>>>>>>>> /* vBIOS is 
sr-iov ready */ >>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* >>>>>>>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); 
>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device 
*adev) >>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 
3 */ >>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>> >>>>>>> >>>>> >>> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-11 15:37 ` Lazar, Lijo @ 2022-05-11 15:43 ` Andrey Grodzovsky 2022-05-11 15:46 ` Lazar, Lijo 0 siblings, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-11 15:43 UTC (permalink / raw) To: Lazar, Lijo, Christian König, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy On 2022-05-11 11:37, Lazar, Lijo wrote: > > > On 5/11/2022 9:05 PM, Andrey Grodzovsky wrote: >> >> On 2022-05-11 11:20, Lazar, Lijo wrote: >>> >>> >>> On 5/11/2022 7:28 PM, Christian König wrote: >>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >>>>> On 2022-05-11 03:38, Christian König wrote: >>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>>>>> [SNIP] >>>>>>>> E.g. in the reset code (either before or after the reset, >>>>>>>> that's debatable) you do something like this: >>>>>>>> >>>>>>>> for (i = 0; i < num_ring; ++i) >>>>>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>>>>> cancel_work(adev->ras_work); >>>>>>>> cancel_work(adev->iofault_work); >>>>>>>> cancel_work(adev->debugfs_work); >>>>>>>> ... >>>>>>>> >>>>>>>> You don't really need to track which reset source has fired and >>>>>>>> which hasn't, because that would be racy again. Instead just >>>>>>>> bluntly reset all possible sources. >>>>>>>> >>>>>>>> Christian. >>>>>>> >>>>>>> >>>>>>> I don't say we care if it fired once or twice (I need to add a >>>>>>> fix to only insert reset work to pending reset list if it's not >>>>>>> already there), the point of using list (or array) to which you >>>>>>> add and from which you remove is that the logic of this is >>>>>>> encapsulated within reset domain. In your way we need to be >>>>>>> aware who exactly schedules reset work and explicitly cancel >>>>>>> them, this also means that for any new source added in the >>>>>>> future you will need to remember to add him >>>>>> >>>>>> I don't think that this is a valid argument. 
Additionally to the >>>>>> schedulers we probably just need less than a handful of reset >>>>>> sources, most likely even just one or two is enough. >>>>>> >>>>>> The only justification I can see of having additional separate >>>>>> reset sources would be if somebody wants to know if a specific >>>>>> source has been handled or not (e.g. call flush_work() or >>>>>> work_pending()). Like in the case of a reset triggered through >>>>>> debugfs. >>>>> >>>>> >>>>> This is indeed one reason, another is as we said before that if >>>>> you share 'reset source' (meaning a delayed work) with another >>>>> client (i.e. RAS and KFD) it means you make assumption that the >>>>> other client always proceeds with the >>>>> reset exactly the same way as you expect. So today we have this >>>>> only in scheduler vs non scheduler reset happening - non scheduler >>>>> reset clients assume the reset is always fully executed in HW >>>>> while scheduler based reset makes shortcuts and not always does HW >>>>> reset hence they cannot share 'reset source' (delayed work). Yes, >>>>> we can always add this in the future if and when such problem will >>>>> arise but no one will remember this then and a new bug will be >>>>> introduced and will take time to find and resolve. >>>> >>>> Mhm, so your main concern is that we forget to correctly handle the >>>> new reset sources? >>>> >>>> How about we do it like this then: >>>> >>>> struct amdgpu_reset_domain { >>>> .... >>>> union { >>>> struct { >>>> struct work_item debugfs; >>>> struct work_item ras; >>>> .... >>>> }; >>>> struct work_item array[] >>>> } reset_sources; >>>> } >>>> >>> >>> If it's only about static array, >>> >>> enum amdgpu_reset_soruce { >>> >>> AMDGPU_RESET_SRC_RAS, >>> AMDGPU_RESET_SRC_ABC, >>> ..... 
>>> AMDGPU_RESET_SRC_XYZ, >>> AMDGPU_RESET_SRC_MAX >>> >>> }; >>> >>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for >>> each work item >>> >>> >>> Thanks, >>> Lijo >> >> >> It's possible though it makes harder to generalize reset_domain later >> for other drivers. > > The current reset domain queue design is not good for a hierarchichal > reset within amdgpu itself :) > > Thanks, > Lijo Not sure what do you mean ? Andrey > >> But still one caveat, look at amdgpu_recover_work_struct and it's >> usage in amdgpu_device_gpu_recover and in gpu_recover_get, >> At least for debugfs i need to return back the result of GPU reset >> and so I cannot store actual work items in the array mentioned above >> but rather pointers to work_item because i need a way to get back the >> return value from gpu_recover like I do now in >> amdgpu_device_gpu_recover. >> >> Andrey >> >> >>> >>>> Not 100% sure if that works, but something like that should do the >>>> trick. >>>> >>>> My main concern is that I don't want to allocate the work items on >>>> the stack and dynamic allocation (e.g. kmalloc) is usually not >>>> possible either. >>>> >>>> Additional to that putting/removing work items from a list, array >>>> or other container is a very common source for race conditions. >>>> >>>> Regards, >>>> Christian. >>>> >>>>> >>>>>>> to the cancellation list which you showed above. In current way >>>>>>> all this done automatically within reset_domain code and it's >>>>>>> agnostic to specific driver and it's specific list of reset >>>>>>> sources. Also in case we would want to generalize reset_domain >>>>>>> to other GPU drivers (which was >>>>>>> a plan as far as i remember) this explicit mention of each reset >>>>>>> works for cancellation is again less suitable in my opinion. >>>>>> >>>>>> Well we could put the work item for the scheduler independent >>>>>> reset source into the reset domain as well. 
But I'm not sure >>>>>> those additional reset sources should be part of any common >>>>>> handling, that is largely amdgpu specific. >>>>> >>>>> >>>>> So it's for sure more then one source for the reasons described >>>>> above, also note that for scheduler we already cancel delayed work >>>>> in drm_sched_stop so calling them again in amdgpu code kind of >>>>> superfluous. >>>>> >>>>> Andrey >>>>> >>>>> >>>>>> >>>>>> Christian. >>>>>> >>>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> Andrey >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see >>>>>>>>>>> a possible race where a new reset request is being generated >>>>>>>>>>> exactly after we finished the HW reset but before we >>>>>>>>>>> canceled out all pending resets - in such case you wold not >>>>>>>>>>> want to cancel this 'border line new' reset request. >>>>>>>>>> >>>>>>>>>> Why not? Any new reset request directly after a hardware >>>>>>>>>> reset is most likely just falsely generated by the reset itself. >>>>>>>>>> >>>>>>>>>> Ideally I would cancel all sources after the reset, but >>>>>>>>>> before starting any new work. >>>>>>>>>> >>>>>>>>>> Regards, >>>>>>>>>> Christian. >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Andrey >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Regards, >>>>>>>>>>>> Christian. >>>>>>>>>>>> >>>>>>>>>>>>>> You can see that if many different reset sources share >>>>>>>>>>>>>> same work struct what can happen is that the first to >>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from >>>>>>>>>>>>>> full HW reset because his bad job did signal for example >>>>>>>>>>>>>> or because his hunged IP block was able to recover >>>>>>>>>>>>>> through SW reset but in the meantime another reset source >>>>>>>>>>>>>> who needed an actual HW reset just silently returned and >>>>>>>>>>>>>> we end up with unhandled reset request. 
True that today >>>>>>>>>>>>>> this happens only to job timeout reset sources that are >>>>>>>>>>>>>> handled form within the scheduler and won't use this >>>>>>>>>>>>>> single work struct but no one prevents a future case for >>>>>>>>>>>>>> this to happen and also, if we actually want to unify >>>>>>>>>>>>>> scheduler time out handlers within reset domain (which >>>>>>>>>>>>>> seems to me the right design approach) we won't be able >>>>>>>>>>>>>> to use just one work struct for this reason anyway. >>>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Just to add to this point - a reset domain is co-operative >>>>>>>>>>>>> domain. In addition to sharing a set of clients from >>>>>>>>>>>>> various reset sources for one device, it also will have a >>>>>>>>>>>>> set of devices like in XGMI hive. The job timeout on one >>>>>>>>>>>>> device may not eventually result in result, but a RAS >>>>>>>>>>>>> error happening on another device at the same time would >>>>>>>>>>>>> need a reset. The second device's RAS error cannot return >>>>>>>>>>>>> seeing that a reset work already started, or ignore the >>>>>>>>>>>>> reset work given that another device has filled the reset >>>>>>>>>>>>> data. >>>>>>>>>>>>> >>>>>>>>>>>>> When there is a reset domain, it should take care of the >>>>>>>>>>>>> work scheduled and keeping it in device or any other level >>>>>>>>>>>>> doesn't sound good. >>>>>>>>>>>>> >>>>>>>>>>>>> Thanks, >>>>>>>>>>>>> Lijo >>>>>>>>>>>>> >>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the >>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to >>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using >>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets >>>>>>>>>>>>>>> incremented when a client wants to trigger a reset >>>>>>>>>>>>>>> (atomic_add_return). 
If that counter is exactly 1 after >>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work >>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already >>>>>>>>>>>>>>> scheduled (or another client is in the process of >>>>>>>>>>>>>>> scheduling it) and you just return. When the worker >>>>>>>>>>>>>>> finishes (after confirming a successful reset), it >>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting a >>>>>>>>>>>>>>> reset will schedule the worker again. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>> Felix >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't >>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset nor >>>>>>>>>>>>>>>>> wait for the reset to complete (so you can't allocate >>>>>>>>>>>>>>>>> anything on the stack either). >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover >>>>>>>>>>>>>>>> is different from any other local variable we allocate >>>>>>>>>>>>>>>> in any function we call ? >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for >>>>>>>>>>>>>>>> reset to complete ? Also, see in amdgpu_ras_do_recovery >>>>>>>>>>>>>>>> and gpu_recover_get (debugfs) - the caller expects the >>>>>>>>>>>>>>>> reset to complete before he returns. I can probably >>>>>>>>>>>>>>>> work around it in RAS code by calling >>>>>>>>>>>>>>>> atomic_set(&ras->in_recovery, 0) from some callback >>>>>>>>>>>>>>>> within actual reset function but regarding sysfs it >>>>>>>>>>>>>>>> actually expects a result returned indicating whether >>>>>>>>>>>>>>>> the call was successful or not. 
>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach >>>>>>>>>>>>>>>>>> where this logic (the work items) are held and >>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each >>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the future >>>>>>>>>>>>>>>>>> to even move the scheduler handling into reset domain >>>>>>>>>>>>>>>>>> since reset domain is supposed to be a generic things >>>>>>>>>>>>>>>>>> and not only or AMD. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 >>>>>>>>>>>>>>>>>>>>>> +++-- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 >>>>>>>>>>>>>>>>>>>>>> deletions(-) >>>>>>>>>>>>>>>>>>>>>> 
>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>>>> 
atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will >>>>>>>>>>>>>>>>>>>>>> reset now anyway */ >>>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset >>>>>>>>>>>>>>>>>>>>>> for SRIOV */ >>>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>>>>> } 
>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>>> } 
>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>> 
unsigned long flags; >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>>> + } 
>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, >>>>>>>>>>>>>>>>>>>>>> &domain->pending_works); >>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>>> --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << >>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) >>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); 
>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - 
struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops >>>>>>>>>>>>>>>>>>>>>> = { >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>>>> 
xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops >>>>>>>>>>>>>>>>>>>>>> = { >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>> >>>>>>>> >>>>>> >>>> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-11 15:43 ` Andrey Grodzovsky @ 2022-05-11 15:46 ` Lazar, Lijo 2022-05-11 15:53 ` Andrey Grodzovsky 0 siblings, 1 reply; 40+ messages in thread From: Lazar, Lijo @ 2022-05-11 15:46 UTC (permalink / raw) To: Andrey Grodzovsky, Christian König, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy On 5/11/2022 9:13 PM, Andrey Grodzovsky wrote: > > On 2022-05-11 11:37, Lazar, Lijo wrote: >> >> >> On 5/11/2022 9:05 PM, Andrey Grodzovsky wrote: >>> >>> On 2022-05-11 11:20, Lazar, Lijo wrote: >>>> >>>> >>>> On 5/11/2022 7:28 PM, Christian König wrote: >>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >>>>>> On 2022-05-11 03:38, Christian König wrote: >>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>>>>>> [SNIP] >>>>>>>>> E.g. in the reset code (either before or after the reset, >>>>>>>>> that's debatable) you do something like this: >>>>>>>>> >>>>>>>>> for (i = 0; i < num_ring; ++i) >>>>>>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>>>>>> cancel_work(adev->ras_work); >>>>>>>>> cancel_work(adev->iofault_work); >>>>>>>>> cancel_work(adev->debugfs_work); >>>>>>>>> ... >>>>>>>>> >>>>>>>>> You don't really need to track which reset source has fired and >>>>>>>>> which hasn't, because that would be racy again. Instead just >>>>>>>>> bluntly reset all possible sources. >>>>>>>>> >>>>>>>>> Christian. >>>>>>>> >>>>>>>> >>>>>>>> I don't say we care if it fired once or twice (I need to add a >>>>>>>> fix to only insert reset work to pending reset list if it's not >>>>>>>> already there), the point of using list (or array) to which you >>>>>>>> add and from which you remove is that the logic of this is >>>>>>>> encapsulated within reset domain. 
In your way we need to be >>>>>>>> aware who exactly schedules reset work and explicitly cancel >>>>>>>> them, this also means that for any new source added in the >>>>>>>> future you will need to remember to add him >>>>>>> >>>>>>> I don't think that this is a valid argument. Additionally to the >>>>>>> schedulers we probably just need less than a handful of reset >>>>>>> sources, most likely even just one or two is enough. >>>>>>> >>>>>>> The only justification I can see of having additional separate >>>>>>> reset sources would be if somebody wants to know if a specific >>>>>>> source has been handled or not (e.g. call flush_work() or >>>>>>> work_pending()). Like in the case of a reset triggered through >>>>>>> debugfs. >>>>>> >>>>>> >>>>>> This is indeed one reason, another is as we said before that if >>>>>> you share 'reset source' (meaning a delayed work) with another >>>>>> client (i.e. RAS and KFD) it means you make assumption that the >>>>>> other client always proceeds with the >>>>>> reset exactly the same way as you expect. So today we have this >>>>>> only in scheduler vs non scheduler reset happening - non scheduler >>>>>> reset clients assume the reset is always fully executed in HW >>>>>> while scheduler based reset makes shortcuts and not always does HW >>>>>> reset hence they cannot share 'reset source' (delayed work). Yes, >>>>>> we can always add this in the future if and when such problem will >>>>>> arise but no one will remember this then and a new bug will be >>>>>> introduced and will take time to find and resolve. >>>>> >>>>> Mhm, so your main concern is that we forget to correctly handle the >>>>> new reset sources? >>>>> >>>>> How about we do it like this then: >>>>> >>>>> struct amdgpu_reset_domain { >>>>> .... >>>>> union { >>>>> struct { >>>>> struct work_item debugfs; >>>>> struct work_item ras; >>>>> .... 
>>>>> }; >>>>> struct work_item array[] >>>>> } reset_sources; >>>>> } >>>>> >>>> >>>> If it's only about static array, >>>> >>>> enum amdgpu_reset_soruce { >>>> >>>> AMDGPU_RESET_SRC_RAS, >>>> AMDGPU_RESET_SRC_ABC, >>>> ..... >>>> AMDGPU_RESET_SRC_XYZ, >>>> AMDGPU_RESET_SRC_MAX >>>> >>>> }; >>>> >>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for >>>> each work item >>>> >>>> >>>> Thanks, >>>> Lijo >>> >>> >>> It's possible though it makes harder to generalize reset_domain later >>> for other drivers. >> >> The current reset domain queue design is not good for a hierarchichal >> reset within amdgpu itself :) >> >> Thanks, >> Lijo > > > Not sure what do you mean ? > It's tied to the TDR queue in scheduler. Hierarchical model - start from reset of lowest level nodes and on failure try with a higher level reset. This model doesn't suit that. Thanks, Lijo > Andrey > > >> >>> But still one caveat, look at amdgpu_recover_work_struct and it's >>> usage in amdgpu_device_gpu_recover and in gpu_recover_get, >>> At least for debugfs i need to return back the result of GPU reset >>> and so I cannot store actual work items in the array mentioned above >>> but rather pointers to work_item because i need a way to get back the >>> return value from gpu_recover like I do now in >>> amdgpu_device_gpu_recover. >>> >>> Andrey >>> >>> >>>> >>>>> Not 100% sure if that works, but something like that should do the >>>>> trick. >>>>> >>>>> My main concern is that I don't want to allocate the work items on >>>>> the stack and dynamic allocation (e.g. kmalloc) is usually not >>>>> possible either. >>>>> >>>>> Additional to that putting/removing work items from a list, array >>>>> or other container is a very common source for race conditions. >>>>> >>>>> Regards, >>>>> Christian. >>>>> >>>>>> >>>>>>>> to the cancellation list which you showed above. 
In current way >>>>>>>> all this done automatically within reset_domain code and it's >>>>>>>> agnostic to specific driver and it's specific list of reset >>>>>>>> sources. Also in case we would want to generalize reset_domain >>>>>>>> to other GPU drivers (which was >>>>>>>> a plan as far as i remember) this explicit mention of each reset >>>>>>>> works for cancellation is again less suitable in my opinion. >>>>>>> >>>>>>> Well we could put the work item for the scheduler independent >>>>>>> reset source into the reset domain as well. But I'm not sure >>>>>>> those additional reset sources should be part of any common >>>>>>> handling, that is largely amdgpu specific. >>>>>> >>>>>> >>>>>> So it's for sure more then one source for the reasons described >>>>>> above, also note that for scheduler we already cancel delayed work >>>>>> in drm_sched_stop so calling them again in amdgpu code kind of >>>>>> superfluous. >>>>>> >>>>>> Andrey >>>>>> >>>>>> >>>>>>> >>>>>>> Christian. >>>>>>> >>>>>>>> >>>>>>>> Andrey >>>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> Andrey >>>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see >>>>>>>>>>>> a possible race where a new reset request is being generated >>>>>>>>>>>> exactly after we finished the HW reset but before we >>>>>>>>>>>> canceled out all pending resets - in such case you wold not >>>>>>>>>>>> want to cancel this 'border line new' reset request. >>>>>>>>>>> >>>>>>>>>>> Why not? Any new reset request directly after a hardware >>>>>>>>>>> reset is most likely just falsely generated by the reset itself. >>>>>>>>>>> >>>>>>>>>>> Ideally I would cancel all sources after the reset, but >>>>>>>>>>> before starting any new work. >>>>>>>>>>> >>>>>>>>>>> Regards, >>>>>>>>>>> Christian. 
>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Andrey >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Regards, >>>>>>>>>>>>> Christian. >>>>>>>>>>>>> >>>>>>>>>>>>>>> You can see that if many different reset sources share >>>>>>>>>>>>>>> same work struct what can happen is that the first to >>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from >>>>>>>>>>>>>>> full HW reset because his bad job did signal for example >>>>>>>>>>>>>>> or because his hunged IP block was able to recover >>>>>>>>>>>>>>> through SW reset but in the meantime another reset source >>>>>>>>>>>>>>> who needed an actual HW reset just silently returned and >>>>>>>>>>>>>>> we end up with unhandled reset request. True that today >>>>>>>>>>>>>>> this happens only to job timeout reset sources that are >>>>>>>>>>>>>>> handled form within the scheduler and won't use this >>>>>>>>>>>>>>> single work struct but no one prevents a future case for >>>>>>>>>>>>>>> this to happen and also, if we actually want to unify >>>>>>>>>>>>>>> scheduler time out handlers within reset domain (which >>>>>>>>>>>>>>> seems to me the right design approach) we won't be able >>>>>>>>>>>>>>> to use just one work struct for this reason anyway. >>>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> Just to add to this point - a reset domain is co-operative >>>>>>>>>>>>>> domain. In addition to sharing a set of clients from >>>>>>>>>>>>>> various reset sources for one device, it also will have a >>>>>>>>>>>>>> set of devices like in XGMI hive. The job timeout on one >>>>>>>>>>>>>> device may not eventually result in result, but a RAS >>>>>>>>>>>>>> error happening on another device at the same time would >>>>>>>>>>>>>> need a reset. The second device's RAS error cannot return >>>>>>>>>>>>>> seeing that a reset work already started, or ignore the >>>>>>>>>>>>>> reset work given that another device has filled the reset >>>>>>>>>>>>>> data. 
>>>>>>>>>>>>>> >>>>>>>>>>>>>> When there is a reset domain, it should take care of the >>>>>>>>>>>>>> work scheduled and keeping it in device or any other level >>>>>>>>>>>>>> doesn't sound good. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Thanks, >>>>>>>>>>>>>> Lijo >>>>>>>>>>>>>> >>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the >>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to >>>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using >>>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets >>>>>>>>>>>>>>>> incremented when a client wants to trigger a reset >>>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after >>>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work >>>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already >>>>>>>>>>>>>>>> scheduled (or another client is in the process of >>>>>>>>>>>>>>>> scheduling it) and you just return. When the worker >>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it >>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting a >>>>>>>>>>>>>>>> reset will schedule the worker again. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>> Felix >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't >>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset nor >>>>>>>>>>>>>>>>>> wait for the reset to complete (so you can't allocate >>>>>>>>>>>>>>>>>> anything on the stack either). 
>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover >>>>>>>>>>>>>>>>> is different from any other local variable we allocate >>>>>>>>>>>>>>>>> in any function we call ? >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for >>>>>>>>>>>>>>>>> reset to complete ? Also, see in amdgpu_ras_do_recovery >>>>>>>>>>>>>>>>> and gpu_recover_get (debugfs) - the caller expects the >>>>>>>>>>>>>>>>> reset to complete before he returns. I can probably >>>>>>>>>>>>>>>>> work around it in RAS code by calling >>>>>>>>>>>>>>>>> atomic_set(&ras->in_recovery, 0) from some callback >>>>>>>>>>>>>>>>> within actual reset function but regarding sysfs it >>>>>>>>>>>>>>>>> actually expects a result returned indicating whether >>>>>>>>>>>>>>>>> the call was successful or not. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach >>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and >>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each >>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the future >>>>>>>>>>>>>>>>>>> to even move the scheduler handling into reset domain >>>>>>>>>>>>>>>>>>> since reset domain is supposed to be a generic things >>>>>>>>>>>>>>>>>>> and not only or AMD. 
>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 >>>>>>>>>>>>>>>>>>>>>>> +++-- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 >>>>>>>>>>>>>>>>>>>>>>> deletions(-) >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will >>>>>>>>>>>>>>>>>>>>>>> reset now anyway */ >>>>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset >>>>>>>>>>>>>>>>>>>>>>> for SRIOV */ 
>>>>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); 
>>>>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base)) >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>> -#include 
"amdgpu.h" >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { 
>>>>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, >>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, 
>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << >>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>>>>>>> 
#define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) >>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> - 
INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct 
amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops >>>>>>>>>>>>>>>>>>>>>>> = { >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct 
amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops >>>>>>>>>>>>>>>>>>>>>>> = { >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>> >>>>>>> >>>>> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-11 15:46 ` Lazar, Lijo @ 2022-05-11 15:53 ` Andrey Grodzovsky 0 siblings, 0 replies; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-11 15:53 UTC (permalink / raw) To: Lazar, Lijo, Christian König, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy On 2022-05-11 11:46, Lazar, Lijo wrote: > > > On 5/11/2022 9:13 PM, Andrey Grodzovsky wrote: >> >> On 2022-05-11 11:37, Lazar, Lijo wrote: >>> >>> >>> On 5/11/2022 9:05 PM, Andrey Grodzovsky wrote: >>>> >>>> On 2022-05-11 11:20, Lazar, Lijo wrote: >>>>> >>>>> >>>>> On 5/11/2022 7:28 PM, Christian König wrote: >>>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >>>>>>> On 2022-05-11 03:38, Christian König wrote: >>>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>>>>>>> [SNIP] >>>>>>>>>> E.g. in the reset code (either before or after the reset, >>>>>>>>>> that's debatable) you do something like this: >>>>>>>>>> >>>>>>>>>> for (i = 0; i < num_ring; ++i) >>>>>>>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>>>>>>> cancel_work(adev->ras_work); >>>>>>>>>> cancel_work(adev->iofault_work); >>>>>>>>>> cancel_work(adev->debugfs_work); >>>>>>>>>> ... >>>>>>>>>> >>>>>>>>>> You don't really need to track which reset source has fired >>>>>>>>>> and which hasn't, because that would be racy again. Instead >>>>>>>>>> just bluntly reset all possible sources. >>>>>>>>>> >>>>>>>>>> Christian. >>>>>>>>> >>>>>>>>> >>>>>>>>> I don't say we care if it fired once or twice (I need to add a >>>>>>>>> fix to only insert reset work to pending reset list if it's >>>>>>>>> not already there), the point of using list (or array) to >>>>>>>>> which you add and from which you remove is that the logic of >>>>>>>>> this is encapsulated within reset domain. 
In your way we need >>>>>>>>> to be aware who exactly schedules reset work and explicitly >>>>>>>>> cancel them, this also means that for any new source added in >>>>>>>>> the future you will need to remember to add him >>>>>>>> >>>>>>>> I don't think that this is a valid argument. Additionally to >>>>>>>> the schedulers we probably just need less than a handful of >>>>>>>> reset sources, most likely even just one or two is enough. >>>>>>>> >>>>>>>> The only justification I can see of having additional separate >>>>>>>> reset sources would be if somebody wants to know if a specific >>>>>>>> source has been handled or not (e.g. call flush_work() or >>>>>>>> work_pending()). Like in the case of a reset triggered through >>>>>>>> debugfs. >>>>>>> >>>>>>> >>>>>>> This is indeed one reason, another is as we said before that if >>>>>>> you share 'reset source' (meaning a delayed work) with another >>>>>>> client (i.e. RAS and KFD) it means you make assumption that the >>>>>>> other client always proceeds with the >>>>>>> reset exactly the same way as you expect. So today we have this >>>>>>> only in scheduler vs non scheduler reset happening - non >>>>>>> scheduler reset clients assume the reset is always fully >>>>>>> executed in HW while scheduler based reset makes shortcuts and >>>>>>> not always does HW reset hence they cannot share 'reset source' >>>>>>> (delayed work). Yes, we can always add this in the future if and >>>>>>> when such problem will arise but no one will remember this then >>>>>>> and a new bug will be introduced and will take time to find and >>>>>>> resolve. >>>>>> >>>>>> Mhm, so your main concern is that we forget to correctly handle >>>>>> the new reset sources? >>>>>> >>>>>> How about we do it like this then: >>>>>> >>>>>> struct amdgpu_reset_domain { >>>>>> .... >>>>>> union { >>>>>> struct { >>>>>> struct work_item debugfs; >>>>>> struct work_item ras; >>>>>> .... 
>>>>>> }; >>>>>> struct work_item array[] >>>>>> } reset_sources; >>>>>> } >>>>>> >>>>> >>>>> If it's only about static array, >>>>> >>>>> enum amdgpu_reset_soruce { >>>>> >>>>> AMDGPU_RESET_SRC_RAS, >>>>> AMDGPU_RESET_SRC_ABC, >>>>> ..... >>>>> AMDGPU_RESET_SRC_XYZ, >>>>> AMDGPU_RESET_SRC_MAX >>>>> >>>>> }; >>>>> >>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index >>>>> for each work item >>>>> >>>>> >>>>> Thanks, >>>>> Lijo >>>> >>>> >>>> It's possible though it makes harder to generalize reset_domain >>>> later for other drivers. >>> >>> The current reset domain queue design is not good for a >>> hierarchichal reset within amdgpu itself :) >>> >>> Thanks, >>> Lijo >> >> >> Not sure what do you mean ? >> > > It's tied to the TDR queue in scheduler. > > Hierarchichal model - start from reset of lowest level nodes and on > failure try with a higher level reset. This model doesn't suit that. > > Thanks, > Lijo The TDR queue just provides a single threaded context to execute all resets. It has no restrictions on what exactly you reset within each work item on the queue so I still don't see a problem. I also don't understand what is lower level vs higher level nodes in our case. Is it single node vs hive ? Andrey > >> Andrey >> >> >>> >>>> But still one caveat, look at amdgpu_recover_work_struct and it's >>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get, >>>> At least for debugfs i need to return back the result of GPU reset >>>> and so I cannot store actual work items in the array mentioned above >>>> but rather pointers to work_item because i need a way to get back >>>> the return value from gpu_recover like I do now in >>>> amdgpu_device_gpu_recover. >>>> >>>> Andrey >>>> >>>> >>>>> >>>>>> Not 100% sure if that works, but something like that should do >>>>>> the trick. >>>>>> >>>>>> My main concern is that I don't want to allocate the work items >>>>>> on the stack and dynamic allocation (e.g. 
kmalloc) is usually not >>>>>> possible either. >>>>>> >>>>>> Additional to that putting/removing work items from a list, array >>>>>> or other container is a very common source for race conditions. >>>>>> >>>>>> Regards, >>>>>> Christian. >>>>>> >>>>>>> >>>>>>>>> to the cancellation list which you showed above. In current >>>>>>>>> way all this done automatically within reset_domain code and >>>>>>>>> it's agnostic to specific driver and it's specific list of >>>>>>>>> reset sources. Also in case we would want to generalize >>>>>>>>> reset_domain to other GPU drivers (which was >>>>>>>>> a plan as far as i remember) this explicit mention of each >>>>>>>>> reset works for cancellation is again less suitable in my >>>>>>>>> opinion. >>>>>>>> >>>>>>>> Well we could put the work item for the scheduler independent >>>>>>>> reset source into the reset domain as well. But I'm not sure >>>>>>>> those additional reset sources should be part of any common >>>>>>>> handling, that is largely amdgpu specific. >>>>>>> >>>>>>> >>>>>>> So it's for sure more then one source for the reasons described >>>>>>> above, also note that for scheduler we already cancel delayed >>>>>>> work in drm_sched_stop so calling them again in amdgpu code kind >>>>>>> of superfluous. >>>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>> Christian. >>>>>>>> >>>>>>>>> >>>>>>>>> Andrey >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Andrey >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I >>>>>>>>>>>>> see a possible race where a new reset request is being >>>>>>>>>>>>> generated exactly after we finished the HW reset but >>>>>>>>>>>>> before we canceled out all pending resets - in such case >>>>>>>>>>>>> you wold not want to cancel this 'border line new' reset >>>>>>>>>>>>> request. >>>>>>>>>>>> >>>>>>>>>>>> Why not? 
Any new reset request directly after a hardware >>>>>>>>>>>> reset is most likely just falsely generated by the reset >>>>>>>>>>>> itself. >>>>>>>>>>>> >>>>>>>>>>>> Ideally I would cancel all sources after the reset, but >>>>>>>>>>>> before starting any new work. >>>>>>>>>>>> >>>>>>>>>>>> Regards, >>>>>>>>>>>> Christian. >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Andrey >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>> >>>>>>>>>>>>>>>> You can see that if many different reset sources share >>>>>>>>>>>>>>>> same work struct what can happen is that the first to >>>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from >>>>>>>>>>>>>>>> full HW reset because his bad job did signal for >>>>>>>>>>>>>>>> example or because his hunged IP block was able to >>>>>>>>>>>>>>>> recover through SW reset but in the meantime another >>>>>>>>>>>>>>>> reset source who needed an actual HW reset just >>>>>>>>>>>>>>>> silently returned and we end up with unhandled reset >>>>>>>>>>>>>>>> request. True that today this happens only to job >>>>>>>>>>>>>>>> timeout reset sources that are handled form within the >>>>>>>>>>>>>>>> scheduler and won't use this single work struct but no >>>>>>>>>>>>>>>> one prevents a future case for this to happen and also, >>>>>>>>>>>>>>>> if we actually want to unify scheduler time out >>>>>>>>>>>>>>>> handlers within reset domain (which seems to me the >>>>>>>>>>>>>>>> right design approach) we won't be able to use just one >>>>>>>>>>>>>>>> work struct for this reason anyway. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Just to add to this point - a reset domain is >>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of >>>>>>>>>>>>>>> clients from various reset sources for one device, it >>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. 
The >>>>>>>>>>>>>>> job timeout on one device may not eventually result in >>>>>>>>>>>>>>> result, but a RAS error happening on another device at >>>>>>>>>>>>>>> the same time would need a reset. The second device's >>>>>>>>>>>>>>> RAS error cannot return seeing that a reset work already >>>>>>>>>>>>>>> started, or ignore the reset work given that another >>>>>>>>>>>>>>> device has filled the reset data. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> When there is a reset domain, it should take care of the >>>>>>>>>>>>>>> work scheduled and keeping it in device or any other >>>>>>>>>>>>>>> level doesn't sound good. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Thanks, >>>>>>>>>>>>>>> Lijo >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the >>>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme >>>>>>>>>>>>>>>>> to decide whether you need to schedule a reset, e.g. >>>>>>>>>>>>>>>>> using an atomic counter in the shared work struct that >>>>>>>>>>>>>>>>> gets incremented when a client wants to trigger a >>>>>>>>>>>>>>>>> reset (atomic_add_return). If that counter is exactly >>>>>>>>>>>>>>>>> 1 after incrementing, you need to fill in the rest of >>>>>>>>>>>>>>>>> the work struct and schedule the work. Otherwise, it's >>>>>>>>>>>>>>>>> already scheduled (or another client is in the process >>>>>>>>>>>>>>>>> of scheduling it) and you just return. When the worker >>>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it >>>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting >>>>>>>>>>>>>>>>> a reset will schedule the worker again. 
>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>> Felix >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't >>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset >>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't >>>>>>>>>>>>>>>>>>> allocate anything on the stack either). >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>>>>>> allocation of work struct in >>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover is different from any other >>>>>>>>>>>>>>>>>> local variable we allocate in any function we call ? >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for >>>>>>>>>>>>>>>>>> reset to complete ? Also, see in >>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) >>>>>>>>>>>>>>>>>> - the caller expects the reset to complete before he >>>>>>>>>>>>>>>>>> returns. I can probably work around it in RAS code by >>>>>>>>>>>>>>>>>> calling atomic_set(&ras->in_recovery, 0) from some >>>>>>>>>>>>>>>>>> callback within actual reset function but regarding >>>>>>>>>>>>>>>>>> sysfs it actually expects a result returned >>>>>>>>>>>>>>>>>> indicating whether the call was successful or not. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>> Christian. 
>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach >>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and >>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each >>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the >>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into >>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be a >>>>>>>>>>>>>>>>>>>> generic things and not only or AMD. >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 >>>>>>>>>>>>>>>>>>>>>>>> +++-- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 >>>>>>>>>>>>>>>>>>>>>>>> deletions(-) >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); 
>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will >>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */ >>>>>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive >>>>>>>>>>>>>>>>>>>>>>>> reset for SRIOV */ >>>>>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>>>>>> 
recover_work->job); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>>> + 
mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> struct 
amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, 
work); >>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, >>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending >>>>>>>>>>>>>>>>>>>>>>>> resets */ >>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); 
>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << >>>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) >>>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>> 
xgpu_ai_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> 
a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); 
>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); 
>>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>> >>>>>>>> >>>>>> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-11 15:35 ` Andrey Grodzovsky 2022-05-11 15:37 ` Lazar, Lijo @ 2022-05-11 15:39 ` Christian König 2022-05-11 15:57 ` Andrey Grodzovsky 2022-05-11 20:27 ` Andrey Grodzovsky 1 sibling, 2 replies; 40+ messages in thread From: Christian König @ 2022-05-11 15:39 UTC (permalink / raw) To: Andrey Grodzovsky, Lazar, Lijo, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky: > On 2022-05-11 11:20, Lazar, Lijo wrote: >> >> >> On 5/11/2022 7:28 PM, Christian König wrote: >>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >>>> On 2022-05-11 03:38, Christian König wrote: >>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>>>> [SNIP] >>>>>>> E.g. in the reset code (either before or after the reset, that's >>>>>>> debatable) you do something like this: >>>>>>> >>>>>>> for (i = 0; i < num_ring; ++i) >>>>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>>>> cancel_work(adev->ras_work); >>>>>>> cancel_work(adev->iofault_work); >>>>>>> cancel_work(adev->debugfs_work); >>>>>>> ... >>>>>>> >>>>>>> You don't really need to track which reset source has fired and >>>>>>> which hasn't, because that would be racy again. Instead just >>>>>>> bluntly reset all possible sources. >>>>>>> >>>>>>> Christian. >>>>>> >>>>>> >>>>>> I don't say we care if it fired once or twice (I need to add a >>>>>> fix to only insert reset work to pending reset list if it's not >>>>>> already there), the point of using list (or array) to which you >>>>>> add and from which you remove is that the logic of this is >>>>>> encapsulated within reset domain. In your way we need to be aware >>>>>> who exactly schedules reset work and explicitly cancel them, this >>>>>> also means that for any new source added in the future you will >>>>>> need to remember to add him >>>>> >>>>> I don't think that this is a valid argument. 
Additionally to the >>>>> schedulers we probably just need less than a handful of reset >>>>> sources, most likely even just one or two is enough. >>>>> >>>>> The only justification I can see of having additional separate >>>>> reset sources would be if somebody wants to know if a specific >>>>> source has been handled or not (e.g. call flush_work() or >>>>> work_pending()). Like in the case of a reset triggered through >>>>> debugfs. >>>> >>>> >>>> This is indeed one reason, another is as we said before that if you >>>> share 'reset source' (meaning a delayed work) with another client >>>> (i.e. RAS and KFD) it means you make assumption that the other >>>> client always proceeds with the >>>> reset exactly the same way as you expect. So today we have this >>>> only in scheduler vs non scheduler reset happening - non scheduler >>>> reset clients assume the reset is always fully executed in HW while >>>> scheduler based reset makes shortcuts and not always does HW reset >>>> hence they cannot share 'reset source' (delayed work). Yes, we can >>>> always add this in the future if and when such problem will arise >>>> but no one will remember this then and a new bug will be introduced >>>> and will take time to find and resolve. >>> >>> Mhm, so your main concern is that we forget to correctly handle the >>> new reset sources? >>> >>> How about we do it like this then: >>> >>> struct amdgpu_reset_domain { >>> .... >>> union { >>> struct { >>> struct work_item debugfs; >>> struct work_item ras; >>> .... >>> }; >>> struct work_item array[] >>> } reset_sources; >>> } >>> >> >> If it's only about static array, >> >> enum amdgpu_reset_soruce { >> >> AMDGPU_RESET_SRC_RAS, >> AMDGPU_RESET_SRC_ABC, >> ..... >> AMDGPU_RESET_SRC_XYZ, >> AMDGPU_RESET_SRC_MAX >> >> }; >> >> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for >> each work item >> >> >> Thanks, >> Lijo > > > It's possible though it makes harder to generalize reset_domain later > for other drivers. 
> But still one caveat, look at amdgpu_recover_work_struct and it's > usage in amdgpu_device_gpu_recover and in gpu_recover_get, > At least for debugfs i need to return back the result of GPU reset and > so I cannot store actual work items in the array mentioned above > but rather pointers to work_item because i need a way to get back the > return value from gpu_recover like I do now in amdgpu_device_gpu_recover. You should try to avoid that as well. See when the debugfs reset is canceled because of a scheduler reset you won't get a useful return code either. What we should do instead is to cache the status of the last reset in the reset domain. Regards, Christian. > > Andrey > > >> >>> Not 100% sure if that works, but something like that should do the >>> trick. >>> >>> My main concern is that I don't want to allocate the work items on >>> the stack and dynamic allocation (e.g. kmalloc) is usually not >>> possible either. >>> >>> Additional to that putting/removing work items from a list, array or >>> other container is a very common source for race conditions. >>> >>> Regards, >>> Christian. >>> >>>> >>>>>> to the cancellation list which you showed above. In current way >>>>>> all this done automatically within reset_domain code and it's >>>>>> agnostic to specific driver and it's specific list of reset >>>>>> sources. Also in case we would want to generalize reset_domain to >>>>>> other GPU drivers (which was >>>>>> a plan as far as i remember) this explicit mention of each reset >>>>>> works for cancellation is again less suitable in my opinion. >>>>> >>>>> Well we could put the work item for the scheduler independent >>>>> reset source into the reset domain as well. But I'm not sure those >>>>> additional reset sources should be part of any common handling, >>>>> that is largely amdgpu specific. 
>>>> >>>> >>>> So it's for sure more then one source for the reasons described >>>> above, also note that for scheduler we already cancel delayed work >>>> in drm_sched_stop so calling them again in amdgpu code kind of >>>> superfluous. >>>> >>>> Andrey >>>> >>>> >>>>> >>>>> Christian. >>>>> >>>>>> >>>>>> Andrey >>>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>> Andrey >>>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see a >>>>>>>>>> possible race where a new reset request is being generated >>>>>>>>>> exactly after we finished the HW reset but before we canceled >>>>>>>>>> out all pending resets - in such case you wold not want to >>>>>>>>>> cancel this 'border line new' reset request. >>>>>>>>> >>>>>>>>> Why not? Any new reset request directly after a hardware reset >>>>>>>>> is most likely just falsely generated by the reset itself. >>>>>>>>> >>>>>>>>> Ideally I would cancel all sources after the reset, but before >>>>>>>>> starting any new work. >>>>>>>>> >>>>>>>>> Regards, >>>>>>>>> Christian. >>>>>>>>> >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> Andrey >>>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Regards, >>>>>>>>>>> Christian. >>>>>>>>>>> >>>>>>>>>>>>> You can see that if many different reset sources share >>>>>>>>>>>>> same work struct what can happen is that the first to >>>>>>>>>>>>> obtain the lock you describe bellow might opt out from >>>>>>>>>>>>> full HW reset because his bad job did signal for example >>>>>>>>>>>>> or because his hunged IP block was able to recover through >>>>>>>>>>>>> SW reset but in the meantime another reset source who >>>>>>>>>>>>> needed an actual HW reset just silently returned and we >>>>>>>>>>>>> end up with unhandled reset request. 
True that today this >>>>>>>>>>>>> happens only to job timeout reset sources that are handled >>>>>>>>>>>>> form within the scheduler and won't use this single work >>>>>>>>>>>>> struct but no one prevents a future case for this to >>>>>>>>>>>>> happen and also, if we actually want to unify scheduler >>>>>>>>>>>>> time out handlers within reset domain (which seems to me >>>>>>>>>>>>> the right design approach) we won't be able to use just >>>>>>>>>>>>> one work struct for this reason anyway. >>>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Just to add to this point - a reset domain is co-operative >>>>>>>>>>>> domain. In addition to sharing a set of clients from >>>>>>>>>>>> various reset sources for one device, it also will have a >>>>>>>>>>>> set of devices like in XGMI hive. The job timeout on one >>>>>>>>>>>> device may not eventually result in result, but a RAS error >>>>>>>>>>>> happening on another device at the same time would need a >>>>>>>>>>>> reset. The second device's RAS error cannot return seeing >>>>>>>>>>>> that a reset work already started, or ignore the reset work >>>>>>>>>>>> given that another device has filled the reset data. >>>>>>>>>>>> >>>>>>>>>>>> When there is a reset domain, it should take care of the >>>>>>>>>>>> work scheduled and keeping it in device or any other level >>>>>>>>>>>> doesn't sound good. >>>>>>>>>>>> >>>>>>>>>>>> Thanks, >>>>>>>>>>>> Lijo >>>>>>>>>>>> >>>>>>>>>>>>> Andrey >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the >>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to >>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using >>>>>>>>>>>>>> an atomic counter in the shared work struct that gets >>>>>>>>>>>>>> incremented when a client wants to trigger a reset >>>>>>>>>>>>>> (atomic_add_return). 
If that counter is exactly 1 after >>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work >>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already >>>>>>>>>>>>>> scheduled (or another client is in the process of >>>>>>>>>>>>>> scheduling it) and you just return. When the worker >>>>>>>>>>>>>> finishes (after confirming a successful reset), it resets >>>>>>>>>>>>>> the counter to 0, so the next client requesting a reset >>>>>>>>>>>>>> will schedule the worker again. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>> Felix >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Additional to that keep in mind that you can't allocate >>>>>>>>>>>>>>>> any memory before or during the GPU reset nor wait for >>>>>>>>>>>>>>>> the reset to complete (so you can't allocate anything >>>>>>>>>>>>>>>> on the stack either). >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover >>>>>>>>>>>>>>> is different from any other local variable we allocate >>>>>>>>>>>>>>> in any function we call ? >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for >>>>>>>>>>>>>>> reset to complete ? Also, see in amdgpu_ras_do_recovery >>>>>>>>>>>>>>> and gpu_recover_get (debugfs) - the caller expects the >>>>>>>>>>>>>>> reset to complete before he returns. I can probably work >>>>>>>>>>>>>>> around it in RAS code by calling >>>>>>>>>>>>>>> atomic_set(&ras->in_recovery, 0) from some callback >>>>>>>>>>>>>>> within actual reset function but regarding sysfs it >>>>>>>>>>>>>>> actually expects a result returned indicating whether >>>>>>>>>>>>>>> the call was successful or not. 
>>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach >>>>>>>>>>>>>>>>> where this logic (the work items) are held and handled >>>>>>>>>>>>>>>>> in reset_domain and are not split in each adev or any >>>>>>>>>>>>>>>>> other entity. We might want in the future to even move >>>>>>>>>>>>>>>>> the scheduler handling into reset domain since reset >>>>>>>>>>>>>>>>> domain is supposed to be a generic things and not only >>>>>>>>>>>>>>>>> or AMD. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 deletions(-) >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>> 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will 
>>>>>>>>>>>>>>>>>>>>> reset now anyway */ >>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset >>>>>>>>>>>>>>>>>>>>> for SRIOV */ >>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> struct 
amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>> --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; 
>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, &domain->pending_works); >>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>> 
amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct >>>>>>>>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) >>>>>>>>>>>>>>>>>>>>> /* vBIOS is 
sr-iov ready */ >>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* >>>>>>>>>>>>>>>>>>>>> sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); 
>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device 
*adev) >>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = container_of(work, >>>>>>>>>>>>>>>>>>>>> struct amdgpu_virt, flr_work.base.work); >>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 
3 */ >>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>> >>>>>>> >>>>> >>> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-11 15:39 ` Christian König @ 2022-05-11 15:57 ` Andrey Grodzovsky 2022-05-12 6:03 ` Christian König 2022-05-11 20:27 ` Andrey Grodzovsky 1 sibling, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-11 15:57 UTC (permalink / raw) To: Christian König, Lazar, Lijo, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy On 2022-05-11 11:39, Christian König wrote: > Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky: >> On 2022-05-11 11:20, Lazar, Lijo wrote: >>> >>> >>> On 5/11/2022 7:28 PM, Christian König wrote: >>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >>>>> On 2022-05-11 03:38, Christian König wrote: >>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>>>>> [SNIP] >>>>>>>> E.g. in the reset code (either before or after the reset, >>>>>>>> that's debatable) you do something like this: >>>>>>>> >>>>>>>> for (i = 0; i < num_ring; ++i) >>>>>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>>>>> cancel_work(adev->ras_work); >>>>>>>> cancel_work(adev->iofault_work); >>>>>>>> cancel_work(adev->debugfs_work); >>>>>>>> ... >>>>>>>> >>>>>>>> You don't really need to track which reset source has fired and >>>>>>>> which hasn't, because that would be racy again. Instead just >>>>>>>> bluntly reset all possible sources. >>>>>>>> >>>>>>>> Christian. >>>>>>> >>>>>>> >>>>>>> I don't say we care if it fired once or twice (I need to add a >>>>>>> fix to only insert reset work to pending reset list if it's not >>>>>>> already there), the point of using list (or array) to which you >>>>>>> add and from which you remove is that the logic of this is >>>>>>> encapsulated within reset domain. In your way we need to be >>>>>>> aware who exactly schedules reset work and explicitly cancel >>>>>>> them, this also means that for any new source added in the >>>>>>> future you will need to remember to add him >>>>>> >>>>>> I don't think that this is a valid argument. 
Additionally to the >>>>>> schedulers we probably just need less than a handful of reset >>>>>> sources, most likely even just one or two is enough. >>>>>> >>>>>> The only justification I can see of having additional separate >>>>>> reset sources would be if somebody wants to know if a specific >>>>>> source has been handled or not (e.g. call flush_work() or >>>>>> work_pending()). Like in the case of a reset triggered through >>>>>> debugfs. >>>>> >>>>> >>>>> This is indeed one reason, another is as we said before that if >>>>> you share 'reset source' (meaning a delayed work) with another >>>>> client (i.e. RAS and KFD) it means you make assumption that the >>>>> other client always proceeds with the >>>>> reset exactly the same way as you expect. So today we have this >>>>> only in scheduler vs non scheduler reset happening - non scheduler >>>>> reset clients assume the reset is always fully executed in HW >>>>> while scheduler based reset makes shortcuts and not always does HW >>>>> reset hence they cannot share 'reset source' (delayed work). Yes, >>>>> we can always add this in the future if and when such problem will >>>>> arise but no one will remember this then and a new bug will be >>>>> introduced and will take time to find and resolve. >>>> >>>> Mhm, so your main concern is that we forget to correctly handle the >>>> new reset sources? >>>> >>>> How about we do it like this then: >>>> >>>> struct amdgpu_reset_domain { >>>> .... >>>> union { >>>> struct { >>>> struct work_item debugfs; >>>> struct work_item ras; >>>> .... >>>> }; >>>> struct work_item array[] >>>> } reset_sources; >>>> } >>>> >>> >>> If it's only about static array, >>> >>> enum amdgpu_reset_soruce { >>> >>> AMDGPU_RESET_SRC_RAS, >>> AMDGPU_RESET_SRC_ABC, >>> ..... 
>>> AMDGPU_RESET_SRC_XYZ, >>> AMDGPU_RESET_SRC_MAX >>> >>> }; >>> >>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for >>> each work item >>> >>> >>> Thanks, >>> Lijo >> >> >> It's possible though it makes harder to generalize reset_domain later >> for other drivers. >> But still one caveat, look at amdgpu_recover_work_struct and it's >> usage in amdgpu_device_gpu_recover and in gpu_recover_get, >> At least for debugfs i need to return back the result of GPU reset >> and so I cannot store actual work items in the array mentioned above >> but rather pointers to work_item because i need a way to get back the >> return value from gpu_recover like I do now in >> amdgpu_device_gpu_recover. > > You should try to avoid that as well. Why ? > > See when the debugfs reset is canceled because of a scheduler reset > you won't get a useful return code either. That true. > > What we should do instead is to cache the status of the last reset in > the reset domain. Probably an atomic ret in reset_domain then. Another 2 points i forgot to ask - 1) What race condition you had in mind about insertion and extraction from the list if all is done under lock ? 2) This I asked already - why you opposed so much to allocation on the stack ? I understand the problem with dynamic memory allocations but why stack ? We do multiple allocations on the stack for any function we call during GPU reset, what so special in this case where we allocate work struct on the stack? It's not like the work struct is especially big compared to other stuff we allocate on the stack. Andrey > > Regards, > Christian. > >> >> Andrey >> >> >>> >>>> Not 100% sure if that works, but something like that should do the >>>> trick. >>>> >>>> My main concern is that I don't want to allocate the work items on >>>> the stack and dynamic allocation (e.g. kmalloc) is usually not >>>> possible either. 
>>>> >>>> Additional to that putting/removing work items from a list, array >>>> or other container is a very common source for race conditions. >>>> >>>> Regards, >>>> Christian. >>>> >>>>> >>>>>>> to the cancellation list which you showed above. In current way >>>>>>> all this done automatically within reset_domain code and it's >>>>>>> agnostic to specific driver and it's specific list of reset >>>>>>> sources. Also in case we would want to generalize reset_domain >>>>>>> to other GPU drivers (which was >>>>>>> a plan as far as i remember) this explicit mention of each reset >>>>>>> works for cancellation is again less suitable in my opinion. >>>>>> >>>>>> Well we could put the work item for the scheduler independent >>>>>> reset source into the reset domain as well. But I'm not sure >>>>>> those additional reset sources should be part of any common >>>>>> handling, that is largely amdgpu specific. >>>>> >>>>> >>>>> So it's for sure more then one source for the reasons described >>>>> above, also note that for scheduler we already cancel delayed work >>>>> in drm_sched_stop so calling them again in amdgpu code kind of >>>>> superfluous. >>>>> >>>>> Andrey >>>>> >>>>> >>>>>> >>>>>> Christian. >>>>>> >>>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> Andrey >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see >>>>>>>>>>> a possible race where a new reset request is being generated >>>>>>>>>>> exactly after we finished the HW reset but before we >>>>>>>>>>> canceled out all pending resets - in such case you wold not >>>>>>>>>>> want to cancel this 'border line new' reset request. >>>>>>>>>> >>>>>>>>>> Why not? Any new reset request directly after a hardware >>>>>>>>>> reset is most likely just falsely generated by the reset itself. 
>>>>>>>>>> >>>>>>>>>> Ideally I would cancel all sources after the reset, but >>>>>>>>>> before starting any new work. >>>>>>>>>> >>>>>>>>>> Regards, >>>>>>>>>> Christian. >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Andrey >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Regards, >>>>>>>>>>>> Christian. >>>>>>>>>>>> >>>>>>>>>>>>>> You can see that if many different reset sources share >>>>>>>>>>>>>> same work struct what can happen is that the first to >>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from >>>>>>>>>>>>>> full HW reset because his bad job did signal for example >>>>>>>>>>>>>> or because his hunged IP block was able to recover >>>>>>>>>>>>>> through SW reset but in the meantime another reset source >>>>>>>>>>>>>> who needed an actual HW reset just silently returned and >>>>>>>>>>>>>> we end up with unhandled reset request. True that today >>>>>>>>>>>>>> this happens only to job timeout reset sources that are >>>>>>>>>>>>>> handled form within the scheduler and won't use this >>>>>>>>>>>>>> single work struct but no one prevents a future case for >>>>>>>>>>>>>> this to happen and also, if we actually want to unify >>>>>>>>>>>>>> scheduler time out handlers within reset domain (which >>>>>>>>>>>>>> seems to me the right design approach) we won't be able >>>>>>>>>>>>>> to use just one work struct for this reason anyway. >>>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Just to add to this point - a reset domain is co-operative >>>>>>>>>>>>> domain. In addition to sharing a set of clients from >>>>>>>>>>>>> various reset sources for one device, it also will have a >>>>>>>>>>>>> set of devices like in XGMI hive. The job timeout on one >>>>>>>>>>>>> device may not eventually result in result, but a RAS >>>>>>>>>>>>> error happening on another device at the same time would >>>>>>>>>>>>> need a reset. 
The second device's RAS error cannot return >>>>>>>>>>>>> seeing that a reset work already started, or ignore the >>>>>>>>>>>>> reset work given that another device has filled the reset >>>>>>>>>>>>> data. >>>>>>>>>>>>> >>>>>>>>>>>>> When there is a reset domain, it should take care of the >>>>>>>>>>>>> work scheduled and keeping it in device or any other level >>>>>>>>>>>>> doesn't sound good. >>>>>>>>>>>>> >>>>>>>>>>>>> Thanks, >>>>>>>>>>>>> Lijo >>>>>>>>>>>>> >>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the >>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to >>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using >>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets >>>>>>>>>>>>>>> incremented when a client wants to trigger a reset >>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after >>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work >>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already >>>>>>>>>>>>>>> scheduled (or another client is in the process of >>>>>>>>>>>>>>> scheduling it) and you just return. When the worker >>>>>>>>>>>>>>> finishes (after confirming a successful reset), it >>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting a >>>>>>>>>>>>>>> reset will schedule the worker again. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>> Felix >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't >>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset nor >>>>>>>>>>>>>>>>> wait for the reset to complete (so you can't allocate >>>>>>>>>>>>>>>>> anything on the stack either). 
>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover >>>>>>>>>>>>>>>> is different from any other local variable we allocate >>>>>>>>>>>>>>>> in any function we call ? >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for >>>>>>>>>>>>>>>> reset to complete ? Also, see in amdgpu_ras_do_recovery >>>>>>>>>>>>>>>> and gpu_recover_get (debugfs) - the caller expects the >>>>>>>>>>>>>>>> reset to complete before he returns. I can probably >>>>>>>>>>>>>>>> work around it in RAS code by calling >>>>>>>>>>>>>>>> atomic_set(&ras->in_recovery, 0) from some callback >>>>>>>>>>>>>>>> within actual reset function but regarding sysfs it >>>>>>>>>>>>>>>> actually expects a result returned indicating whether >>>>>>>>>>>>>>>> the call was successful or not. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach >>>>>>>>>>>>>>>>>> where this logic (the work items) are held and >>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each >>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the future >>>>>>>>>>>>>>>>>> to even move the scheduler handling into reset domain >>>>>>>>>>>>>>>>>> since reset domain is supposed to be a generic things >>>>>>>>>>>>>>>>>> and not only or AMD. 
>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 >>>>>>>>>>>>>>>>>>>>>> +++-- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 >>>>>>>>>>>>>>>>>>>>>> deletions(-) >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { 
>>>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will >>>>>>>>>>>>>>>>>>>>>> reset now anyway */ >>>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset >>>>>>>>>>>>>>>>>>>>>> for SRIOV */ >>>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &work.base)) 
>>>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>>> +#include 
<linux/list.h> >>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { 
>>>>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, >>>>>>>>>>>>>>>>>>>>>> &domain->pending_works); >>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); 
>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << >>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) >>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>>> struct 
amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> 
@@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); 
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops >>>>>>>>>>>>>>>>>>>>>> = { >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops >>>>>>>>>>>>>>>>>>>>>> = { >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>> >>>>>>>> >>>>>> >>>> > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-11 15:57 ` Andrey Grodzovsky @ 2022-05-12 6:03 ` Christian König 2022-05-12 12:57 ` Andrey Grodzovsky 0 siblings, 1 reply; 40+ messages in thread From: Christian König @ 2022-05-12 6:03 UTC (permalink / raw) To: Andrey Grodzovsky, Christian König, Lazar, Lijo, Felix Kuehling, amd-gfx Cc: Bai Zoy Am 11.05.22 um 17:57 schrieb Andrey Grodzovsky: > [SNIP] >>>>> How about we do it like this then: >>>>> >>>>> struct amdgpu_reset_domain { >>>>> .... >>>>> union { >>>>> struct { >>>>> struct work_item debugfs; >>>>> struct work_item ras; >>>>> .... >>>>> }; >>>>> struct work_item array[] >>>>> } reset_sources; >>>>> } >>>>> >>>> >>>> If it's only about static array, >>>> >>>> enum amdgpu_reset_soruce { >>>> >>>> AMDGPU_RESET_SRC_RAS, >>>> AMDGPU_RESET_SRC_ABC, >>>> ..... >>>> AMDGPU_RESET_SRC_XYZ, >>>> AMDGPU_RESET_SRC_MAX >>>> >>>> }; >>>> >>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index >>>> for each work item >>>> >>>> >>>> Thanks, >>>> Lijo >>> >>> >>> It's possible though it makes harder to generalize reset_domain >>> later for other drivers. >>> But still one caveat, look at amdgpu_recover_work_struct and it's >>> usage in amdgpu_device_gpu_recover and in gpu_recover_get, >>> At least for debugfs i need to return back the result of GPU reset >>> and so I cannot store actual work items in the array mentioned above >>> but rather pointers to work_item because i need a way to get back >>> the return value from gpu_recover like I do now in >>> amdgpu_device_gpu_recover. >> >> You should try to avoid that as well. > > > Why ? Because pointers need a lifetime. When the work items are allocated as part of the structure you can be sure that they are not freed not reused for something else. > >> >> What we should do instead is to cache the status of the last reset in >> the reset domain. > > > Probably an atomic ret in reset_domain then. 
> > Another 2 points i forgot to ask - > > 1) What race condition you had in mind about insertion and extraction > from the list if all is done under lock ? A work item is essentially a linked list and a function to call when the worker thread has time to process the item. This means you now have two linked lists representing essentially the same. To make it even worse those two lists are protected by two different locks. The work item list is protected by the worker lock and the reset item by our own. Keeping all that synced up is rather racy. > > 2) This I asked already - why you opposed so much to allocation on the > stack ? I understand the problem with dynamic memory allocations but > why stack ? We do multiple allocations on the stack for any function > we call during GPU reset, what so special in this case where we > allocate work struct on the stack? It's not like the work struct is > especially big compared to other stuff we allocate on the stack. The problem is once more the lifetime. When the reset work items are allocated on the stack we absolutely need to make sure that their pointer is not flying around anywhere when the function returns. Keep in mind that a stack is just another form of memory management. It's certainly possible to get that right, but it's just the more error prone approach. Regards, Christian. > > Andrey > > >> >> Regards, >> Christian. >> >>> >>> Andrey >>> >>> >>>> >>>>> Not 100% sure if that works, but something like that should do the >>>>> trick. >>>>> >>>>> My main concern is that I don't want to allocate the work items on >>>>> the stack and dynamic allocation (e.g. kmalloc) is usually not >>>>> possible either. >>>>> >>>>> Additional to that putting/removing work items from a list, array >>>>> or other container is a very common source for race conditions. >>>>> >>>>> Regards, >>>>> Christian. >>>>> >>>>>> >>>>>>>> to the cancellation list which you showed above. 
In current way >>>>>>>> all this done automatically within reset_domain code and it's >>>>>>>> agnostic to specific driver and it's specific list of reset >>>>>>>> sources. Also in case we would want to generalize reset_domain >>>>>>>> to other GPU drivers (which was >>>>>>>> a plan as far as i remember) this explicit mention of each >>>>>>>> reset works for cancellation is again less suitable in my opinion. >>>>>>> >>>>>>> Well we could put the work item for the scheduler independent >>>>>>> reset source into the reset domain as well. But I'm not sure >>>>>>> those additional reset sources should be part of any common >>>>>>> handling, that is largely amdgpu specific. >>>>>> >>>>>> >>>>>> So it's for sure more then one source for the reasons described >>>>>> above, also note that for scheduler we already cancel delayed >>>>>> work in drm_sched_stop so calling them again in amdgpu code kind >>>>>> of superfluous. >>>>>> >>>>>> Andrey >>>>>> >>>>>> >>>>>>> >>>>>>> Christian. >>>>>>> >>>>>>>> >>>>>>>> Andrey >>>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> Andrey >>>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see >>>>>>>>>>>> a possible race where a new reset request is being >>>>>>>>>>>> generated exactly after we finished the HW reset but before >>>>>>>>>>>> we canceled out all pending resets - in such case you wold >>>>>>>>>>>> not want to cancel this 'border line new' reset request. >>>>>>>>>>> >>>>>>>>>>> Why not? Any new reset request directly after a hardware >>>>>>>>>>> reset is most likely just falsely generated by the reset >>>>>>>>>>> itself. >>>>>>>>>>> >>>>>>>>>>> Ideally I would cancel all sources after the reset, but >>>>>>>>>>> before starting any new work. >>>>>>>>>>> >>>>>>>>>>> Regards, >>>>>>>>>>> Christian. 
>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Andrey >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Regards, >>>>>>>>>>>>> Christian. >>>>>>>>>>>>> >>>>>>>>>>>>>>> You can see that if many different reset sources share >>>>>>>>>>>>>>> same work struct what can happen is that the first to >>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from >>>>>>>>>>>>>>> full HW reset because his bad job did signal for example >>>>>>>>>>>>>>> or because his hunged IP block was able to recover >>>>>>>>>>>>>>> through SW reset but in the meantime another reset >>>>>>>>>>>>>>> source who needed an actual HW reset just silently >>>>>>>>>>>>>>> returned and we end up with unhandled reset request. >>>>>>>>>>>>>>> True that today this happens only to job timeout reset >>>>>>>>>>>>>>> sources that are handled form within the scheduler and >>>>>>>>>>>>>>> won't use this single work struct but no one prevents a >>>>>>>>>>>>>>> future case for this to happen and also, if we actually >>>>>>>>>>>>>>> want to unify scheduler time out handlers within reset >>>>>>>>>>>>>>> domain (which seems to me the right design approach) we >>>>>>>>>>>>>>> won't be able to use just one work struct for this >>>>>>>>>>>>>>> reason anyway. >>>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> Just to add to this point - a reset domain is >>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of >>>>>>>>>>>>>> clients from various reset sources for one device, it >>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The >>>>>>>>>>>>>> job timeout on one device may not eventually result in >>>>>>>>>>>>>> result, but a RAS error happening on another device at >>>>>>>>>>>>>> the same time would need a reset. The second device's RAS >>>>>>>>>>>>>> error cannot return seeing that a reset work already >>>>>>>>>>>>>> started, or ignore the reset work given that another >>>>>>>>>>>>>> device has filled the reset data. 
>>>>>>>>>>>>>> >>>>>>>>>>>>>> When there is a reset domain, it should take care of the >>>>>>>>>>>>>> work scheduled and keeping it in device or any other >>>>>>>>>>>>>> level doesn't sound good. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Thanks, >>>>>>>>>>>>>> Lijo >>>>>>>>>>>>>> >>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the >>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to >>>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using >>>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets >>>>>>>>>>>>>>>> incremented when a client wants to trigger a reset >>>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after >>>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work >>>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already >>>>>>>>>>>>>>>> scheduled (or another client is in the process of >>>>>>>>>>>>>>>> scheduling it) and you just return. When the worker >>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it >>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting >>>>>>>>>>>>>>>> a reset will schedule the worker again. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>> Felix >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't >>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset >>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't >>>>>>>>>>>>>>>>>> allocate anything on the stack either). 
>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover >>>>>>>>>>>>>>>>> is different from any other local variable we allocate >>>>>>>>>>>>>>>>> in any function we call ? >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for >>>>>>>>>>>>>>>>> reset to complete ? Also, see in >>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - >>>>>>>>>>>>>>>>> the caller expects the reset to complete before he >>>>>>>>>>>>>>>>> returns. I can probably work around it in RAS code by >>>>>>>>>>>>>>>>> calling atomic_set(&ras->in_recovery, 0) from some >>>>>>>>>>>>>>>>> callback within actual reset function but regarding >>>>>>>>>>>>>>>>> sysfs it actually expects a result returned indicating >>>>>>>>>>>>>>>>> whether the call was successful or not. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach >>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and >>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each >>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the >>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into >>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be a >>>>>>>>>>>>>>>>>>> generic things and not only or AMD. 
>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 >>>>>>>>>>>>>>>>>>>>>>> +++-- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 >>>>>>>>>>>>>>>>>>>>>>> deletions(-) >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will >>>>>>>>>>>>>>>>>>>>>>> reset now anyway */ >>>>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset >>>>>>>>>>>>>>>>>>>>>>> 
for SRIOV */ >>>>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); 
>>>>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>> -#include 
"amdgpu.h" >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +struct 
amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, >>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain 
*domain, >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending >>>>>>>>>>>>>>>>>>>>>>> resets */ >>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << >>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov 
ready */ >>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) >>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>> } 
>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>> 
container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> 
container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>> >>>>>>> >>>>> >> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-12 6:03 ` Christian König @ 2022-05-12 12:57 ` Andrey Grodzovsky 0 siblings, 0 replies; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-12 12:57 UTC (permalink / raw) To: Christian König, Christian König, Lazar, Lijo, Felix Kuehling, amd-gfx Cc: Bai Zoy On 2022-05-12 02:03, Christian König wrote: > Am 11.05.22 um 17:57 schrieb Andrey Grodzovsky: >> [SNIP] >>>>>> How about we do it like this then: >>>>>> >>>>>> struct amdgpu_reset_domain { >>>>>> .... >>>>>> union { >>>>>> struct { >>>>>> struct work_item debugfs; >>>>>> struct work_item ras; >>>>>> .... >>>>>> }; >>>>>> struct work_item array[] >>>>>> } reset_sources; >>>>>> } >>>>>> >>>>> >>>>> If it's only about static array, >>>>> >>>>> enum amdgpu_reset_soruce { >>>>> >>>>> AMDGPU_RESET_SRC_RAS, >>>>> AMDGPU_RESET_SRC_ABC, >>>>> ..... >>>>> AMDGPU_RESET_SRC_XYZ, >>>>> AMDGPU_RESET_SRC_MAX >>>>> >>>>> }; >>>>> >>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index >>>>> for each work item >>>>> >>>>> >>>>> Thanks, >>>>> Lijo >>>> >>>> >>>> It's possible though it makes harder to generalize reset_domain >>>> later for other drivers. >>>> But still one caveat, look at amdgpu_recover_work_struct and it's >>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get, >>>> At least for debugfs i need to return back the result of GPU reset >>>> and so I cannot store actual work items in the array mentioned above >>>> but rather pointers to work_item because i need a way to get back >>>> the return value from gpu_recover like I do now in >>>> amdgpu_device_gpu_recover. >>> >>> You should try to avoid that as well. >> >> >> Why ? > > Because pointers need a lifetime. When the work items are allocated as > part of the structure you can be sure that they are not freed not > reused for something else. > >> >>> >>> What we should do instead is to cache the status of the last reset >>> in the reset domain. 
>> >> >> Probably an atomic ret in reset_domain then. >> >> Another 2 points i forgot to ask - >> >> 1) What race condition you had in mind about insertion and extraction >> from the list if all is done under lock ? > > A work item is essentially a linked list and a function to call when > the worker thread has time to process the item. This means you now > have two linked lists representing essentially the same. > > To make it even worse those two lists are protected by two different > locks. The work item list is protected by the worker lock and the > reset item by our own. > > Keeping all that synced up is rather racy. I really don't see a problem as the locking order is always preserved so that the reset list lock scope is around the internal worker thread list lock. > >> >> 2) This I asked already - why you opposed so much to allocation on >> the stack ? I understand the problem with dynamic memory allocations >> but why stack ? We do multiple allocations on the stack for any function >> we call during GPU reset, what so special in this case where we >> allocate work struct on the stack? It's not like the work struct is >> especially big compared to other stuff we allocate on the stack. > > The problem is once more the lifetime. When the reset work items are > allocated on the stack we absolutely need to make sure that their > pointer is not flying around anywhere when the function returns. Keep > in mind that a stack is just another form of memory management. > > It's certainly possible to get that right, but it's just the more > error prone approach. I believe i got that right as at any place where the reset_work item life cycle ends i was always calling pending works list delete to not leave dangling pointers. 
I guess it's all pros and cons, Yes, it's maybe more risky this way since it creates a risk of dangling pointers if forgetting to remove but on the other hand this approach in my opinion gives a cleaner and more generic design than the other alternative as you can see from the other problem I encountered with not being able to retrieve amdgpu_device pointer which we discuss in another thread and having to explicitly name each reset source in reset domain. Andrey > > Regards, > Christian. > >> >> Andrey >> >> >>> >>> Regards, >>> Christian. >>> >>>> >>>> Andrey >>>> >>>> >>>>> >>>>>> Not 100% sure if that works, but something like that should do >>>>>> the trick. >>>>>> >>>>>> My main concern is that I don't want to allocate the work items >>>>>> on the stack and dynamic allocation (e.g. kmalloc) is usually not >>>>>> possible either. >>>>>> >>>>>> Additional to that putting/removing work items from a list, array >>>>>> or other container is a very common source for race conditions. >>>>>> >>>>>> Regards, >>>>>> Christian. >>>>>> >>>>>>> >>>>>>>>> to the cancellation list which you showed above. In current >>>>>>>>> way all this done automatically within reset_domain code and >>>>>>>>> it's agnostic to specific driver and it's specific list of >>>>>>>>> reset sources. Also in case we would want to generalize >>>>>>>>> reset_domain to other GPU drivers (which was >>>>>>>>> a plan as far as i remember) this explicit mention of each >>>>>>>>> reset works for cancellation is again less suitable in my >>>>>>>>> opinion. >>>>>>>> >>>>>>>> Well we could put the work item for the scheduler independent >>>>>>>> reset source into the reset domain as well. But I'm not sure >>>>>>>> those additional reset sources should be part of any common >>>>>>>> handling, that is largely amdgpu specific. 
>>>>>>> >>>>>>> >>>>>>> So it's for sure more then one source for the reasons described >>>>>>> above, also note that for scheduler we already cancel delayed >>>>>>> work in drm_sched_stop so calling them again in amdgpu code kind >>>>>>> of superfluous. >>>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>> Christian. >>>>>>>> >>>>>>>>> >>>>>>>>> Andrey >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Andrey >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I >>>>>>>>>>>>> see a possible race where a new reset request is being >>>>>>>>>>>>> generated exactly after we finished the HW reset but >>>>>>>>>>>>> before we canceled out all pending resets - in such case >>>>>>>>>>>>> you wold not want to cancel this 'border line new' reset >>>>>>>>>>>>> request. >>>>>>>>>>>> >>>>>>>>>>>> Why not? Any new reset request directly after a hardware >>>>>>>>>>>> reset is most likely just falsely generated by the reset >>>>>>>>>>>> itself. >>>>>>>>>>>> >>>>>>>>>>>> Ideally I would cancel all sources after the reset, but >>>>>>>>>>>> before starting any new work. >>>>>>>>>>>> >>>>>>>>>>>> Regards, >>>>>>>>>>>> Christian. >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Andrey >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>> Christian. 
>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> You can see that if many different reset sources share >>>>>>>>>>>>>>>> same work struct what can happen is that the first to >>>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from >>>>>>>>>>>>>>>> full HW reset because his bad job did signal for >>>>>>>>>>>>>>>> example or because his hunged IP block was able to >>>>>>>>>>>>>>>> recover through SW reset but in the meantime another >>>>>>>>>>>>>>>> reset source who needed an actual HW reset just >>>>>>>>>>>>>>>> silently returned and we end up with unhandled reset >>>>>>>>>>>>>>>> request. True that today this happens only to job >>>>>>>>>>>>>>>> timeout reset sources that are handled form within the >>>>>>>>>>>>>>>> scheduler and won't use this single work struct but no >>>>>>>>>>>>>>>> one prevents a future case for this to happen and also, >>>>>>>>>>>>>>>> if we actually want to unify scheduler time out >>>>>>>>>>>>>>>> handlers within reset domain (which seems to me the >>>>>>>>>>>>>>>> right design approach) we won't be able to use just one >>>>>>>>>>>>>>>> work struct for this reason anyway. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Just to add to this point - a reset domain is >>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of >>>>>>>>>>>>>>> clients from various reset sources for one device, it >>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The >>>>>>>>>>>>>>> job timeout on one device may not eventually result in >>>>>>>>>>>>>>> result, but a RAS error happening on another device at >>>>>>>>>>>>>>> the same time would need a reset. The second device's >>>>>>>>>>>>>>> RAS error cannot return seeing that a reset work already >>>>>>>>>>>>>>> started, or ignore the reset work given that another >>>>>>>>>>>>>>> device has filled the reset data. 
>>>>>>>>>>>>>>> >>>>>>>>>>>>>>> When there is a reset domain, it should take care of the >>>>>>>>>>>>>>> work scheduled and keeping it in device or any other >>>>>>>>>>>>>>> level doesn't sound good. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Thanks, >>>>>>>>>>>>>>> Lijo >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the >>>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme >>>>>>>>>>>>>>>>> to decide whether you need to schedule a reset, e.g. >>>>>>>>>>>>>>>>> using an atomic counter in the shared work struct that >>>>>>>>>>>>>>>>> gets incremented when a client wants to trigger a >>>>>>>>>>>>>>>>> reset (atomic_add_return). If that counter is exactly >>>>>>>>>>>>>>>>> 1 after incrementing, you need to fill in the rest of >>>>>>>>>>>>>>>>> the work struct and schedule the work. Otherwise, it's >>>>>>>>>>>>>>>>> already scheduled (or another client is in the process >>>>>>>>>>>>>>>>> of scheduling it) and you just return. When the worker >>>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it >>>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting >>>>>>>>>>>>>>>>> a reset will schedule the worker again. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>> Felix >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't >>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset >>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't >>>>>>>>>>>>>>>>>>> allocate anything on the stack either). 
>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>>>>>> allocation of work struct in >>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover is different from any other >>>>>>>>>>>>>>>>>> local variable we allocate in any function we call ? >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for >>>>>>>>>>>>>>>>>> reset to complete ? Also, see in >>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) >>>>>>>>>>>>>>>>>> - the caller expects the reset to complete before he >>>>>>>>>>>>>>>>>> returns. I can probably work around it in RAS code by >>>>>>>>>>>>>>>>>> calling atomic_set(&ras->in_recovery, 0) from some >>>>>>>>>>>>>>>>>> callback within actual reset function but regarding >>>>>>>>>>>>>>>>>> sysfs it actually expects a result returned >>>>>>>>>>>>>>>>>> indicating whether the call was successful or not. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach >>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and >>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each >>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the >>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into >>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be a >>>>>>>>>>>>>>>>>>>> generic things and not only or AMD. 
>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 >>>>>>>>>>>>>>>>>>>>>>>> +++-- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 >>>>>>>>>>>>>>>>>>>>>>>> deletions(-) >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 
>>>>>>>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will >>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */ >>>>>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ 
>>>>>>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive >>>>>>>>>>>>>>>>>>>>>>>> reset for SRIOV */ >>>>>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>>> + 
INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; 
>>>>>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, >>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works); 
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending >>>>>>>>>>>>>>>>>>>>>>>> resets */ >>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << >>>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) >>>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device 
*adev, >>>>>>>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>> >>>>>>>> >>>>>> >>> > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-11 15:39 ` Christian König 2022-05-11 15:57 ` Andrey Grodzovsky @ 2022-05-11 20:27 ` Andrey Grodzovsky 2022-05-12 6:06 ` Christian König 1 sibling, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-11 20:27 UTC (permalink / raw) To: Christian König, Lazar, Lijo, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy On 2022-05-11 11:39, Christian König wrote: > Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky: >> On 2022-05-11 11:20, Lazar, Lijo wrote: >>> >>> >>> On 5/11/2022 7:28 PM, Christian König wrote: >>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >>>>> On 2022-05-11 03:38, Christian König wrote: >>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>>>>> [SNIP] >>>>>>>> E.g. in the reset code (either before or after the reset, >>>>>>>> that's debatable) you do something like this: >>>>>>>> >>>>>>>> for (i = 0; i < num_ring; ++i) >>>>>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>>>>> cancel_work(adev->ras_work); >>>>>>>> cancel_work(adev->iofault_work); >>>>>>>> cancel_work(adev->debugfs_work); >>>>>>>> ... >>>>>>>> >>>>>>>> You don't really need to track which reset source has fired and >>>>>>>> which hasn't, because that would be racy again. Instead just >>>>>>>> bluntly reset all possible sources. >>>>>>>> >>>>>>>> Christian. >>>>>>> >>>>>>> >>>>>>> I don't say we care if it fired once or twice (I need to add a >>>>>>> fix to only insert reset work to pending reset list if it's not >>>>>>> already there), the point of using list (or array) to which you >>>>>>> add and from which you remove is that the logic of this is >>>>>>> encapsulated within reset domain. In your way we need to be >>>>>>> aware who exactly schedules reset work and explicitly cancel >>>>>>> them, this also means that for any new source added in the >>>>>>> future you will need to remember to add him >>>>>> >>>>>> I don't think that this is a valid argument. 
Additionally to the >>>>>> schedulers we probably just need less than a handful of reset >>>>>> sources, most likely even just one or two is enough. >>>>>> >>>>>> The only justification I can see of having additional separate >>>>>> reset sources would be if somebody wants to know if a specific >>>>>> source has been handled or not (e.g. call flush_work() or >>>>>> work_pending()). Like in the case of a reset triggered through >>>>>> debugfs. >>>>> >>>>> >>>>> This is indeed one reason, another is as we said before that if >>>>> you share 'reset source' (meaning a delayed work) with another >>>>> client (i.e. RAS and KFD) it means you make assumption that the >>>>> other client always proceeds with the >>>>> reset exactly the same way as you expect. So today we have this >>>>> only in scheduler vs non scheduler reset happening - non scheduler >>>>> reset clients assume the reset is always fully executed in HW >>>>> while scheduler based reset makes shortcuts and not always does HW >>>>> reset hence they cannot share 'reset source' (delayed work). Yes, >>>>> we can always add this in the future if and when such problem will >>>>> arise but no one will remember this then and a new bug will be >>>>> introduced and will take time to find and resolve. >>>> >>>> Mhm, so your main concern is that we forget to correctly handle the >>>> new reset sources? >>>> >>>> How about we do it like this then: >>>> >>>> struct amdgpu_reset_domain { >>>> .... >>>> union { >>>> struct { >>>> struct work_item debugfs; >>>> struct work_item ras; >>>> .... >>>> }; >>>> struct work_item array[] >>>> } reset_sources; >>>> } >>>> >>> >>> If it's only about static array, >>> >>> enum amdgpu_reset_soruce { >>> >>> AMDGPU_RESET_SRC_RAS, >>> AMDGPU_RESET_SRC_ABC, >>> ..... 
>>> AMDGPU_RESET_SRC_XYZ, >>> AMDGPU_RESET_SRC_MAX >>> >>> }; >>> >>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index for >>> each work item >>> >>> >>> Thanks, >>> Lijo >> >> >> It's possible though it makes harder to generalize reset_domain later >> for other drivers. >> But still one caveat, look at amdgpu_recover_work_struct and it's >> usage in amdgpu_device_gpu_recover and in gpu_recover_get, >> At least for debugfs i need to return back the result of GPU reset >> and so I cannot store actual work items in the array mentioned above >> but rather pointers to work_item because i need a way to get back the >> return value from gpu_recover like I do now in >> amdgpu_device_gpu_recover. > > You should try to avoid that as well. > > See when the debugfs reset is canceled because of a scheduler reset > you won't get a useful return code either. > > What we should do instead is to cache the status of the last reset in > the reset domain. > > Regards, > Christian. Another problem with this approach - to execute the actual GPU reset I need access to a concrete amdgpu_device pointer from the work struct (see xgpu_ai_mailbox_flr_work as an example). If I store all work items in an array in amdgpu_reset_domain, the most I can retrieve is the reset_domain struct itself, which won't help since it's dynamically allocated, not embedded in hive or adev, and can be one per device or per entire hive in case of XGMI, and so there is no way for me to reach back to amdgpu_device. A back pointer to adev* from amdgpu_reset_domain will only work for a single device but not for an XGMI hive where there are multiple devices in a hive. Andrey > >> >> Andrey >> >> >>> >>>> Not 100% sure if that works, but something like that should do the >>>> trick. >>>> >>>> My main concern is that I don't want to allocate the work items on >>>> the stack and dynamic allocation (e.g. kmalloc) is usually not >>>> possible either. 
>>>> >>>> Additional to that putting/removing work items from a list, array >>>> or other container is a very common source for race conditions. >>>> >>>> Regards, >>>> Christian. >>>> >>>>> >>>>>>> to the cancellation list which you showed above. In current way >>>>>>> all this done automatically within reset_domain code and it's >>>>>>> agnostic to specific driver and it's specific list of reset >>>>>>> sources. Also in case we would want to generalize reset_domain >>>>>>> to other GPU drivers (which was >>>>>>> a plan as far as i remember) this explicit mention of each reset >>>>>>> works for cancellation is again less suitable in my opinion. >>>>>> >>>>>> Well we could put the work item for the scheduler independent >>>>>> reset source into the reset domain as well. But I'm not sure >>>>>> those additional reset sources should be part of any common >>>>>> handling, that is largely amdgpu specific. >>>>> >>>>> >>>>> So it's for sure more then one source for the reasons described >>>>> above, also note that for scheduler we already cancel delayed work >>>>> in drm_sched_stop so calling them again in amdgpu code kind of >>>>> superfluous. >>>>> >>>>> Andrey >>>>> >>>>> >>>>>> >>>>>> Christian. >>>>>> >>>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> Andrey >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see >>>>>>>>>>> a possible race where a new reset request is being generated >>>>>>>>>>> exactly after we finished the HW reset but before we >>>>>>>>>>> canceled out all pending resets - in such case you wold not >>>>>>>>>>> want to cancel this 'border line new' reset request. >>>>>>>>>> >>>>>>>>>> Why not? Any new reset request directly after a hardware >>>>>>>>>> reset is most likely just falsely generated by the reset itself. 
>>>>>>>>>> >>>>>>>>>> Ideally I would cancel all sources after the reset, but >>>>>>>>>> before starting any new work. >>>>>>>>>> >>>>>>>>>> Regards, >>>>>>>>>> Christian. >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Andrey >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Regards, >>>>>>>>>>>> Christian. >>>>>>>>>>>> >>>>>>>>>>>>>> You can see that if many different reset sources share >>>>>>>>>>>>>> same work struct what can happen is that the first to >>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from >>>>>>>>>>>>>> full HW reset because his bad job did signal for example >>>>>>>>>>>>>> or because his hunged IP block was able to recover >>>>>>>>>>>>>> through SW reset but in the meantime another reset source >>>>>>>>>>>>>> who needed an actual HW reset just silently returned and >>>>>>>>>>>>>> we end up with unhandled reset request. True that today >>>>>>>>>>>>>> this happens only to job timeout reset sources that are >>>>>>>>>>>>>> handled form within the scheduler and won't use this >>>>>>>>>>>>>> single work struct but no one prevents a future case for >>>>>>>>>>>>>> this to happen and also, if we actually want to unify >>>>>>>>>>>>>> scheduler time out handlers within reset domain (which >>>>>>>>>>>>>> seems to me the right design approach) we won't be able >>>>>>>>>>>>>> to use just one work struct for this reason anyway. >>>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Just to add to this point - a reset domain is co-operative >>>>>>>>>>>>> domain. In addition to sharing a set of clients from >>>>>>>>>>>>> various reset sources for one device, it also will have a >>>>>>>>>>>>> set of devices like in XGMI hive. The job timeout on one >>>>>>>>>>>>> device may not eventually result in result, but a RAS >>>>>>>>>>>>> error happening on another device at the same time would >>>>>>>>>>>>> need a reset. 
The second device's RAS error cannot return >>>>>>>>>>>>> seeing that a reset work already started, or ignore the >>>>>>>>>>>>> reset work given that another device has filled the reset >>>>>>>>>>>>> data. >>>>>>>>>>>>> >>>>>>>>>>>>> When there is a reset domain, it should take care of the >>>>>>>>>>>>> work scheduled and keeping it in device or any other level >>>>>>>>>>>>> doesn't sound good. >>>>>>>>>>>>> >>>>>>>>>>>>> Thanks, >>>>>>>>>>>>> Lijo >>>>>>>>>>>>> >>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the >>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to >>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using >>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets >>>>>>>>>>>>>>> incremented when a client wants to trigger a reset >>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after >>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work >>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already >>>>>>>>>>>>>>> scheduled (or another client is in the process of >>>>>>>>>>>>>>> scheduling it) and you just return. When the worker >>>>>>>>>>>>>>> finishes (after confirming a successful reset), it >>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting a >>>>>>>>>>>>>>> reset will schedule the worker again. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>> Felix >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't >>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset nor >>>>>>>>>>>>>>>>> wait for the reset to complete (so you can't allocate >>>>>>>>>>>>>>>>> anything on the stack either). 
>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover >>>>>>>>>>>>>>>> is different from any other local variable we allocate >>>>>>>>>>>>>>>> in any function we call ? >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for >>>>>>>>>>>>>>>> reset to complete ? Also, see in amdgpu_ras_do_recovery >>>>>>>>>>>>>>>> and gpu_recover_get (debugfs) - the caller expects the >>>>>>>>>>>>>>>> reset to complete before he returns. I can probably >>>>>>>>>>>>>>>> work around it in RAS code by calling >>>>>>>>>>>>>>>> atomic_set(&ras->in_recovery, 0) from some callback >>>>>>>>>>>>>>>> within actual reset function but regarding sysfs it >>>>>>>>>>>>>>>> actually expects a result returned indicating whether >>>>>>>>>>>>>>>> the call was successful or not. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach >>>>>>>>>>>>>>>>>> where this logic (the work items) are held and >>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each >>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the future >>>>>>>>>>>>>>>>>> to even move the scheduler handling into reset domain >>>>>>>>>>>>>>>>>> since reset domain is supposed to be a generic things >>>>>>>>>>>>>>>>>> and not only or AMD. 
>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 >>>>>>>>>>>>>>>>>>>>>> +++-- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 >>>>>>>>>>>>>>>>>>>>>> deletions(-) >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { 
>>>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will >>>>>>>>>>>>>>>>>>>>>> reset now anyway */ >>>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>>> + amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset >>>>>>>>>>>>>>>>>>>>>> for SRIOV */ >>>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int 
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &work.base)) 
>>>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>>> +#include 
<linux/list.h> >>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { 
>>>>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, >>>>>>>>>>>>>>>>>>>>>> &domain->pending_works); >>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); 
>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending resets */ >>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << >>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) >>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>>> struct 
amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> 
@@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); 
>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_nv_virt_ops >>>>>>>>>>>>>>>>>>>>>> = { >>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int 
>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops xgpu_vi_virt_ops >>>>>>>>>>>>>>>>>>>>>> = { >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>> >>>>>>>> >>>>>> >>>> > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-11 20:27 ` Andrey Grodzovsky @ 2022-05-12 6:06 ` Christian König 2022-05-12 9:21 ` Lazar, Lijo 2022-05-12 13:07 ` Andrey Grodzovsky 0 siblings, 2 replies; 40+ messages in thread From: Christian König @ 2022-05-12 6:06 UTC (permalink / raw) To: Andrey Grodzovsky, Lazar, Lijo, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy Am 11.05.22 um 22:27 schrieb Andrey Grodzovsky: > > On 2022-05-11 11:39, Christian König wrote: >> Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky: >>> On 2022-05-11 11:20, Lazar, Lijo wrote: >>>> >>>> >>>> On 5/11/2022 7:28 PM, Christian König wrote: >>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >>>>>> On 2022-05-11 03:38, Christian König wrote: >>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>>>>>> [SNIP] >>>>>>>>> E.g. in the reset code (either before or after the reset, >>>>>>>>> that's debatable) you do something like this: >>>>>>>>> >>>>>>>>> for (i = 0; i < num_ring; ++i) >>>>>>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>>>>>> cancel_work(adev->ras_work); >>>>>>>>> cancel_work(adev->iofault_work); >>>>>>>>> cancel_work(adev->debugfs_work); >>>>>>>>> ... >>>>>>>>> >>>>>>>>> You don't really need to track which reset source has fired >>>>>>>>> and which hasn't, because that would be racy again. Instead >>>>>>>>> just bluntly reset all possible sources. >>>>>>>>> >>>>>>>>> Christian. >>>>>>>> >>>>>>>> >>>>>>>> I don't say we care if it fired once or twice (I need to add a >>>>>>>> fix to only insert reset work to pending reset list if it's not >>>>>>>> already there), the point of using list (or array) to which you >>>>>>>> add and from which you remove is that the logic of this is >>>>>>>> encapsulated within reset domain. 
In your way we need to be >>>>>>>> aware who exactly schedules reset work and explicitly cancel >>>>>>>> them, this also means that for any new source added in the >>>>>>>> future you will need to remember to add him >>>>>>> >>>>>>> I don't think that this is a valid argument. Additionally to the >>>>>>> schedulers we probably just need less than a handful of reset >>>>>>> sources, most likely even just one or two is enough. >>>>>>> >>>>>>> The only justification I can see of having additional separate >>>>>>> reset sources would be if somebody wants to know if a specific >>>>>>> source has been handled or not (e.g. call flush_work() or >>>>>>> work_pending()). Like in the case of a reset triggered through >>>>>>> debugfs. >>>>>> >>>>>> >>>>>> This is indeed one reason, another is as we said before that if >>>>>> you share 'reset source' (meaning a delayed work) with another >>>>>> client (i.e. RAS and KFD) it means you make assumption that the >>>>>> other client always proceeds with the >>>>>> reset exactly the same way as you expect. So today we have this >>>>>> only in scheduler vs non scheduler reset happening - non >>>>>> scheduler reset clients assume the reset is always fully executed >>>>>> in HW while scheduler based reset makes shortcuts and not always >>>>>> does HW reset hence they cannot share 'reset source' (delayed >>>>>> work). Yes, we can always add this in the future if and when such >>>>>> problem will arise but no one will remember this then and a new >>>>>> bug will be introduced and will take time to find and resolve. >>>>> >>>>> Mhm, so your main concern is that we forget to correctly handle >>>>> the new reset sources? >>>>> >>>>> How about we do it like this then: >>>>> >>>>> struct amdgpu_reset_domain { >>>>> .... >>>>> union { >>>>> struct { >>>>> struct work_item debugfs; >>>>> struct work_item ras; >>>>> .... 
>>>>> }; >>>>> struct work_item array[] >>>>> } reset_sources; >>>>> } >>>>> >>>> >>>> If it's only about static array, >>>> >>>> enum amdgpu_reset_soruce { >>>> >>>> AMDGPU_RESET_SRC_RAS, >>>> AMDGPU_RESET_SRC_ABC, >>>> ..... >>>> AMDGPU_RESET_SRC_XYZ, >>>> AMDGPU_RESET_SRC_MAX >>>> >>>> }; >>>> >>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index >>>> for each work item >>>> >>>> >>>> Thanks, >>>> Lijo >>> >>> >>> It's possible though it makes harder to generalize reset_domain >>> later for other drivers. >>> But still one caveat, look at amdgpu_recover_work_struct and it's >>> usage in amdgpu_device_gpu_recover and in gpu_recover_get, >>> At least for debugfs i need to return back the result of GPU reset >>> and so I cannot store actual work items in the array mentioned above >>> but rather pointers to work_item because i need a way to get back >>> the return value from gpu_recover like I do now in >>> amdgpu_device_gpu_recover. >> >> You should try to avoid that as well. >> >> See when the debugfs reset is canceled because of a scheduler reset >> you won't get a useful return code either. >> >> What we should do instead is to cache the status of the last reset in >> the reset domain. >> >> Regards, >> Christian. > > > Another problem with this approach - to execute the actaul GPU reset > I need accesses to concrete amdgpu_device pointer from work struct > (see xgpu_ai_mailbox_flr_work) as example. If i store all work items in > array in amdgpu_reset_domain the most i can only retrieve is the > reset_domain struct itself which won't help since it's dynamically > allocated, not embedded in hive or adev and can can be one per device > or per entire hive in case of XGMI and so there is no way for me to > reach back to amdgpu_device. Back pointer to adev* from > amdgpu_reset_domain will only work for single device but not for XGMI > hive where there are multiple devices in a hive. 
Which is exactly the reason why I think we should always allocate the hive structure, even if we only have one device. And a GPU reset should then always work with the hive data structure and not adev. Adding a pointer from your reset work item back to the hive is then trivial. Regards, Christian. > > Andrey > > >> >>> >>> Andrey >>> >>> >>>> >>>>> Not 100% sure if that works, but something like that should do the >>>>> trick. >>>>> >>>>> My main concern is that I don't want to allocate the work items on >>>>> the stack and dynamic allocation (e.g. kmalloc) is usually not >>>>> possible either. >>>>> >>>>> Additional to that putting/removing work items from a list, array >>>>> or other container is a very common source for race conditions. >>>>> >>>>> Regards, >>>>> Christian. >>>>> >>>>>> >>>>>>>> to the cancellation list which you showed above. In current way >>>>>>>> all this done automatically within reset_domain code and it's >>>>>>>> agnostic to specific driver and it's specific list of reset >>>>>>>> sources. Also in case we would want to generalize reset_domain >>>>>>>> to other GPU drivers (which was >>>>>>>> a plan as far as i remember) this explicit mention of each >>>>>>>> reset works for cancellation is again less suitable in my opinion. >>>>>>> >>>>>>> Well we could put the work item for the scheduler independent >>>>>>> reset source into the reset domain as well. But I'm not sure >>>>>>> those additional reset sources should be part of any common >>>>>>> handling, that is largely amdgpu specific. >>>>>> >>>>>> >>>>>> So it's for sure more then one source for the reasons described >>>>>> above, also note that for scheduler we already cancel delayed >>>>>> work in drm_sched_stop so calling them again in amdgpu code kind >>>>>> of superfluous. >>>>>> >>>>>> Andrey >>>>>> >>>>>> >>>>>>> >>>>>>> Christian. 
>>>>>>> >>>>>>>> >>>>>>>> Andrey >>>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> Andrey >>>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see >>>>>>>>>>>> a possible race where a new reset request is being >>>>>>>>>>>> generated exactly after we finished the HW reset but before >>>>>>>>>>>> we canceled out all pending resets - in such case you wold >>>>>>>>>>>> not want to cancel this 'border line new' reset request. >>>>>>>>>>> >>>>>>>>>>> Why not? Any new reset request directly after a hardware >>>>>>>>>>> reset is most likely just falsely generated by the reset >>>>>>>>>>> itself. >>>>>>>>>>> >>>>>>>>>>> Ideally I would cancel all sources after the reset, but >>>>>>>>>>> before starting any new work. >>>>>>>>>>> >>>>>>>>>>> Regards, >>>>>>>>>>> Christian. >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Andrey >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Regards, >>>>>>>>>>>>> Christian. >>>>>>>>>>>>> >>>>>>>>>>>>>>> You can see that if many different reset sources share >>>>>>>>>>>>>>> same work struct what can happen is that the first to >>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from >>>>>>>>>>>>>>> full HW reset because his bad job did signal for example >>>>>>>>>>>>>>> or because his hunged IP block was able to recover >>>>>>>>>>>>>>> through SW reset but in the meantime another reset >>>>>>>>>>>>>>> source who needed an actual HW reset just silently >>>>>>>>>>>>>>> returned and we end up with unhandled reset request. 
>>>>>>>>>>>>>>> True that today this happens only to job timeout reset >>>>>>>>>>>>>>> sources that are handled form within the scheduler and >>>>>>>>>>>>>>> won't use this single work struct but no one prevents a >>>>>>>>>>>>>>> future case for this to happen and also, if we actually >>>>>>>>>>>>>>> want to unify scheduler time out handlers within reset >>>>>>>>>>>>>>> domain (which seems to me the right design approach) we >>>>>>>>>>>>>>> won't be able to use just one work struct for this >>>>>>>>>>>>>>> reason anyway. >>>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> Just to add to this point - a reset domain is >>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of >>>>>>>>>>>>>> clients from various reset sources for one device, it >>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The >>>>>>>>>>>>>> job timeout on one device may not eventually result in >>>>>>>>>>>>>> result, but a RAS error happening on another device at >>>>>>>>>>>>>> the same time would need a reset. The second device's RAS >>>>>>>>>>>>>> error cannot return seeing that a reset work already >>>>>>>>>>>>>> started, or ignore the reset work given that another >>>>>>>>>>>>>> device has filled the reset data. >>>>>>>>>>>>>> >>>>>>>>>>>>>> When there is a reset domain, it should take care of the >>>>>>>>>>>>>> work scheduled and keeping it in device or any other >>>>>>>>>>>>>> level doesn't sound good. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Thanks, >>>>>>>>>>>>>> Lijo >>>>>>>>>>>>>> >>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the >>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to >>>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. 
using >>>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets >>>>>>>>>>>>>>>> incremented when a client wants to trigger a reset >>>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after >>>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work >>>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already >>>>>>>>>>>>>>>> scheduled (or another client is in the process of >>>>>>>>>>>>>>>> scheduling it) and you just return. When the worker >>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it >>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting >>>>>>>>>>>>>>>> a reset will schedule the worker again. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>> Felix >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't >>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset >>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't >>>>>>>>>>>>>>>>>> allocate anything on the stack either). >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover >>>>>>>>>>>>>>>>> is different from any other local variable we allocate >>>>>>>>>>>>>>>>> in any function we call ? >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for >>>>>>>>>>>>>>>>> reset to complete ? Also, see in >>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - >>>>>>>>>>>>>>>>> the caller expects the reset to complete before he >>>>>>>>>>>>>>>>> returns. 
I can probably work around it in RAS code by >>>>>>>>>>>>>>>>> calling atomic_set(&ras->in_recovery, 0) from some >>>>>>>>>>>>>>>>> callback within actual reset function but regarding >>>>>>>>>>>>>>>>> sysfs it actually expects a result returned indicating >>>>>>>>>>>>>>>>> whether the call was successful or not. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach >>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and >>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each >>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the >>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into >>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be a >>>>>>>>>>>>>>>>>>> generic things and not only or AMD. >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>>>> Christian. 
>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 >>>>>>>>>>>>>>>>>>>>>>> +++-- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 >>>>>>>>>>>>>>>>>>>>>>> deletions(-) >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>> - 
AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will >>>>>>>>>>>>>>>>>>>>>>> reset now anyway */ >>>>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive reset >>>>>>>>>>>>>>>>>>>>>>> for SRIOV */ >>>>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { 
>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); 
>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> 
>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>>>>>> struct kref refcount; 
>>>>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, >>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); 
>>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending >>>>>>>>>>>>>>>>>>>>>>> resets */ >>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << >>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) >>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>>>> 
struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + 
INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) 
>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>> struct 
amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>> >>>>>>> >>>>> >> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-12 6:06 ` Christian König @ 2022-05-12 9:21 ` Lazar, Lijo 2022-05-12 13:07 ` Andrey Grodzovsky 1 sibling, 0 replies; 40+ messages in thread From: Lazar, Lijo @ 2022-05-12 9:21 UTC (permalink / raw) To: Christian König, Andrey Grodzovsky, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy On 5/12/2022 11:36 AM, Christian König wrote: > Am 11.05.22 um 22:27 schrieb Andrey Grodzovsky: >> >> On 2022-05-11 11:39, Christian König wrote: >>> Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky: >>>> On 2022-05-11 11:20, Lazar, Lijo wrote: >>>>> >>>>> >>>>> On 5/11/2022 7:28 PM, Christian König wrote: >>>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >>>>>>> On 2022-05-11 03:38, Christian König wrote: >>>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>>>>>>> [SNIP] >>>>>>>>>> E.g. in the reset code (either before or after the reset, >>>>>>>>>> that's debatable) you do something like this: >>>>>>>>>> >>>>>>>>>> for (i = 0; i < num_ring; ++i) >>>>>>>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>>>>>>> cancel_work(adev->ras_work); >>>>>>>>>> cancel_work(adev->iofault_work); >>>>>>>>>> cancel_work(adev->debugfs_work); >>>>>>>>>> ... >>>>>>>>>> >>>>>>>>>> You don't really need to track which reset source has fired >>>>>>>>>> and which hasn't, because that would be racy again. Instead >>>>>>>>>> just bluntly reset all possible sources. >>>>>>>>>> >>>>>>>>>> Christian. >>>>>>>>> >>>>>>>>> >>>>>>>>> I don't say we care if it fired once or twice (I need to add a >>>>>>>>> fix to only insert reset work to pending reset list if it's not >>>>>>>>> already there), the point of using list (or array) to which you >>>>>>>>> add and from which you remove is that the logic of this is >>>>>>>>> encapsulated within reset domain. 
In your way we need to be >>>>>>>>> aware who exactly schedules reset work and explicitly cancel >>>>>>>>> them, this also means that for any new source added in the >>>>>>>>> future you will need to remember to add him >>>>>>>> >>>>>>>> I don't think that this is a valid argument. Additionally to the >>>>>>>> schedulers we probably just need less than a handful of reset >>>>>>>> sources, most likely even just one or two is enough. >>>>>>>> >>>>>>>> The only justification I can see of having additional separate >>>>>>>> reset sources would be if somebody wants to know if a specific >>>>>>>> source has been handled or not (e.g. call flush_work() or >>>>>>>> work_pending()). Like in the case of a reset triggered through >>>>>>>> debugfs. >>>>>>> >>>>>>> >>>>>>> This is indeed one reason, another is as we said before that if >>>>>>> you share 'reset source' (meaning a delayed work) with another >>>>>>> client (i.e. RAS and KFD) it means you make assumption that the >>>>>>> other client always proceeds with the >>>>>>> reset exactly the same way as you expect. So today we have this >>>>>>> only in scheduler vs non scheduler reset happening - non >>>>>>> scheduler reset clients assume the reset is always fully executed >>>>>>> in HW while scheduler based reset makes shortcuts and not always >>>>>>> does HW reset hence they cannot share 'reset source' (delayed >>>>>>> work). Yes, we can always add this in the future if and when such >>>>>>> problem will arise but no one will remember this then and a new >>>>>>> bug will be introduced and will take time to find and resolve. >>>>>> >>>>>> Mhm, so your main concern is that we forget to correctly handle >>>>>> the new reset sources? >>>>>> >>>>>> How about we do it like this then: >>>>>> >>>>>> struct amdgpu_reset_domain { >>>>>> .... >>>>>> union { >>>>>> struct { >>>>>> struct work_item debugfs; >>>>>> struct work_item ras; >>>>>> .... 
>>>>>> }; >>>>>> struct work_item array[] >>>>>> } reset_sources; >>>>>> } >>>>>> >>>>> >>>>> If it's only about static array, >>>>> >>>>> enum amdgpu_reset_source { >>>>> >>>>> AMDGPU_RESET_SRC_RAS, >>>>> AMDGPU_RESET_SRC_ABC, >>>>> ..... >>>>> AMDGPU_RESET_SRC_XYZ, >>>>> AMDGPU_RESET_SRC_MAX >>>>> >>>>> }; >>>>> >>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index >>>>> for each work item >>>>> >>>>> >>>>> Thanks, >>>>> Lijo >>>> >>>> >>>> It's possible though it makes harder to generalize reset_domain >>>> later for other drivers. >>>> But still one caveat, look at amdgpu_recover_work_struct and its >>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get, >>>> At least for debugfs I need to return back the result of GPU reset >>>> and so I cannot store actual work items in the array mentioned above >>>> but rather pointers to work_item because I need a way to get back >>>> the return value from gpu_recover like I do now in >>>> amdgpu_device_gpu_recover. >>> >>> You should try to avoid that as well. >>> >>> See when the debugfs reset is canceled because of a scheduler reset >>> you won't get a useful return code either. >>> >>> What we should do instead is to cache the status of the last reset in >>> the reset domain. >>> >>> Regards, >>> Christian. >> >> >> Another problem with this approach - to execute the actual GPU reset >> I need access to a concrete amdgpu_device pointer from work struct >> (see xgpu_ai_mailbox_flr_work) as an example. If I store all work items in >> array in amdgpu_reset_domain the most I can retrieve is the >> reset_domain struct itself which won't help since it's dynamically >> allocated, not embedded in hive or adev and can be one per device >> or per entire hive in case of XGMI and so there is no way for me to >> reach back to amdgpu_device. Back pointer to adev* from >> amdgpu_reset_domain will only work for single device but not for XGMI >> hive where there are multiple devices in a hive. 
You could embed the work within another struct which provides the context of the work and have that as the array. struct amdgpu_reset_job { void *reset_context; // A struct containing the reset context, you may check amdgpu_reset_context. struct work_struct reset_work; unsigned long flags; }; And keep an array of amdgpu_reset_job[AMDGPU_RESET_SRC_MAX] inside reset domain. When the request is coming you may fill the requested device + reset_context and schedule the corresponding work. I guess the idea now is if a work item for a particular source is already active (indicated by flags), another device belonging to the reset domain won't be able to request the same workitem again. Thanks, Lijo > > Which is exactly the reason why I think we should always allocate the > hive structure, even if we only have one device. And a GPU reset should > then always work with the hive data structure and not adev. > > Adding a pointer from your reset work item back to the hive is then > trivial. > > Regards, > Christian. > >> >> Andrey >> >> >>> >>>> >>>> Andrey >>>> >>>> >>>>> >>>>>> Not 100% sure if that works, but something like that should do the >>>>>> trick. >>>>>> >>>>>> My main concern is that I don't want to allocate the work items on >>>>>> the stack and dynamic allocation (e.g. kmalloc) is usually not >>>>>> possible either. >>>>>> >>>>>> Additional to that putting/removing work items from a list, array >>>>>> or other container is a very common source for race conditions. >>>>>> >>>>>> Regards, >>>>>> Christian. >>>>>> >>>>>>> >>>>>>>>> to the cancellation list which you showed above. In current way >>>>>>>>> all this done automatically within reset_domain code and it's >>>>>>>>> agnostic to specific driver and it's specific list of reset >>>>>>>>> sources. 
Also in case we would want to generalize reset_domain >>>>>>>>> to other GPU drivers (which was >>>>>>>>> a plan as far as i remember) this explicit mention of each >>>>>>>>> reset works for cancellation is again less suitable in my opinion. >>>>>>>> >>>>>>>> Well we could put the work item for the scheduler independent >>>>>>>> reset source into the reset domain as well. But I'm not sure >>>>>>>> those additional reset sources should be part of any common >>>>>>>> handling, that is largely amdgpu specific. >>>>>>> >>>>>>> >>>>>>> So it's for sure more then one source for the reasons described >>>>>>> above, also note that for scheduler we already cancel delayed >>>>>>> work in drm_sched_stop so calling them again in amdgpu code kind >>>>>>> of superfluous. >>>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>> Christian. >>>>>>>> >>>>>>>>> >>>>>>>>> Andrey >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Andrey >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I see >>>>>>>>>>>>> a possible race where a new reset request is being >>>>>>>>>>>>> generated exactly after we finished the HW reset but before >>>>>>>>>>>>> we canceled out all pending resets - in such case you wold >>>>>>>>>>>>> not want to cancel this 'border line new' reset request. >>>>>>>>>>>> >>>>>>>>>>>> Why not? Any new reset request directly after a hardware >>>>>>>>>>>> reset is most likely just falsely generated by the reset >>>>>>>>>>>> itself. >>>>>>>>>>>> >>>>>>>>>>>> Ideally I would cancel all sources after the reset, but >>>>>>>>>>>> before starting any new work. >>>>>>>>>>>> >>>>>>>>>>>> Regards, >>>>>>>>>>>> Christian. >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Andrey >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>> Christian. 
>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> You can see that if many different reset sources share >>>>>>>>>>>>>>>> same work struct what can happen is that the first to >>>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from >>>>>>>>>>>>>>>> full HW reset because his bad job did signal for example >>>>>>>>>>>>>>>> or because his hunged IP block was able to recover >>>>>>>>>>>>>>>> through SW reset but in the meantime another reset >>>>>>>>>>>>>>>> source who needed an actual HW reset just silently >>>>>>>>>>>>>>>> returned and we end up with unhandled reset request. >>>>>>>>>>>>>>>> True that today this happens only to job timeout reset >>>>>>>>>>>>>>>> sources that are handled form within the scheduler and >>>>>>>>>>>>>>>> won't use this single work struct but no one prevents a >>>>>>>>>>>>>>>> future case for this to happen and also, if we actually >>>>>>>>>>>>>>>> want to unify scheduler time out handlers within reset >>>>>>>>>>>>>>>> domain (which seems to me the right design approach) we >>>>>>>>>>>>>>>> won't be able to use just one work struct for this >>>>>>>>>>>>>>>> reason anyway. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Just to add to this point - a reset domain is >>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of >>>>>>>>>>>>>>> clients from various reset sources for one device, it >>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The >>>>>>>>>>>>>>> job timeout on one device may not eventually result in >>>>>>>>>>>>>>> result, but a RAS error happening on another device at >>>>>>>>>>>>>>> the same time would need a reset. The second device's RAS >>>>>>>>>>>>>>> error cannot return seeing that a reset work already >>>>>>>>>>>>>>> started, or ignore the reset work given that another >>>>>>>>>>>>>>> device has filled the reset data. 
>>>>>>>>>>>>>>> >>>>>>>>>>>>>>> When there is a reset domain, it should take care of the >>>>>>>>>>>>>>> work scheduled and keeping it in device or any other >>>>>>>>>>>>>>> level doesn't sound good. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Thanks, >>>>>>>>>>>>>>> Lijo >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the >>>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme to >>>>>>>>>>>>>>>>> decide whether you need to schedule a reset, e.g. using >>>>>>>>>>>>>>>>> an atomic counter in the shared work struct that gets >>>>>>>>>>>>>>>>> incremented when a client wants to trigger a reset >>>>>>>>>>>>>>>>> (atomic_add_return). If that counter is exactly 1 after >>>>>>>>>>>>>>>>> incrementing, you need to fill in the rest of the work >>>>>>>>>>>>>>>>> struct and schedule the work. Otherwise, it's already >>>>>>>>>>>>>>>>> scheduled (or another client is in the process of >>>>>>>>>>>>>>>>> scheduling it) and you just return. When the worker >>>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it >>>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting >>>>>>>>>>>>>>>>> a reset will schedule the worker again. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>> Felix >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't >>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset >>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't >>>>>>>>>>>>>>>>>>> allocate anything on the stack either). 
>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>>>>>> allocation of work struct in amdgpu_device_gpu_recover >>>>>>>>>>>>>>>>>> is different from any other local variable we allocate >>>>>>>>>>>>>>>>>> in any function we call ? >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for >>>>>>>>>>>>>>>>>> reset to complete ? Also, see in >>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) - >>>>>>>>>>>>>>>>>> the caller expects the reset to complete before he >>>>>>>>>>>>>>>>>> returns. I can probably work around it in RAS code by >>>>>>>>>>>>>>>>>> calling atomic_set(&ras->in_recovery, 0) from some >>>>>>>>>>>>>>>>>> callback within actual reset function but regarding >>>>>>>>>>>>>>>>>> sysfs it actually expects a result returned indicating >>>>>>>>>>>>>>>>>> whether the call was successful or not. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach >>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and >>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each >>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the >>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into >>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be a >>>>>>>>>>>>>>>>>>>> generic things and not only or AMD. 
>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 >>>>>>>>>>>>>>>>>>>>>>>> +++-- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 >>>>>>>>>>>>>>>>>>>>>>>> deletions(-) >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>>>>>>> @@ 
-509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will >>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */ >>>>>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>>>>>>> /* 
Host driver will handle XGMI hive reset >>>>>>>>>>>>>>>>>>>>>>>> for SRIOV */ >>>>>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>>>>> 
amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ 
>>>>>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ 
enum >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, >>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + return 
true; >>>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending >>>>>>>>>>>>>>>>>>>>>>>> resets */ >>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ 
>>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << >>>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) >>>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>> 
container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct 
amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
>>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> 
>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>> >>>>>>>> >>>>>> >>> > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-12 6:06 ` Christian König 2022-05-12 9:21 ` Lazar, Lijo @ 2022-05-12 13:07 ` Andrey Grodzovsky 2022-05-12 13:15 ` Christian König 1 sibling, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-12 13:07 UTC (permalink / raw) To: Christian König, Lazar, Lijo, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy On 2022-05-12 02:06, Christian König wrote: > Am 11.05.22 um 22:27 schrieb Andrey Grodzovsky: >> >> On 2022-05-11 11:39, Christian König wrote: >>> Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky: >>>> On 2022-05-11 11:20, Lazar, Lijo wrote: >>>>> >>>>> >>>>> On 5/11/2022 7:28 PM, Christian König wrote: >>>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >>>>>>> On 2022-05-11 03:38, Christian König wrote: >>>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>>>>>>> [SNIP] >>>>>>>>>> E.g. in the reset code (either before or after the reset, >>>>>>>>>> that's debatable) you do something like this: >>>>>>>>>> >>>>>>>>>> for (i = 0; i < num_ring; ++i) >>>>>>>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>>>>>>> cancel_work(adev->ras_work); >>>>>>>>>> cancel_work(adev->iofault_work); >>>>>>>>>> cancel_work(adev->debugfs_work); >>>>>>>>>> ... >>>>>>>>>> >>>>>>>>>> You don't really need to track which reset source has fired >>>>>>>>>> and which hasn't, because that would be racy again. Instead >>>>>>>>>> just bluntly reset all possible sources. >>>>>>>>>> >>>>>>>>>> Christian. >>>>>>>>> >>>>>>>>> >>>>>>>>> I don't say we care if it fired once or twice (I need to add a >>>>>>>>> fix to only insert reset work to pending reset list if it's >>>>>>>>> not already there), the point of using list (or array) to >>>>>>>>> which you add and from which you remove is that the logic of >>>>>>>>> this is encapsulated within reset domain. 
In your way we need >>>>>>>>> to be aware who exactly schedules reset work and explicitly >>>>>>>>> cancel them, this also means that for any new source added in >>>>>>>>> the future you will need to remember to add him >>>>>>>> >>>>>>>> I don't think that this is a valid argument. Additionally to >>>>>>>> the schedulers we probably just need less than a handful of >>>>>>>> reset sources, most likely even just one or two is enough. >>>>>>>> >>>>>>>> The only justification I can see of having additional separate >>>>>>>> reset sources would be if somebody wants to know if a specific >>>>>>>> source has been handled or not (e.g. call flush_work() or >>>>>>>> work_pending()). Like in the case of a reset triggered through >>>>>>>> debugfs. >>>>>>> >>>>>>> >>>>>>> This is indeed one reason, another is as we said before that if >>>>>>> you share 'reset source' (meaning a delayed work) with another >>>>>>> client (i.e. RAS and KFD) it means you make assumption that the >>>>>>> other client always proceeds with the >>>>>>> reset exactly the same way as you expect. So today we have this >>>>>>> only in scheduler vs non scheduler reset happening - non >>>>>>> scheduler reset clients assume the reset is always fully >>>>>>> executed in HW while scheduler based reset makes shortcuts and >>>>>>> not always does HW reset hence they cannot share 'reset source' >>>>>>> (delayed work). Yes, we can always add this in the future if and >>>>>>> when such problem will arise but no one will remember this then >>>>>>> and a new bug will be introduced and will take time to find and >>>>>>> resolve. >>>>>> >>>>>> Mhm, so your main concern is that we forget to correctly handle >>>>>> the new reset sources? >>>>>> >>>>>> How about we do it like this then: >>>>>> >>>>>> struct amdgpu_reset_domain { >>>>>> .... >>>>>> union { >>>>>> struct { >>>>>> struct work_item debugfs; >>>>>> struct work_item ras; >>>>>> .... 
>>>>>> }; >>>>>> struct work_item array[] >>>>>> } reset_sources; >>>>>> } >>>>>> >>>>> >>>>> If it's only about static array, >>>>> >>>>> enum amdgpu_reset_soruce { >>>>> >>>>> AMDGPU_RESET_SRC_RAS, >>>>> AMDGPU_RESET_SRC_ABC, >>>>> ..... >>>>> AMDGPU_RESET_SRC_XYZ, >>>>> AMDGPU_RESET_SRC_MAX >>>>> >>>>> }; >>>>> >>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index >>>>> for each work item >>>>> >>>>> >>>>> Thanks, >>>>> Lijo >>>> >>>> >>>> It's possible though it makes harder to generalize reset_domain >>>> later for other drivers. >>>> But still one caveat, look at amdgpu_recover_work_struct and it's >>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get, >>>> At least for debugfs i need to return back the result of GPU reset >>>> and so I cannot store actual work items in the array mentioned above >>>> but rather pointers to work_item because i need a way to get back >>>> the return value from gpu_recover like I do now in >>>> amdgpu_device_gpu_recover. >>> >>> You should try to avoid that as well. >>> >>> See when the debugfs reset is canceled because of a scheduler reset >>> you won't get a useful return code either. >>> >>> What we should do instead is to cache the status of the last reset >>> in the reset domain. >>> >>> Regards, >>> Christian. >> >> >> Another problem with this approach - to execute the actaul GPU >> reset I need accesses to concrete amdgpu_device pointer from work >> struct (see xgpu_ai_mailbox_flr_work) as example. If i store all work >> items in >> array in amdgpu_reset_domain the most i can only retrieve is the >> reset_domain struct itself which won't help since it's dynamically >> allocated, not embedded in hive or adev and can can be one per device >> or per entire hive in case of XGMI and so there is no way for me to >> reach back to amdgpu_device. Back pointer to adev* from >> amdgpu_reset_domain will only work for single device but not for XGMI >> hive where there are multiple devices in a hive. 
> > Which is exactly the reason why I think we should always allocate the > hive structure, even if we only have one device. And a GPU reset > should then always work with the hive data structure and not adev. I am not sure why HIVE is the object we should work with, hive is one use case, single device is another, then Lijo described something called partition which is what? A particular pipe within the GPU? What they all share in common IMHO is that all of them use reset domain when they want a recovery operation, so maybe GPU reset should be oriented to work with reset domain? Andrey > > Adding a pointer from your reset work item back to the hive is then > trivial. > > Regards, > Christian. > >> >> Andrey >> >> >>> >>>> >>>> Andrey >>>> >>>> >>>>> >>>>>> Not 100% sure if that works, but something like that should do >>>>>> the trick. >>>>>> >>>>>> My main concern is that I don't want to allocate the work items >>>>>> on the stack and dynamic allocation (e.g. kmalloc) is usually not >>>>>> possible either. >>>>>> >>>>>> Additional to that putting/removing work items from a list, array >>>>>> or other container is a very common source for race conditions. >>>>>> >>>>>> Regards, >>>>>> Christian. >>>>>> >>>>>>> >>>>>>>>> to the cancellation list which you showed above. In current >>>>>>>>> way all this done automatically within reset_domain code and >>>>>>>>> it's agnostic to specific driver and it's specific list of >>>>>>>>> reset sources. Also in case we would want to generalize >>>>>>>>> reset_domain to other GPU drivers (which was >>>>>>>>> a plan as far as i remember) this explicit mention of each >>>>>>>>> reset works for cancellation is again less suitable in my >>>>>>>>> opinion. >>>>>>>> >>>>>>>> Well we could put the work item for the scheduler independent >>>>>>>> reset source into the reset domain as well. But I'm not sure >>>>>>>> those additional reset sources should be part of any common >>>>>>>> handling, that is largely amdgpu specific. 
>>>>>>> >>>>>>> >>>>>>> So it's for sure more then one source for the reasons described >>>>>>> above, also note that for scheduler we already cancel delayed >>>>>>> work in drm_sched_stop so calling them again in amdgpu code kind >>>>>>> of superfluous. >>>>>>> >>>>>>> Andrey >>>>>>> >>>>>>> >>>>>>>> >>>>>>>> Christian. >>>>>>>> >>>>>>>>> >>>>>>>>> Andrey >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Andrey >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I >>>>>>>>>>>>> see a possible race where a new reset request is being >>>>>>>>>>>>> generated exactly after we finished the HW reset but >>>>>>>>>>>>> before we canceled out all pending resets - in such case >>>>>>>>>>>>> you wold not want to cancel this 'border line new' reset >>>>>>>>>>>>> request. >>>>>>>>>>>> >>>>>>>>>>>> Why not? Any new reset request directly after a hardware >>>>>>>>>>>> reset is most likely just falsely generated by the reset >>>>>>>>>>>> itself. >>>>>>>>>>>> >>>>>>>>>>>> Ideally I would cancel all sources after the reset, but >>>>>>>>>>>> before starting any new work. >>>>>>>>>>>> >>>>>>>>>>>> Regards, >>>>>>>>>>>> Christian. >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Andrey >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>> Christian. 
>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> You can see that if many different reset sources share >>>>>>>>>>>>>>>> same work struct what can happen is that the first to >>>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from >>>>>>>>>>>>>>>> full HW reset because his bad job did signal for >>>>>>>>>>>>>>>> example or because his hunged IP block was able to >>>>>>>>>>>>>>>> recover through SW reset but in the meantime another >>>>>>>>>>>>>>>> reset source who needed an actual HW reset just >>>>>>>>>>>>>>>> silently returned and we end up with unhandled reset >>>>>>>>>>>>>>>> request. True that today this happens only to job >>>>>>>>>>>>>>>> timeout reset sources that are handled form within the >>>>>>>>>>>>>>>> scheduler and won't use this single work struct but no >>>>>>>>>>>>>>>> one prevents a future case for this to happen and also, >>>>>>>>>>>>>>>> if we actually want to unify scheduler time out >>>>>>>>>>>>>>>> handlers within reset domain (which seems to me the >>>>>>>>>>>>>>>> right design approach) we won't be able to use just one >>>>>>>>>>>>>>>> work struct for this reason anyway. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Just to add to this point - a reset domain is >>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of >>>>>>>>>>>>>>> clients from various reset sources for one device, it >>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The >>>>>>>>>>>>>>> job timeout on one device may not eventually result in >>>>>>>>>>>>>>> result, but a RAS error happening on another device at >>>>>>>>>>>>>>> the same time would need a reset. The second device's >>>>>>>>>>>>>>> RAS error cannot return seeing that a reset work already >>>>>>>>>>>>>>> started, or ignore the reset work given that another >>>>>>>>>>>>>>> device has filled the reset data. 
>>>>>>>>>>>>>>> >>>>>>>>>>>>>>> When there is a reset domain, it should take care of the >>>>>>>>>>>>>>> work scheduled and keeping it in device or any other >>>>>>>>>>>>>>> level doesn't sound good. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Thanks, >>>>>>>>>>>>>>> Lijo >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for the >>>>>>>>>>>>>>>>> reset domain. You could implement a lock-less scheme >>>>>>>>>>>>>>>>> to decide whether you need to schedule a reset, e.g. >>>>>>>>>>>>>>>>> using an atomic counter in the shared work struct that >>>>>>>>>>>>>>>>> gets incremented when a client wants to trigger a >>>>>>>>>>>>>>>>> reset (atomic_add_return). If that counter is exactly >>>>>>>>>>>>>>>>> 1 after incrementing, you need to fill in the rest of >>>>>>>>>>>>>>>>> the work struct and schedule the work. Otherwise, it's >>>>>>>>>>>>>>>>> already scheduled (or another client is in the process >>>>>>>>>>>>>>>>> of scheduling it) and you just return. When the worker >>>>>>>>>>>>>>>>> finishes (after confirming a successful reset), it >>>>>>>>>>>>>>>>> resets the counter to 0, so the next client requesting >>>>>>>>>>>>>>>>> a reset will schedule the worker again. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>> Felix >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't >>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset >>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't >>>>>>>>>>>>>>>>>>> allocate anything on the stack either). 
>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>>>>>> allocation of work struct in >>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover is different from any other >>>>>>>>>>>>>>>>>> local variable we allocate in any function we call ? >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for >>>>>>>>>>>>>>>>>> reset to complete ? Also, see in >>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) >>>>>>>>>>>>>>>>>> - the caller expects the reset to complete before he >>>>>>>>>>>>>>>>>> returns. I can probably work around it in RAS code by >>>>>>>>>>>>>>>>>> calling atomic_set(&ras->in_recovery, 0) from some >>>>>>>>>>>>>>>>>> callback within actual reset function but regarding >>>>>>>>>>>>>>>>>> sysfs it actually expects a result returned >>>>>>>>>>>>>>>>>> indicating whether the call was successful or not. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach >>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and >>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each >>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the >>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into >>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be a >>>>>>>>>>>>>>>>>>>> generic things and not only or AMD. 
>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 >>>>>>>>>>>>>>>>>>>>>>>> +++-- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 >>>>>>>>>>>>>>>>>>>>>>>> deletions(-) >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 
>>>>>>>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will >>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */ >>>>>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ 
>>>>>>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive >>>>>>>>>>>>>>>>>>>>>>>> reset for SRIOV */ >>>>>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>>> + 
INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; 
>>>>>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, >>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works); 
>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, *tmp; >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending >>>>>>>>>>>>>>>>>>>>>>>> resets */ >>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> void amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << >>>>>>>>>>>>>>>>>>>>>>>> 0) /* vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) >>>>>>>>>>>>>>>>>>>>>>>> /* sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> static void xgpu_ai_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, 
>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> static int xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device 
*adev, >>>>>>>>>>>>>>>>>>>>>>>> static void xgpu_nv_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>> static void xgpu_vi_mailbox_flr_work(struct >>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>> >>>>>>>> >>>>>> >>> > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-12 13:07 ` Andrey Grodzovsky @ 2022-05-12 13:15 ` Christian König 2022-05-12 13:44 ` Andrey Grodzovsky 2022-05-13 15:41 ` Andrey Grodzovsky 0 siblings, 2 replies; 40+ messages in thread From: Christian König @ 2022-05-12 13:15 UTC (permalink / raw) To: Andrey Grodzovsky, Lazar, Lijo, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy Am 12.05.22 um 15:07 schrieb Andrey Grodzovsky: > > On 2022-05-12 02:06, Christian König wrote: >> Am 11.05.22 um 22:27 schrieb Andrey Grodzovsky: >>> >>> On 2022-05-11 11:39, Christian König wrote: >>>> Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky: >>>>> On 2022-05-11 11:20, Lazar, Lijo wrote: >>>>>> >>>>>> >>>>>> On 5/11/2022 7:28 PM, Christian König wrote: >>>>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >>>>>>>> On 2022-05-11 03:38, Christian König wrote: >>>>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>>>>>>>> [SNIP] >>>>>>>>>>> E.g. in the reset code (either before or after the reset, >>>>>>>>>>> that's debatable) you do something like this: >>>>>>>>>>> >>>>>>>>>>> for (i = 0; i < num_ring; ++i) >>>>>>>>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>>>>>>>> cancel_work(adev->ras_work); >>>>>>>>>>> cancel_work(adev->iofault_work); >>>>>>>>>>> cancel_work(adev->debugfs_work); >>>>>>>>>>> ... >>>>>>>>>>> >>>>>>>>>>> You don't really need to track which reset source has fired >>>>>>>>>>> and which hasn't, because that would be racy again. Instead >>>>>>>>>>> just bluntly reset all possible sources. >>>>>>>>>>> >>>>>>>>>>> Christian. >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> I don't say we care if it fired once or twice (I need to add >>>>>>>>>> a fix to only insert reset work to pending reset list if it's >>>>>>>>>> not already there), the point of using list (or array) to >>>>>>>>>> which you add and from which you remove is that the logic of >>>>>>>>>> this is encapsulated within reset domain. 
In your way we need >>>>>>>>>> to be aware who exactly schedules reset work and explicitly >>>>>>>>>> cancel them, this also means that for any new source added in >>>>>>>>>> the future you will need to remember to add him >>>>>>>>> >>>>>>>>> I don't think that this is a valid argument. Additionally to >>>>>>>>> the schedulers we probably just need less than a handful of >>>>>>>>> reset sources, most likely even just one or two is enough. >>>>>>>>> >>>>>>>>> The only justification I can see of having additional separate >>>>>>>>> reset sources would be if somebody wants to know if a specific >>>>>>>>> source has been handled or not (e.g. call flush_work() or >>>>>>>>> work_pending()). Like in the case of a reset triggered through >>>>>>>>> debugfs. >>>>>>>> >>>>>>>> >>>>>>>> This is indeed one reason, another is as we said before that if >>>>>>>> you share 'reset source' (meaning a delayed work) with another >>>>>>>> client (i.e. RAS and KFD) it means you make assumption that the >>>>>>>> other client always proceeds with the >>>>>>>> reset exactly the same way as you expect. So today we have this >>>>>>>> only in scheduler vs non scheduler reset happening - non >>>>>>>> scheduler reset clients assume the reset is always fully >>>>>>>> executed in HW while scheduler based reset makes shortcuts and >>>>>>>> not always does HW reset hence they cannot share 'reset source' >>>>>>>> (delayed work). Yes, we can always add this in the future if >>>>>>>> and when such problem will arise but no one will remember this >>>>>>>> then and a new bug will be introduced and will take time to >>>>>>>> find and resolve. >>>>>>> >>>>>>> Mhm, so your main concern is that we forget to correctly handle >>>>>>> the new reset sources? >>>>>>> >>>>>>> How about we do it like this then: >>>>>>> >>>>>>> struct amdgpu_reset_domain { >>>>>>> .... >>>>>>> union { >>>>>>> struct { >>>>>>> struct work_item debugfs; >>>>>>> struct work_item ras; >>>>>>> .... 
>>>>>>> }; >>>>>>> struct work_item array[] >>>>>>> } reset_sources; >>>>>>> } >>>>>>> >>>>>> >>>>>> If it's only about static array, >>>>>> >>>>>> enum amdgpu_reset_soruce { >>>>>> >>>>>> AMDGPU_RESET_SRC_RAS, >>>>>> AMDGPU_RESET_SRC_ABC, >>>>>> ..... >>>>>> AMDGPU_RESET_SRC_XYZ, >>>>>> AMDGPU_RESET_SRC_MAX >>>>>> >>>>>> }; >>>>>> >>>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index >>>>>> for each work item >>>>>> >>>>>> >>>>>> Thanks, >>>>>> Lijo >>>>> >>>>> >>>>> It's possible though it makes harder to generalize reset_domain >>>>> later for other drivers. >>>>> But still one caveat, look at amdgpu_recover_work_struct and it's >>>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get, >>>>> At least for debugfs i need to return back the result of GPU reset >>>>> and so I cannot store actual work items in the array mentioned above >>>>> but rather pointers to work_item because i need a way to get back >>>>> the return value from gpu_recover like I do now in >>>>> amdgpu_device_gpu_recover. >>>> >>>> You should try to avoid that as well. >>>> >>>> See when the debugfs reset is canceled because of a scheduler reset >>>> you won't get a useful return code either. >>>> >>>> What we should do instead is to cache the status of the last reset >>>> in the reset domain. >>>> >>>> Regards, >>>> Christian. >>> >>> >>> Another problem with this approach - to execute the actaul GPU >>> reset I need accesses to concrete amdgpu_device pointer from work >>> struct (see xgpu_ai_mailbox_flr_work) as example. If i store all >>> work items in >>> array in amdgpu_reset_domain the most i can only retrieve is the >>> reset_domain struct itself which won't help since it's dynamically >>> allocated, not embedded in hive or adev and can can be one per >>> device or per entire hive in case of XGMI and so there is no way for >>> me to reach back to amdgpu_device. 
Back pointer to adev* from >>> amdgpu_reset_domain will only work for single device but not for >>> XGMI hive where there are multiple devices in a hive. >> >> Which is exactly the reason why I think we should always allocate the >> hive structure, even if we only have one device. And a GPU reset >> should then always work with the hive data structure and not adev. > > > I am not sure why HIVE is the object we should work with, hive is one > use case, single device is another, then Lijo described something > called partition which is what ? Particular pipe within GPU ?. What > they all share in common > IMHO is that all of them use reset domain when they want a recovery > operation, so maybe GPU reset should be oriented to work with reset > domain ? Yes, exactly that's the idea. Basically the reset domain knowns which amdgpu devices it needs to reset together. If you then represent that so that you always have a hive even when you only have one device in it, or if you put an array of devices which needs to be reset together into the reset domain doesn't matter. Maybe go for the later approach, that is probably a bit cleaner and less code to change. Christian. > > Andrey > > >> >> Adding a pointer from your reset work item back to the hive is then >> trivial. >> >> Regards, >> Christian. >> >>> >>> Andrey >>> >>> >>>> >>>>> >>>>> Andrey >>>>> >>>>> >>>>>> >>>>>>> Not 100% sure if that works, but something like that should do >>>>>>> the trick. >>>>>>> >>>>>>> My main concern is that I don't want to allocate the work items >>>>>>> on the stack and dynamic allocation (e.g. kmalloc) is usually >>>>>>> not possible either. >>>>>>> >>>>>>> Additional to that putting/removing work items from a list, >>>>>>> array or other container is a very common source for race >>>>>>> conditions. >>>>>>> >>>>>>> Regards, >>>>>>> Christian. >>>>>>> >>>>>>>> >>>>>>>>>> to the cancellation list which you showed above. 
In current >>>>>>>>>> way all this done automatically within reset_domain code and >>>>>>>>>> it's agnostic to specific driver and it's specific list of >>>>>>>>>> reset sources. Also in case we would want to generalize >>>>>>>>>> reset_domain to other GPU drivers (which was >>>>>>>>>> a plan as far as i remember) this explicit mention of each >>>>>>>>>> reset works for cancellation is again less suitable in my >>>>>>>>>> opinion. >>>>>>>>> >>>>>>>>> Well we could put the work item for the scheduler independent >>>>>>>>> reset source into the reset domain as well. But I'm not sure >>>>>>>>> those additional reset sources should be part of any common >>>>>>>>> handling, that is largely amdgpu specific. >>>>>>>> >>>>>>>> >>>>>>>> So it's for sure more then one source for the reasons described >>>>>>>> above, also note that for scheduler we already cancel delayed >>>>>>>> work in drm_sched_stop so calling them again in amdgpu code >>>>>>>> kind of superfluous. >>>>>>>> >>>>>>>> Andrey >>>>>>>> >>>>>>>> >>>>>>>>> >>>>>>>>> Christian. >>>>>>>>> >>>>>>>>>> >>>>>>>>>> Andrey >>>>>>>>>> >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Andrey >>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I >>>>>>>>>>>>>> see a possible race where a new reset request is being >>>>>>>>>>>>>> generated exactly after we finished the HW reset but >>>>>>>>>>>>>> before we canceled out all pending resets - in such case >>>>>>>>>>>>>> you wold not want to cancel this 'border line new' reset >>>>>>>>>>>>>> request. >>>>>>>>>>>>> >>>>>>>>>>>>> Why not? Any new reset request directly after a hardware >>>>>>>>>>>>> reset is most likely just falsely generated by the reset >>>>>>>>>>>>> itself. >>>>>>>>>>>>> >>>>>>>>>>>>> Ideally I would cancel all sources after the reset, but >>>>>>>>>>>>> before starting any new work. 
>>>>>>>>>>>>> >>>>>>>>>>>>> Regards, >>>>>>>>>>>>> Christian. >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> You can see that if many different reset sources share >>>>>>>>>>>>>>>>> same work struct what can happen is that the first to >>>>>>>>>>>>>>>>> obtain the lock you describe bellow might opt out from >>>>>>>>>>>>>>>>> full HW reset because his bad job did signal for >>>>>>>>>>>>>>>>> example or because his hunged IP block was able to >>>>>>>>>>>>>>>>> recover through SW reset but in the meantime another >>>>>>>>>>>>>>>>> reset source who needed an actual HW reset just >>>>>>>>>>>>>>>>> silently returned and we end up with unhandled reset >>>>>>>>>>>>>>>>> request. True that today this happens only to job >>>>>>>>>>>>>>>>> timeout reset sources that are handled form within the >>>>>>>>>>>>>>>>> scheduler and won't use this single work struct but no >>>>>>>>>>>>>>>>> one prevents a future case for this to happen and >>>>>>>>>>>>>>>>> also, if we actually want to unify scheduler time out >>>>>>>>>>>>>>>>> handlers within reset domain (which seems to me the >>>>>>>>>>>>>>>>> right design approach) we won't be able to use just >>>>>>>>>>>>>>>>> one work struct for this reason anyway. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Just to add to this point - a reset domain is >>>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of >>>>>>>>>>>>>>>> clients from various reset sources for one device, it >>>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. The >>>>>>>>>>>>>>>> job timeout on one device may not eventually result in >>>>>>>>>>>>>>>> result, but a RAS error happening on another device at >>>>>>>>>>>>>>>> the same time would need a reset. 
The second device's >>>>>>>>>>>>>>>> RAS error cannot return seeing that a reset work >>>>>>>>>>>>>>>> already started, or ignore the reset work given that >>>>>>>>>>>>>>>> another device has filled the reset data. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> When there is a reset domain, it should take care of >>>>>>>>>>>>>>>> the work scheduled and keeping it in device or any >>>>>>>>>>>>>>>> other level doesn't sound good. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Thanks, >>>>>>>>>>>>>>>> Lijo >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for >>>>>>>>>>>>>>>>>> the reset domain. You could implement a lock-less >>>>>>>>>>>>>>>>>> scheme to decide whether you need to schedule a >>>>>>>>>>>>>>>>>> reset, e.g. using an atomic counter in the shared >>>>>>>>>>>>>>>>>> work struct that gets incremented when a client wants >>>>>>>>>>>>>>>>>> to trigger a reset (atomic_add_return). If that >>>>>>>>>>>>>>>>>> counter is exactly 1 after incrementing, you need to >>>>>>>>>>>>>>>>>> fill in the rest of the work struct and schedule the >>>>>>>>>>>>>>>>>> work. Otherwise, it's already scheduled (or another >>>>>>>>>>>>>>>>>> client is in the process of scheduling it) and you >>>>>>>>>>>>>>>>>> just return. When the worker finishes (after >>>>>>>>>>>>>>>>>> confirming a successful reset), it resets the counter >>>>>>>>>>>>>>>>>> to 0, so the next client requesting a reset will >>>>>>>>>>>>>>>>>> schedule the worker again. 
>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>> Felix >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't >>>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset >>>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't >>>>>>>>>>>>>>>>>>>> allocate anything on the stack either). >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding stack >>>>>>>>>>>>>>>>>>> allocations - we do it all the time when we call >>>>>>>>>>>>>>>>>>> functions, even during GPU resets, how on stack >>>>>>>>>>>>>>>>>>> allocation of work struct in >>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover is different from any >>>>>>>>>>>>>>>>>>> other local variable we allocate in any function we >>>>>>>>>>>>>>>>>>> call ? >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for >>>>>>>>>>>>>>>>>>> reset to complete ? Also, see in >>>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get (debugfs) >>>>>>>>>>>>>>>>>>> - the caller expects the reset to complete before he >>>>>>>>>>>>>>>>>>> returns. I can probably work around it in RAS code >>>>>>>>>>>>>>>>>>> by calling atomic_set(&ras->in_recovery, 0) from >>>>>>>>>>>>>>>>>>> some callback within actual reset function but >>>>>>>>>>>>>>>>>>> regarding sysfs it actually expects a result >>>>>>>>>>>>>>>>>>> returned indicating whether the call was successful >>>>>>>>>>>>>>>>>>> or not. >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>> Christian. 
>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach >>>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and >>>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each >>>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the >>>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into >>>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be >>>>>>>>>>>>>>>>>>>>> a generic things and not only or AMD. >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | >>>>>>>>>>>>>>>>>>>>>>>>> 17 +++-- >>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 >>>>>>>>>>>>>>>>>>>>>>>>> +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 >>>>>>>>>>>>>>>>>>>>>>>>> deletions(-) >>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>>> 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = 
>>>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will >>>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */ >>>>>>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive >>>>>>>>>>>>>>>>>>>>>>>>> reset for SRIOV */ >>>>>>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); 
>>>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>> *adev, >>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work = >>>>>>>>>>>>>>>>>>>>>>>>> {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 
0); >>>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> +enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, 
>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>>>>>> 
amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, >>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, >>>>>>>>>>>>>>>>>>>>>>>>> *tmp; >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); 
>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending >>>>>>>>>>>>>>>>>>>>>>>>> resets */ >>>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> void >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 >>>>>>>>>>>>>>>>>>>>>>>>> << 0) /* vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << >>>>>>>>>>>>>>>>>>>>>>>>> 1) /* sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src rcv_irq; >>>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work(struct work_struct >>>>>>>>>>>>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>>> int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, 
&adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> static int >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_request_init_data(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work(struct work_struct >>>>>>>>>>>>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>>> int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, 
>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work(struct work_struct >>>>>>>>>>>>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, flr_work); >>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>>> /* 
wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>> >>>>>>> >>>> >> ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-12 13:15 ` Christian König @ 2022-05-12 13:44 ` Andrey Grodzovsky 2022-05-13 15:41 ` Andrey Grodzovsky 1 sibling, 0 replies; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-12 13:44 UTC (permalink / raw) To: Christian König, Lazar, Lijo, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy [-- Attachment #1: Type: text/plain, Size: 1218 bytes --] Sure, I will investigate that. What about the ticket which Lijo raised which was basically doing 8 resets instead of one? Lijo - can this ticket wait until I come up with this new design for the amdgpu reset function, or do you need a quick solution now, in which case we can use the already existing patch temporarily. Andrey On 2022-05-12 09:15, Christian König wrote: >> I am not sure why HIVE is the object we should work with, hive is one >> use case, single device is another, then Lijo described something >> called partition which is what ? Particular pipe within GPU ?. What >> they all share in common >> IMHO is that all of them use reset domain when they want a recovery >> operation, so maybe GPU reset should be oriented to work with reset >> domain ? > > Yes, exactly that's the idea. > > Basically the reset domain knowns which amdgpu devices it needs to > reset together. > > If you then represent that so that you always have a hive even when > you only have one device in it, or if you put an array of devices > which needs to be reset together into the reset domain doesn't matter. > > Maybe go for the later approach, that is probably a bit cleaner and > less code to change. > > Christian. [-- Attachment #2: Type: text/html, Size: 1819 bytes --] ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-12 13:15 ` Christian König 2022-05-12 13:44 ` Andrey Grodzovsky @ 2022-05-13 15:41 ` Andrey Grodzovsky 2022-05-16 14:12 ` Andrey Grodzovsky 1 sibling, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-13 15:41 UTC (permalink / raw) To: Christian König, Lazar, Lijo, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy On 2022-05-12 09:15, Christian König wrote: > Am 12.05.22 um 15:07 schrieb Andrey Grodzovsky: >> >> On 2022-05-12 02:06, Christian König wrote: >>> Am 11.05.22 um 22:27 schrieb Andrey Grodzovsky: >>>> >>>> On 2022-05-11 11:39, Christian König wrote: >>>>> Am 11.05.22 um 17:35 schrieb Andrey Grodzovsky: >>>>>> On 2022-05-11 11:20, Lazar, Lijo wrote: >>>>>>> >>>>>>> >>>>>>> On 5/11/2022 7:28 PM, Christian König wrote: >>>>>>>> Am 11.05.22 um 15:43 schrieb Andrey Grodzovsky: >>>>>>>>> On 2022-05-11 03:38, Christian König wrote: >>>>>>>>>> Am 10.05.22 um 20:53 schrieb Andrey Grodzovsky: >>>>>>>>>>> [SNIP] >>>>>>>>>>>> E.g. in the reset code (either before or after the reset, >>>>>>>>>>>> that's debatable) you do something like this: >>>>>>>>>>>> >>>>>>>>>>>> for (i = 0; i < num_ring; ++i) >>>>>>>>>>>> cancel_delayed_work(ring[i]->scheduler....) >>>>>>>>>>>> cancel_work(adev->ras_work); >>>>>>>>>>>> cancel_work(adev->iofault_work); >>>>>>>>>>>> cancel_work(adev->debugfs_work); >>>>>>>>>>>> ... >>>>>>>>>>>> >>>>>>>>>>>> You don't really need to track which reset source has fired >>>>>>>>>>>> and which hasn't, because that would be racy again. Instead >>>>>>>>>>>> just bluntly reset all possible sources. >>>>>>>>>>>> >>>>>>>>>>>> Christian. 
>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> I don't say we care if it fired once or twice (I need to add >>>>>>>>>>> a fix to only insert reset work to pending reset list if >>>>>>>>>>> it's not already there), the point of using list (or array) >>>>>>>>>>> to which you add and from which you remove is that the logic >>>>>>>>>>> of this is encapsulated within reset domain. In your way we >>>>>>>>>>> need to be aware who exactly schedules reset work and >>>>>>>>>>> explicitly cancel them, this also means that for any new >>>>>>>>>>> source added in the future you will need to remember to add him >>>>>>>>>> >>>>>>>>>> I don't think that this is a valid argument. Additionally to >>>>>>>>>> the schedulers we probably just need less than a handful of >>>>>>>>>> reset sources, most likely even just one or two is enough. >>>>>>>>>> >>>>>>>>>> The only justification I can see of having additional >>>>>>>>>> separate reset sources would be if somebody wants to know if >>>>>>>>>> a specific source has been handled or not (e.g. call >>>>>>>>>> flush_work() or work_pending()). Like in the case of a reset >>>>>>>>>> triggered through debugfs. >>>>>>>>> >>>>>>>>> >>>>>>>>> This is indeed one reason, another is as we said before that >>>>>>>>> if you share 'reset source' (meaning a delayed work) with >>>>>>>>> another client (i.e. RAS and KFD) it means you make assumption >>>>>>>>> that the other client always proceeds with the >>>>>>>>> reset exactly the same way as you expect. So today we have >>>>>>>>> this only in scheduler vs non scheduler reset happening - non >>>>>>>>> scheduler reset clients assume the reset is always fully >>>>>>>>> executed in HW while scheduler based reset makes shortcuts and >>>>>>>>> not always does HW reset hence they cannot share 'reset >>>>>>>>> source' (delayed work). 
Yes, we can always add this in the >>>>>>>>> future if and when such problem will arise but no one will >>>>>>>>> remember this then and a new bug will be introduced and will >>>>>>>>> take time to find and resolve. >>>>>>>> >>>>>>>> Mhm, so your main concern is that we forget to correctly handle >>>>>>>> the new reset sources? >>>>>>>> >>>>>>>> How about we do it like this then: >>>>>>>> >>>>>>>> struct amdgpu_reset_domain { >>>>>>>> .... >>>>>>>> union { >>>>>>>> struct { >>>>>>>> struct work_item debugfs; >>>>>>>> struct work_item ras; >>>>>>>> .... >>>>>>>> }; >>>>>>>> struct work_item array[] >>>>>>>> } reset_sources; >>>>>>>> } >>>>>>>> >>>>>>> >>>>>>> If it's only about static array, >>>>>>> >>>>>>> enum amdgpu_reset_soruce { >>>>>>> >>>>>>> AMDGPU_RESET_SRC_RAS, >>>>>>> AMDGPU_RESET_SRC_ABC, >>>>>>> ..... >>>>>>> AMDGPU_RESET_SRC_XYZ, >>>>>>> AMDGPU_RESET_SRC_MAX >>>>>>> >>>>>>> }; >>>>>>> >>>>>>> struct work_struct reset_work[AMDGPU_RESET_SRC_MAX]; => An index >>>>>>> for each work item >>>>>>> >>>>>>> >>>>>>> Thanks, >>>>>>> Lijo >>>>>> >>>>>> >>>>>> It's possible though it makes harder to generalize reset_domain >>>>>> later for other drivers. >>>>>> But still one caveat, look at amdgpu_recover_work_struct and it's >>>>>> usage in amdgpu_device_gpu_recover and in gpu_recover_get, >>>>>> At least for debugfs i need to return back the result of GPU >>>>>> reset and so I cannot store actual work items in the array >>>>>> mentioned above >>>>>> but rather pointers to work_item because i need a way to get back >>>>>> the return value from gpu_recover like I do now in >>>>>> amdgpu_device_gpu_recover. >>>>> >>>>> You should try to avoid that as well. >>>>> >>>>> See when the debugfs reset is canceled because of a scheduler >>>>> reset you won't get a useful return code either. >>>>> >>>>> What we should do instead is to cache the status of the last reset >>>>> in the reset domain. >>>>> >>>>> Regards, >>>>> Christian. 
>>>> >>>> >>>> Another problem with this approach - to execute the actaul GPU >>>> reset I need accesses to concrete amdgpu_device pointer from work >>>> struct (see xgpu_ai_mailbox_flr_work) as example. If i store all >>>> work items in >>>> array in amdgpu_reset_domain the most i can only retrieve is the >>>> reset_domain struct itself which won't help since it's dynamically >>>> allocated, not embedded in hive or adev and can can be one per >>>> device or per entire hive in case of XGMI and so there is no way >>>> for me to reach back to amdgpu_device. Back pointer to adev* from >>>> amdgpu_reset_domain will only work for single device but not for >>>> XGMI hive where there are multiple devices in a hive. >>> >>> Which is exactly the reason why I think we should always allocate >>> the hive structure, even if we only have one device. And a GPU reset >>> should then always work with the hive data structure and not adev. >> >> >> I am not sure why HIVE is the object we should work with, hive is one >> use case, single device is another, then Lijo described something >> called partition which is what ? Particular pipe within GPU ?. What >> they all share in common >> IMHO is that all of them use reset domain when they want a recovery >> operation, so maybe GPU reset should be oriented to work with reset >> domain ? > > Yes, exactly that's the idea. > > Basically the reset domain knowns which amdgpu devices it needs to > reset together. > > If you then represent that so that you always have a hive even when > you only have one device in it, or if you put an array of devices > which needs to be reset together into the reset domain doesn't matter. > > Maybe go for the later approach, that is probably a bit cleaner and > less code to change. > > Christian. 
Unfortunately this approach also raises a few difficulties - First - if holding an array of devices in reset_domain, then when you come to the GPU reset function you don't really know which adev is the one that triggered the reset, and this is actually essential to some procedures like emergency restart. Second - in the XGMI case we must take into account that one of the hive members might go away at runtime (I could do echo 1 > /sysfs/pci_id/remove on it for example at any moment) - so now we need to maintain this array and probably mark such an entry with NULL on XGMI node removal, and then there might be hot insertion and all this adds more complications. I now tend to prefer your initial solution for its simplicity, and the result will be what we need - "E.g. in the reset code (either before or after the reset, that's debatable) you do something like this: for (i = 0; i < num_ring; ++i) cancel_delayed_work(ring[i]->scheduler....) cancel_work(adev->ras_work); cancel_work(adev->iofault_work); cancel_work(adev->debugfs_work); " And while here for each new reset source you need to remember to add another line of code, the same can be said about the adev* array in reset_context as you will need to remember to add a new source there anyway. Let me know what you think. Andrey > >> >> Andrey >> >> >>> >>> Adding a pointer from your reset work item back to the hive is then >>> trivial. >>> >>> Regards, >>> Christian. >>> >>>> >>>> Andrey >>>> >>>> >>>>> >>>>>> >>>>>> Andrey >>>>>> >>>>>> >>>>>>> >>>>>>>> Not 100% sure if that works, but something like that should do >>>>>>>> the trick. >>>>>>>> >>>>>>>> My main concern is that I don't want to allocate the work items >>>>>>>> on the stack and dynamic allocation (e.g. kmalloc) is usually >>>>>>>> not possible either. >>>>>>>> >>>>>>>> Additional to that putting/removing work items from a list, >>>>>>>> array or other container is a very common source for race >>>>>>>> conditions. >>>>>>>> >>>>>>>> Regards, >>>>>>>> Christian. 
>>>>>>>> >>>>>>>>> >>>>>>>>>>> to the cancellation list which you showed above. In current >>>>>>>>>>> way all this done automatically within reset_domain code and >>>>>>>>>>> it's agnostic to specific driver and it's specific list of >>>>>>>>>>> reset sources. Also in case we would want to generalize >>>>>>>>>>> reset_domain to other GPU drivers (which was >>>>>>>>>>> a plan as far as i remember) this explicit mention of each >>>>>>>>>>> reset works for cancellation is again less suitable in my >>>>>>>>>>> opinion. >>>>>>>>>> >>>>>>>>>> Well we could put the work item for the scheduler independent >>>>>>>>>> reset source into the reset domain as well. But I'm not sure >>>>>>>>>> those additional reset sources should be part of any common >>>>>>>>>> handling, that is largely amdgpu specific. >>>>>>>>> >>>>>>>>> >>>>>>>>> So it's for sure more then one source for the reasons >>>>>>>>> described above, also note that for scheduler we already >>>>>>>>> cancel delayed work in drm_sched_stop so calling them again in >>>>>>>>> amdgpu code kind of superfluous. >>>>>>>>> >>>>>>>>> Andrey >>>>>>>>> >>>>>>>>> >>>>>>>>>> >>>>>>>>>> Christian. >>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Andrey >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Andrey >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> The only difference is I chose to do the canceling right >>>>>>>>>>>>>>> BEFORE the HW reset and not AFTER. I did this because I >>>>>>>>>>>>>>> see a possible race where a new reset request is being >>>>>>>>>>>>>>> generated exactly after we finished the HW reset but >>>>>>>>>>>>>>> before we canceled out all pending resets - in such case >>>>>>>>>>>>>>> you wold not want to cancel this 'border line new' reset >>>>>>>>>>>>>>> request. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Why not? Any new reset request directly after a hardware >>>>>>>>>>>>>> reset is most likely just falsely generated by the reset >>>>>>>>>>>>>> itself. 
>>>>>>>>>>>>>> >>>>>>>>>>>>>> Ideally I would cancel all sources after the reset, but >>>>>>>>>>>>>> before starting any new work. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> You can see that if many different reset sources >>>>>>>>>>>>>>>>>> share same work struct what can happen is that the >>>>>>>>>>>>>>>>>> first to obtain the lock you describe bellow might >>>>>>>>>>>>>>>>>> opt out from full HW reset because his bad job did >>>>>>>>>>>>>>>>>> signal for example or because his hunged IP block was >>>>>>>>>>>>>>>>>> able to recover through SW reset but in the meantime >>>>>>>>>>>>>>>>>> another reset source who needed an actual HW reset >>>>>>>>>>>>>>>>>> just silently returned and we end up with unhandled >>>>>>>>>>>>>>>>>> reset request. True that today this happens only to >>>>>>>>>>>>>>>>>> job timeout reset sources that are handled form >>>>>>>>>>>>>>>>>> within the scheduler and won't use this single work >>>>>>>>>>>>>>>>>> struct but no one prevents a future case for this to >>>>>>>>>>>>>>>>>> happen and also, if we actually want to unify >>>>>>>>>>>>>>>>>> scheduler time out handlers within reset domain >>>>>>>>>>>>>>>>>> (which seems to me the right design approach) we >>>>>>>>>>>>>>>>>> won't be able to use just one work struct for this >>>>>>>>>>>>>>>>>> reason anyway. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Just to add to this point - a reset domain is >>>>>>>>>>>>>>>>> co-operative domain. In addition to sharing a set of >>>>>>>>>>>>>>>>> clients from various reset sources for one device, it >>>>>>>>>>>>>>>>> also will have a set of devices like in XGMI hive. 
The >>>>>>>>>>>>>>>>> job timeout on one device may not eventually result in >>>>>>>>>>>>>>>>> result, but a RAS error happening on another device at >>>>>>>>>>>>>>>>> the same time would need a reset. The second device's >>>>>>>>>>>>>>>>> RAS error cannot return seeing that a reset work >>>>>>>>>>>>>>>>> already started, or ignore the reset work given that >>>>>>>>>>>>>>>>> another device has filled the reset data. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> When there is a reset domain, it should take care of >>>>>>>>>>>>>>>>> the work scheduled and keeping it in device or any >>>>>>>>>>>>>>>>> other level doesn't sound good. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Thanks, >>>>>>>>>>>>>>>>> Lijo >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> I'd put the reset work struct into the reset_domain >>>>>>>>>>>>>>>>>>> struct. That way you'd have exactly one worker for >>>>>>>>>>>>>>>>>>> the reset domain. You could implement a lock-less >>>>>>>>>>>>>>>>>>> scheme to decide whether you need to schedule a >>>>>>>>>>>>>>>>>>> reset, e.g. using an atomic counter in the shared >>>>>>>>>>>>>>>>>>> work struct that gets incremented when a client >>>>>>>>>>>>>>>>>>> wants to trigger a reset (atomic_add_return). If >>>>>>>>>>>>>>>>>>> that counter is exactly 1 after incrementing, you >>>>>>>>>>>>>>>>>>> need to fill in the rest of the work struct and >>>>>>>>>>>>>>>>>>> schedule the work. Otherwise, it's already scheduled >>>>>>>>>>>>>>>>>>> (or another client is in the process of scheduling >>>>>>>>>>>>>>>>>>> it) and you just return. When the worker finishes >>>>>>>>>>>>>>>>>>> (after confirming a successful reset), it resets the >>>>>>>>>>>>>>>>>>> counter to 0, so the next client requesting a reset >>>>>>>>>>>>>>>>>>> will schedule the worker again. 
>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>> Felix >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> Additional to that keep in mind that you can't >>>>>>>>>>>>>>>>>>>>> allocate any memory before or during the GPU reset >>>>>>>>>>>>>>>>>>>>> nor wait for the reset to complete (so you can't >>>>>>>>>>>>>>>>>>>>> allocate anything on the stack either). >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> There is no dynamic allocation here, regarding >>>>>>>>>>>>>>>>>>>> stack allocations - we do it all the time when we >>>>>>>>>>>>>>>>>>>> call functions, even during GPU resets, how on >>>>>>>>>>>>>>>>>>>> stack allocation of work struct in >>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover is different from any >>>>>>>>>>>>>>>>>>>> other local variable we allocate in any function we >>>>>>>>>>>>>>>>>>>> call ? >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> I am also not sure why it's not allowed to wait for >>>>>>>>>>>>>>>>>>>> reset to complete ? Also, see in >>>>>>>>>>>>>>>>>>>> amdgpu_ras_do_recovery and gpu_recover_get >>>>>>>>>>>>>>>>>>>> (debugfs) - the caller expects the reset to >>>>>>>>>>>>>>>>>>>> complete before he returns. I can probably work >>>>>>>>>>>>>>>>>>>> around it in RAS code by calling >>>>>>>>>>>>>>>>>>>> atomic_set(&ras->in_recovery, 0) from some callback >>>>>>>>>>>>>>>>>>>> within actual reset function but regarding sysfs it >>>>>>>>>>>>>>>>>>>> actually expects a result returned indicating >>>>>>>>>>>>>>>>>>>> whether the call was successful or not. >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> I don't think that concept you try here will work. >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>>> Christian. 
>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> Also in general seems to me it's cleaner approach >>>>>>>>>>>>>>>>>>>>>> where this logic (the work items) are held and >>>>>>>>>>>>>>>>>>>>>> handled in reset_domain and are not split in each >>>>>>>>>>>>>>>>>>>>>> adev or any other entity. We might want in the >>>>>>>>>>>>>>>>>>>>>> future to even move the scheduler handling into >>>>>>>>>>>>>>>>>>>>>> reset domain since reset domain is supposed to be >>>>>>>>>>>>>>>>>>>>>> a generic things and not only or AMD. >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> Andrey >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>> Regards, >>>>>>>>>>>>>>>>>>>>>>>>> Christian. >>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky >>>>>>>>>>>>>>>>>>>>>>>>>> <andrey.grodzovsky@amd.com> >>>>>>>>>>>>>>>>>>>>>>>>>> Tested-by: Bai Zoy <Zoy.Bai@amd.com> >>>>>>>>>>>>>>>>>>>>>>>>>> --- >>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +--- >>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | >>>>>>>>>>>>>>>>>>>>>>>>>> 17 +++-- >>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 + >>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | >>>>>>>>>>>>>>>>>>>>>>>>>> 73 +++++++++++++++++++++- >>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +- >>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++- >>>>>>>>>>>>>>>>>>>>>>>>>> 8 files changed, 104 insertions(+), 24 >>>>>>>>>>>>>>>>>>>>>>>>>> deletions(-) >>>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>>> 
a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>>>> index 4264abc5604d..99efd8317547 100644 >>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -109,6 +109,7 @@ >>>>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_fdinfo.h" >>>>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_mca.h" >>>>>>>>>>>>>>>>>>>>>>>>>> #include "amdgpu_ras.h" >>>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>>>>> #define MAX_GPU_INSTANCE 16 >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -509,16 +510,6 @@ struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_allowed_register_entry { >>>>>>>>>>>>>>>>>>>>>>>>>> bool grbm_indexed; >>>>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>>>> -enum amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>>>>> - AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>>>>> -}; >>>>>>>>>>>>>>>>>>>>>>>>>> - >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_video_codec_info { >>>>>>>>>>>>>>>>>>>>>>>>>> u32 codec_type; >>>>>>>>>>>>>>>>>>>>>>>>>> u32 max_width; >>>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>>>> index e582f1044c0f..7fa82269c30f 100644 >>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -5201,6 +5201,12 @@ int >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>>>>> 
amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> tmp_vram_lost_counter = >>>>>>>>>>>>>>>>>>>>>>>>>> atomic_read(&((adev)->vram_lost_counter)); >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> + /* Drop all pending resets since we will >>>>>>>>>>>>>>>>>>>>>>>>>> reset now anyway */ >>>>>>>>>>>>>>>>>>>>>>>>>> + tmp_adev = >>>>>>>>>>>>>>>>>>>>>>>>>> list_first_entry(device_list_handle, struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device, >>>>>>>>>>>>>>>>>>>>>>>>>> + reset_list); >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(tmp_adev->reset_domain); >>>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> /* Actual ASIC resets if needed.*/ >>>>>>>>>>>>>>>>>>>>>>>>>> /* Host driver will handle XGMI hive >>>>>>>>>>>>>>>>>>>>>>>>>> reset for SRIOV */ >>>>>>>>>>>>>>>>>>>>>>>>>> if (amdgpu_sriov_vf(adev)) { >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -5296,7 +5302,7 @@ int >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct base; >>>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct base; >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev; >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_job *job; >>>>>>>>>>>>>>>>>>>>>>>>>> int ret; >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -5304,7 +5310,7 @@ struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct { >>>>>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work(struct >>>>>>>>>>>>>>>>>>>>>>>>>> work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_recover_work_struct >>>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base); >>>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_recover_work_struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> *recover_work = container_of(work, struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_recover_work_struct, base.base.work); >>>>>>>>>>>>>>>>>>>>>>>>>> recover_work->ret = >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover_imp(recover_work->adev, >>>>>>>>>>>>>>>>>>>>>>>>>> recover_work->job); >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -5316,12 +5322,15 @@ int >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_recover_work_struct work >>>>>>>>>>>>>>>>>>>>>>>>>> = {.adev = adev, .job = job}; >>>>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&work.base, >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_DELAYED_WORK(&work.base.base, >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_queue_gpu_recover_work); >>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&work.base.node); >>>>>>>>>>>>>>>>>>>>>>>>>> if >>>>>>>>>>>>>>>>>>>>>>>>>> (!amdgpu_reset_domain_schedule(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>>>> &work.base)) >>>>>>>>>>>>>>>>>>>>>>>>>> return -EAGAIN; >>>>>>>>>>>>>>>>>>>>>>>>>> - flush_work(&work.base); >>>>>>>>>>>>>>>>>>>>>>>>>> + flush_delayed_work(&work.base.base); >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>>>> &work.base); >>>>>>>>>>>>>>>>>>>>>>>>>> return work.ret; >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>>>> index c80af0889773..ffddd419c351 100644 >>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -134,6 +134,9 @@ struct 
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain >>>>>>>>>>>>>>>>>>>>>>>>>> *amdgpu_reset_create_reset_domain(enum >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_d >>>>>>>>>>>>>>>>>>>>>>>>>> atomic_set(&reset_domain->in_gpu_reset, 0); >>>>>>>>>>>>>>>>>>>>>>>>>> init_rwsem(&reset_domain->sem); >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> INIT_LIST_HEAD(&reset_domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_init(&reset_domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> return reset_domain; >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>>>> index 1949dbe28a86..863ec5720fc1 100644 >>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -24,7 +24,18 @@ >>>>>>>>>>>>>>>>>>>>>>>>>> #ifndef __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>>>>> #define __AMDGPU_RESET_H__ >>>>>>>>>>>>>>>>>>>>>>>>>> -#include "amdgpu.h" >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/atomic.h> >>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/mutex.h> >>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/list.h> >>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/kref.h> >>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/rwsem.h> >>>>>>>>>>>>>>>>>>>>>>>>>> +#include <linux/workqueue.h> >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_device; >>>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_job; >>>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_hive_info; >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS { >>>>>>>>>>>>>>>>>>>>>>>>>> AMDGPU_SKIP_HW_RESET = 1, >>>>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> +enum 
amd_reset_method { >>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_NONE = -1, >>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_LEGACY = 0, >>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE0, >>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE1, >>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_MODE2, >>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_BACO, >>>>>>>>>>>>>>>>>>>>>>>>>> + AMD_RESET_METHOD_PCI, >>>>>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method method; >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *reset_req_dev; >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -40,6 +62,8 @@ struct amdgpu_reset_context { >>>>>>>>>>>>>>>>>>>>>>>>>> unsigned long flags; >>>>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_control; >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_handler { >>>>>>>>>>>>>>>>>>>>>>>>>> enum amd_reset_method reset_method; >>>>>>>>>>>>>>>>>>>>>>>>>> struct list_head handler_list; >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -76,12 +100,21 @@ enum >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_type { >>>>>>>>>>>>>>>>>>>>>>>>>> XGMI_HIVE >>>>>>>>>>>>>>>>>>>>>>>>>> }; >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> +struct amdgpu_reset_work_struct { >>>>>>>>>>>>>>>>>>>>>>>>>> + struct delayed_work base; >>>>>>>>>>>>>>>>>>>>>>>>>> + struct list_head node; >>>>>>>>>>>>>>>>>>>>>>>>>> +}; >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_reset_domain { >>>>>>>>>>>>>>>>>>>>>>>>>> struct kref refcount; >>>>>>>>>>>>>>>>>>>>>>>>>> struct workqueue_struct *wq; >>>>>>>>>>>>>>>>>>>>>>>>>> enum amdgpu_reset_domain_type type; >>>>>>>>>>>>>>>>>>>>>>>>>> struct rw_semaphore sem; >>>>>>>>>>>>>>>>>>>>>>>>>> atomic_t in_gpu_reset; >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> + struct list_head pending_works; >>>>>>>>>>>>>>>>>>>>>>>>>> + struct mutex reset_lock; >>>>>>>>>>>>>>>>>>>>>>>>>> }; 
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -113,9 +146,43 @@ static inline void >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_put_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *dom >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> static inline bool >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_schedule(struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>>> - return queue_work(domain->wq, work); >>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> + if (!queue_delayed_work(domain->wq, >>>>>>>>>>>>>>>>>>>>>>>>>> &work->base, 0)) { >>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>>> + return false; >>>>>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> + list_add_tail(&work->node, >>>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works); >>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> + return true; >>>>>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain, >>>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *work) >>>>>>>>>>>>>>>>>>>>>>>>>> +{ >>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&work->node); >>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>>> +} >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> +static inline void >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_pending_list(struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *domain) >>>>>>>>>>>>>>>>>>>>>>>>>> +{ 
>>>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct *entry, >>>>>>>>>>>>>>>>>>>>>>>>>> *tmp; >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_lock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>>> + list_for_each_entry_safe(entry, tmp, >>>>>>>>>>>>>>>>>>>>>>>>>> &domain->pending_works, node) { >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> + list_del_init(&entry->node); >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> + /* Stop any other related pending >>>>>>>>>>>>>>>>>>>>>>>>>> resets */ >>>>>>>>>>>>>>>>>>>>>>>>>> + cancel_delayed_work(&entry->base); >>>>>>>>>>>>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> + mutex_unlock(&domain->reset_lock); >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> void >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_lock_reset_domain(struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain *reset_domain); >>>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>>>> index 239f232f9c02..574e870d3064 100644 >>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -25,6 +25,7 @@ >>>>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_VIRT_H >>>>>>>>>>>>>>>>>>>>>>>>>> #include "amdgv_sriovmsg.h" >>>>>>>>>>>>>>>>>>>>>>>>>> +#include "amdgpu_reset.h" >>>>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 >>>>>>>>>>>>>>>>>>>>>>>>>> << 0) /* vBIOS is sr-iov ready */ >>>>>>>>>>>>>>>>>>>>>>>>>> #define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << >>>>>>>>>>>>>>>>>>>>>>>>>> 1) /* sr-iov is enabled on this GPU */ >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -230,7 +231,7 @@ struct amdgpu_virt { >>>>>>>>>>>>>>>>>>>>>>>>>> uint32_t reg_val_offs; >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src ack_irq; >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_irq_src 
rcv_irq; >>>>>>>>>>>>>>>>>>>>>>>>>> - struct work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_reset_work_struct flr_work; >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_mm_table mm_table; >>>>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops *ops; >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_vf_error_buffer vf_errors; >>>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>>>> index b81acf59870c..f3d1c2be9292 100644 >>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -251,7 +251,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work(struct work_struct >>>>>>>>>>>>>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>>>> flr_work); >>>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>>>> int timeout = >>>>>>>>>>>>>>>>>>>>>>>>>> AI_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -380,7 +380,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>>>> + 
>>>>>>>>>>>>>>>>>>>>>>>>>> INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -389,6 +390,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> static int >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_ai_request_init_data(struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev) >>>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>>>> index 22c10b97ea81..927b3d5bb1d0 100644 >>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -275,7 +275,7 @@ static int >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work(struct work_struct >>>>>>>>>>>>>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>>>> flr_work); >>>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>>>> 
flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>>>> int timeout = >>>>>>>>>>>>>>>>>>>>>>>>>> NV_MAILBOX_POLL_FLR_TIMEDOUT; >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -407,7 +407,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -416,6 +417,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_nv_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>>>>> diff --git >>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>>>> index 7b63d30b9b79..1d4ef5c70730 100644 >>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -512,7 +512,7 @@ static int 
>>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_set_mailbox_ack_irq(struct >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev, >>>>>>>>>>>>>>>>>>>>>>>>>> static void >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work(struct work_struct >>>>>>>>>>>>>>>>>>>>>>>>>> *work) >>>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>>> - struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>>>> flr_work); >>>>>>>>>>>>>>>>>>>>>>>>>> + struct amdgpu_virt *virt = >>>>>>>>>>>>>>>>>>>>>>>>>> container_of(work, struct amdgpu_virt, >>>>>>>>>>>>>>>>>>>>>>>>>> flr_work.base.work); >>>>>>>>>>>>>>>>>>>>>>>>>> struct amdgpu_device *adev = >>>>>>>>>>>>>>>>>>>>>>>>>> container_of(virt, struct amdgpu_device, virt); >>>>>>>>>>>>>>>>>>>>>>>>>> /* wait until RCV_MSG become 3 */ >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -610,7 +610,8 @@ int >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_get_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>>>> return r; >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> - INIT_WORK(&adev->virt.flr_work, >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> INIT_DELAYED_WORK(&adev->virt.flr_work.base, >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_flr_work); >>>>>>>>>>>>>>>>>>>>>>>>>> + INIT_LIST_HEAD(&adev->virt.flr_work.node); >>>>>>>>>>>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>>>>>>>>>>>> @@ -619,6 +620,8 @@ void >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_mailbox_put_irq(struct amdgpu_device >>>>>>>>>>>>>>>>>>>>>>>>>> *adev) >>>>>>>>>>>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.ack_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> + >>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_reset_domain_del_pendning_work(adev->reset_domain, >>>>>>>>>>>>>>>>>>>>>>>>>> &adev->virt.flr_work); >>>>>>>>>>>>>>>>>>>>>>>>>> } 
>>>>>>>>>>>>>>>>>>>>>>>>>> const struct amdgpu_virt_ops >>>>>>>>>>>>>>>>>>>>>>>>>> xgpu_vi_virt_ops = { >>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>> >>>>>>>> >>>>> >>> > ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-13 15:41 ` Andrey Grodzovsky @ 2022-05-16 14:12 ` Andrey Grodzovsky 2022-05-16 15:08 ` Christian König 0 siblings, 1 reply; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-16 14:12 UTC (permalink / raw) To: Christian König, Lazar, Lijo, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy [-- Attachment #1: Type: text/plain, Size: 1655 bytes --] Ping Andrey On 2022-05-13 11:41, Andrey Grodzovsky wrote: >> Yes, exactly that's the idea. >> >> Basically the reset domain knowns which amdgpu devices it needs to >> reset together. >> >> If you then represent that so that you always have a hive even when >> you only have one device in it, or if you put an array of devices >> which needs to be reset together into the reset domain doesn't matter. >> >> Maybe go for the later approach, that is probably a bit cleaner and >> less code to change. >> >> Christian. > > > Unfortunately this approach raises also a few difficulties - > First - if holding array of devices in reset_domain then when you come > to GPU reset function you don't really know which adev is the one > triggered the reset and this is actually essential to some procedures > like emergency restart. > > Second - in XGMI case we must take into account that one of the hive > members might go away in runtime (i could do echo 1 > > /sysfs/pci_id/remove on it for example at any moment) - so now we need > to maintain this array and mark such entry with NULL probably on XGMI > node removal , and then there might be hot insertion and all this adds > more complications. > > I now tend to prefer your initial solution for it's simplicity and the > result will be what we need - > > "E.g. in the reset code (either before or after the reset, that's > debatable) you do something like this: > > for (i = 0; i < num_ring; ++i) > cancel_delayed_work(ring[i]->scheduler....) 
> cancel_work(adev->ras_work); > cancel_work(adev->iofault_work); > cancel_work(adev->debugfs_work); > " > > Let me know what you think. > > Andrey [-- Attachment #2: Type: text/html, Size: 2515 bytes --] ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-16 14:12 ` Andrey Grodzovsky @ 2022-05-16 15:08 ` Christian König 2022-05-16 15:13 ` Andrey Grodzovsky 0 siblings, 1 reply; 40+ messages in thread From: Christian König @ 2022-05-16 15:08 UTC (permalink / raw) To: Andrey Grodzovsky, Lazar, Lijo, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy [-- Attachment #1: Type: text/plain, Size: 1991 bytes --] Am 16.05.22 um 16:12 schrieb Andrey Grodzovsky: > > Ping > Ah, yes sorry. > Andrey > > On 2022-05-13 11:41, Andrey Grodzovsky wrote: >>> Yes, exactly that's the idea. >>> >>> Basically the reset domain knowns which amdgpu devices it needs to >>> reset together. >>> >>> If you then represent that so that you always have a hive even when >>> you only have one device in it, or if you put an array of devices >>> which needs to be reset together into the reset domain doesn't matter. >>> >>> Maybe go for the later approach, that is probably a bit cleaner and >>> less code to change. >>> >>> Christian. >> >> >> Unfortunately this approach raises also a few difficulties - >> First - if holding array of devices in reset_domain then when you >> come to GPU reset function you don't really know which adev is the >> one triggered the reset and this is actually essential to some >> procedures like emergency restart. What is "emergency restart"? That's not some requirement I know about. >> >> Second - in XGMI case we must take into account that one of the hive >> members might go away in runtime (i could do echo 1 > >> /sysfs/pci_id/remove on it for example at any moment) - so now we >> need to maintain this array and mark such entry with NULL probably on >> XGMI node removal , and then there might be hot insertion and all >> this adds more complications. >> >> I now tend to prefer your initial solution for it's simplicity and >> the result will be what we need - >> >> "E.g. 
in the reset code (either before or after the reset, that's >> debatable) you do something like this: >> >> for (i = 0; i < num_ring; ++i) >> cancel_delayed_work(ring[i]->scheduler....) >> cancel_work(adev->ras_work); >> cancel_work(adev->iofault_work); >> cancel_work(adev->debugfs_work); >> " Works for me. I already expected that switching over the reset to be based on the reset context wouldn't be that easy. Regards, Christian. >> >> Let me know what you think. >> >> Andrey [-- Attachment #2: Type: text/html, Size: 3552 bytes --] ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive. 2022-05-16 15:08 ` Christian König @ 2022-05-16 15:13 ` Andrey Grodzovsky 0 siblings, 0 replies; 40+ messages in thread From: Andrey Grodzovsky @ 2022-05-16 15:13 UTC (permalink / raw) To: Christian König, Lazar, Lijo, Christian König, Felix Kuehling, amd-gfx Cc: Bai Zoy [-- Attachment #1: Type: text/plain, Size: 2413 bytes --] On 2022-05-16 11:08, Christian König wrote: > Am 16.05.22 um 16:12 schrieb Andrey Grodzovsky: >> >> Ping >> > > Ah, yes sorry. > >> Andrey >> >> On 2022-05-13 11:41, Andrey Grodzovsky wrote: >>>> Yes, exactly that's the idea. >>>> >>>> Basically the reset domain knowns which amdgpu devices it needs to >>>> reset together. >>>> >>>> If you then represent that so that you always have a hive even when >>>> you only have one device in it, or if you put an array of devices >>>> which needs to be reset together into the reset domain doesn't matter. >>>> >>>> Maybe go for the later approach, that is probably a bit cleaner and >>>> less code to change. >>>> >>>> Christian. >>> >>> >>> Unfortunately this approach raises also a few difficulties - >>> First - if holding array of devices in reset_domain then when you >>> come to GPU reset function you don't really know which adev is the >>> one triggered the reset and this is actually essential to some >>> procedures like emergency restart. > > What is "emergency restart"? That's not some requirement I know about. Emergency restart is something you can see at the beginning of the amdgpu_gpu_recover function - it's a historical workaround for some types of ASICs that weren't able to do a full reset, I think. We should eventually remove it, but for now I think it's still in use. 
> >>> >>> Second - in XGMI case we must take into account that one of the hive >>> members might go away in runtime (i could do echo 1 > >>> /sysfs/pci_id/remove on it for example at any moment) - so now we >>> need to maintain this array and mark such entry with NULL probably >>> on XGMI node removal , and then there might be hot insertion and all >>> this adds more complications. >>> >>> I now tend to prefer your initial solution for it's simplicity and >>> the result will be what we need - >>> >>> "E.g. in the reset code (either before or after the reset, that's >>> debatable) you do something like this: >>> >>> for (i = 0; i < num_ring; ++i) >>> cancel_delayed_work(ring[i]->scheduler....) >>> cancel_work(adev->ras_work); >>> cancel_work(adev->iofault_work); >>> cancel_work(adev->debugfs_work); >>> " > > Works for me. I already expected that switching over the reset to be > based on the reset context wouldn't be that easy. > > Regards, > Christian. Ok - i will resend a patch. Andrey > >>> >>> Let me know what you think. >>> >>> Andrey > [-- Attachment #2: Type: text/html, Size: 4584 bytes --] ^ permalink raw reply [flat|nested] 40+ messages in thread
end of thread, other threads:[~2022-05-16 15:13 UTC | newest] Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2022-05-04 16:18 [PATCH] drm/amdgpu: Fix multiple GPU resets in XGMI hive Andrey Grodzovsky 2022-05-05 10:09 ` Christian König 2022-05-05 13:15 ` Andrey Grodzovsky 2022-05-05 13:23 ` Christian König 2022-05-05 13:54 ` Andrey Grodzovsky 2022-05-05 15:06 ` Christian König 2022-05-05 18:57 ` Andrey Grodzovsky 2022-05-05 19:49 ` Felix Kuehling 2022-05-05 21:47 ` Andrey Grodzovsky 2022-05-06 5:41 ` Luben Tuikov 2022-05-06 6:02 ` Lazar, Lijo 2022-05-06 8:56 ` Christian König 2022-05-10 16:00 ` Andrey Grodzovsky 2022-05-10 16:17 ` Christian König 2022-05-10 17:01 ` Andrey Grodzovsky 2022-05-10 17:19 ` Christian König 2022-05-10 18:53 ` Andrey Grodzovsky 2022-05-11 7:38 ` Christian König 2022-05-11 13:43 ` Andrey Grodzovsky 2022-05-11 13:58 ` Christian König 2022-05-11 15:20 ` Lazar, Lijo 2022-05-11 15:35 ` Andrey Grodzovsky 2022-05-11 15:37 ` Lazar, Lijo 2022-05-11 15:43 ` Andrey Grodzovsky 2022-05-11 15:46 ` Lazar, Lijo 2022-05-11 15:53 ` Andrey Grodzovsky 2022-05-11 15:39 ` Christian König 2022-05-11 15:57 ` Andrey Grodzovsky 2022-05-12 6:03 ` Christian König 2022-05-12 12:57 ` Andrey Grodzovsky 2022-05-11 20:27 ` Andrey Grodzovsky 2022-05-12 6:06 ` Christian König 2022-05-12 9:21 ` Lazar, Lijo 2022-05-12 13:07 ` Andrey Grodzovsky 2022-05-12 13:15 ` Christian König 2022-05-12 13:44 ` Andrey Grodzovsky 2022-05-13 15:41 ` Andrey Grodzovsky 2022-05-16 14:12 ` Andrey Grodzovsky 2022-05-16 15:08 ` Christian König 2022-05-16 15:13 ` Andrey Grodzovsky
This is an external index of several public inboxes; see the mirroring instructions for how to clone and mirror all data and code used by this external index.