* [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.
@ 2019-12-11 20:38 ` Andrey Grodzovsky
0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
To: dri-devel, amd-gfx; +Cc: Alexander.Deucher, Le.Ma, Evan.Quan, hawking.zhang
In preparation for doing XGMI reset synchronization using task barrier.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 +++++-------------------------
2 files changed, 12 insertions(+), 66 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a78a363..50bab33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1001,8 +1001,6 @@ struct amdgpu_device {
bool pm_sysfs_en;
bool ucode_sysfs_en;
-
- bool in_baco;
};
static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7324a5f..1d19edfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2667,7 +2667,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
adev->asic_reset_res = (adev->in_baco == false) ?
amdgpu_device_baco_enter(adev->ddev) :
- amdgpu_device_baco_exit(adev->ddev);
+ qamdgpu_device_baco_exit(adev->ddev);
else
adev->asic_reset_res = amdgpu_asic_reset(adev);
@@ -3796,18 +3796,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
return r;
}
-static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
- struct amdgpu_hive_info *hive,
+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
struct list_head *device_list_handle,
bool *need_full_reset_arg)
{
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset = *need_full_reset_arg, vram_lost = false;
int r = 0;
- int cpu = smp_processor_id();
- bool use_baco =
- (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
- true : false;
/*
* ASIC reset has to be done on all HGMI hive nodes ASAP
@@ -3815,62 +3810,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
*/
if (need_full_reset) {
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
- /*
- * For XGMI run all resets in parallel to speed up the
- * process by scheduling the highpri wq on different
- * cpus. For XGMI with baco reset, all nodes must enter
- * baco within close proximity before anyone exit.
- */
+ /* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work_on(cpu, system_highpri_wq,
- &tmp_adev->xgmi_reset_work))
+ if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
r = -EALREADY;
- cpu = cpumask_next(cpu, cpu_online_mask);
} else
r = amdgpu_asic_reset(tmp_adev);
- if (r)
- break;
- }
-
- /* For XGMI wait for all work to complete before proceed */
- if (!r) {
- list_for_each_entry(tmp_adev, device_list_handle,
- gmc.xgmi.head) {
- if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- flush_work(&tmp_adev->xgmi_reset_work);
- r = tmp_adev->asic_reset_res;
- if (r)
- break;
- if (use_baco)
- tmp_adev->in_baco = true;
- }
- }
- }
- /*
- * For XGMI with baco reset, need exit baco phase by scheduling
- * xgmi_reset_work one more time. PSP reset and sGPU skips this
- * phase. Not assume the situation that PSP reset and baco reset
- * coexist within an XGMI hive.
- */
-
- if (!r && use_baco) {
- cpu = smp_processor_id();
- list_for_each_entry(tmp_adev, device_list_handle,
- gmc.xgmi.head) {
- if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work_on(cpu,
- system_highpri_wq,
- &tmp_adev->xgmi_reset_work))
- r = -EALREADY;
- if (r)
- break;
- cpu = cpumask_next(cpu, cpu_online_mask);
- }
+ if (r) {
+ DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+ r, tmp_adev->ddev->unique);
+ break;
}
}
- if (!r && use_baco) {
+ /* For XGMI wait for all PSP resets to complete before proceed */
+ if (!r) {
list_for_each_entry(tmp_adev, device_list_handle,
gmc.xgmi.head) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
@@ -3878,21 +3833,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
r = tmp_adev->asic_reset_res;
if (r)
break;
- tmp_adev->in_baco = false;
}
}
}
-
- if (r) {
- DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
- r, tmp_adev->ddev->unique);
- goto end;
- }
}
if (!r && amdgpu_ras_intr_triggered())
amdgpu_ras_intr_cleared();
+
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (need_full_reset) {
/* post card */
@@ -4181,8 +4130,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (r)
adev->asic_reset_res = r;
} else {
- r = amdgpu_do_asic_reset(adev, hive, device_list_handle,
- &need_full_reset);
+ r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
if (r && r == -EAGAIN)
goto retry;
}
--
2.7.4
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.
@ 2019-12-11 20:38 ` Andrey Grodzovsky
0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
To: dri-devel, amd-gfx
Cc: Alexander.Deucher, Le.Ma, Evan.Quan, Andrey Grodzovsky, hawking.zhang
In preparation for doing XGMI reset synchronization using task barrier.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 +++++-------------------------
2 files changed, 12 insertions(+), 66 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a78a363..50bab33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1001,8 +1001,6 @@ struct amdgpu_device {
bool pm_sysfs_en;
bool ucode_sysfs_en;
-
- bool in_baco;
};
static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7324a5f..1d19edfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2667,7 +2667,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
adev->asic_reset_res = (adev->in_baco == false) ?
amdgpu_device_baco_enter(adev->ddev) :
- amdgpu_device_baco_exit(adev->ddev);
+ qamdgpu_device_baco_exit(adev->ddev);
else
adev->asic_reset_res = amdgpu_asic_reset(adev);
@@ -3796,18 +3796,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
return r;
}
-static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
- struct amdgpu_hive_info *hive,
+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
struct list_head *device_list_handle,
bool *need_full_reset_arg)
{
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset = *need_full_reset_arg, vram_lost = false;
int r = 0;
- int cpu = smp_processor_id();
- bool use_baco =
- (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
- true : false;
/*
* ASIC reset has to be done on all HGMI hive nodes ASAP
@@ -3815,62 +3810,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
*/
if (need_full_reset) {
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
- /*
- * For XGMI run all resets in parallel to speed up the
- * process by scheduling the highpri wq on different
- * cpus. For XGMI with baco reset, all nodes must enter
- * baco within close proximity before anyone exit.
- */
+ /* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work_on(cpu, system_highpri_wq,
- &tmp_adev->xgmi_reset_work))
+ if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
r = -EALREADY;
- cpu = cpumask_next(cpu, cpu_online_mask);
} else
r = amdgpu_asic_reset(tmp_adev);
- if (r)
- break;
- }
-
- /* For XGMI wait for all work to complete before proceed */
- if (!r) {
- list_for_each_entry(tmp_adev, device_list_handle,
- gmc.xgmi.head) {
- if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- flush_work(&tmp_adev->xgmi_reset_work);
- r = tmp_adev->asic_reset_res;
- if (r)
- break;
- if (use_baco)
- tmp_adev->in_baco = true;
- }
- }
- }
- /*
- * For XGMI with baco reset, need exit baco phase by scheduling
- * xgmi_reset_work one more time. PSP reset and sGPU skips this
- * phase. Not assume the situation that PSP reset and baco reset
- * coexist within an XGMI hive.
- */
-
- if (!r && use_baco) {
- cpu = smp_processor_id();
- list_for_each_entry(tmp_adev, device_list_handle,
- gmc.xgmi.head) {
- if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work_on(cpu,
- system_highpri_wq,
- &tmp_adev->xgmi_reset_work))
- r = -EALREADY;
- if (r)
- break;
- cpu = cpumask_next(cpu, cpu_online_mask);
- }
+ if (r) {
+ DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+ r, tmp_adev->ddev->unique);
+ break;
}
}
- if (!r && use_baco) {
+ /* For XGMI wait for all PSP resets to complete before proceed */
+ if (!r) {
list_for_each_entry(tmp_adev, device_list_handle,
gmc.xgmi.head) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
@@ -3878,21 +3833,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
r = tmp_adev->asic_reset_res;
if (r)
break;
- tmp_adev->in_baco = false;
}
}
}
-
- if (r) {
- DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
- r, tmp_adev->ddev->unique);
- goto end;
- }
}
if (!r && amdgpu_ras_intr_triggered())
amdgpu_ras_intr_cleared();
+
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (need_full_reset) {
/* post card */
@@ -4181,8 +4130,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (r)
adev->asic_reset_res = r;
} else {
- r = amdgpu_do_asic_reset(adev, hive, device_list_handle,
- &need_full_reset);
+ r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
if (r && r == -EAGAIN)
goto retry;
}
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [RESEND PATCH 2/5] drm: Add Reusable task barrier.
2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-11 20:38 ` Andrey Grodzovsky
-1 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
To: dri-devel, amd-gfx; +Cc: Alexander.Deucher, Le.Ma, Evan.Quan, hawking.zhang
It is used to synchronize N threads at a rendevouz point before execution
of critical code that has to be started by all the threads at approximatly
the same time.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
include/drm/task_barrier.h | 106 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 106 insertions(+)
create mode 100644 include/drm/task_barrier.h
diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h
new file mode 100644
index 0000000..81fb0f7
--- /dev/null
+++ b/include/drm/task_barrier.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include <linux/semaphore.h>
+#include <linux/atomic.h>
+
+/*
+ * Reusable 2 PHASE task barrier (randevouz point) implementation for N tasks.
+ * Based on the Little book of sempahores - https://greenteapress.com/wp/semaphores/
+ */
+
+
+
+#ifndef DRM_TASK_BARRIER_H_
+#define DRM_TASK_BARRIER_H_
+
+/*
+ * Represents an instance of a task barrier.
+ */
+struct task_barrier {
+ unsigned int n;
+ atomic_t count;
+ struct semaphore enter_turnstile;
+ struct semaphore exit_turnstile;
+};
+
+static inline void task_barrier_signal_turnstile(struct semaphore *turnstile,
+ unsigned int n)
+{
+ int i;
+
+ for (i = 0 ; i < n; i++)
+ up(turnstile);
+}
+
+static inline void task_barrier_init(struct task_barrier *tb)
+{
+ tb->n = 0;
+ atomic_set(&tb->count, 0);
+ sema_init(&tb->enter_turnstile, 0);
+ sema_init(&tb->exit_turnstile, 0);
+}
+
+static inline void task_barrier_add_task(struct task_barrier *tb)
+{
+ tb->n++;
+}
+
+static inline void task_barrier_rem_task(struct task_barrier *tb)
+{
+ tb->n--;
+}
+
+/*
+ * Lines up all the threads BEFORE the critical point.
+ *
+ * When all thread passed this code the entry barrier is back to locked state.
+ */
+static inline void task_barrier_enter(struct task_barrier *tb)
+{
+ if (atomic_inc_return(&tb->count) == tb->n)
+ task_barrier_signal_turnstile(&tb->enter_turnstile, tb->n);
+
+ down(&tb->enter_turnstile);
+}
+
+/*
+ * Lines up all the threads AFTER the critical point.
+ *
+ * This function is used to avoid any one thread running ahead of the reset if
+ * the barrier is used in a loop (repeatedly) .
+ */
+static inline void task_barrier_exit(struct task_barrier *tb)
+{
+ if (atomic_dec_return(&tb->count) == 0)
+ task_barrier_signal_turnstile(&tb->exit_turnstile, tb->n);
+
+ down(&tb->exit_turnstile);
+}
+
+static inline void task_barrier_full(struct task_barrier *tb)
+{
+ task_barrier_enter(tb);
+ task_barrier_exit(tb);
+}
+
+#endif
--
2.7.4
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [RESEND PATCH 2/5] drm: Add Reusable task barrier.
@ 2019-12-11 20:38 ` Andrey Grodzovsky
0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
To: dri-devel, amd-gfx
Cc: Alexander.Deucher, Le.Ma, Evan.Quan, Andrey Grodzovsky, hawking.zhang
It is used to synchronize N threads at a rendevouz point before execution
of critical code that has to be started by all the threads at approximatly
the same time.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
include/drm/task_barrier.h | 106 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 106 insertions(+)
create mode 100644 include/drm/task_barrier.h
diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h
new file mode 100644
index 0000000..81fb0f7
--- /dev/null
+++ b/include/drm/task_barrier.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include <linux/semaphore.h>
+#include <linux/atomic.h>
+
+/*
+ * Reusable 2 PHASE task barrier (randevouz point) implementation for N tasks.
+ * Based on the Little book of sempahores - https://greenteapress.com/wp/semaphores/
+ */
+
+
+
+#ifndef DRM_TASK_BARRIER_H_
+#define DRM_TASK_BARRIER_H_
+
+/*
+ * Represents an instance of a task barrier.
+ */
+struct task_barrier {
+ unsigned int n;
+ atomic_t count;
+ struct semaphore enter_turnstile;
+ struct semaphore exit_turnstile;
+};
+
+static inline void task_barrier_signal_turnstile(struct semaphore *turnstile,
+ unsigned int n)
+{
+ int i;
+
+ for (i = 0 ; i < n; i++)
+ up(turnstile);
+}
+
+static inline void task_barrier_init(struct task_barrier *tb)
+{
+ tb->n = 0;
+ atomic_set(&tb->count, 0);
+ sema_init(&tb->enter_turnstile, 0);
+ sema_init(&tb->exit_turnstile, 0);
+}
+
+static inline void task_barrier_add_task(struct task_barrier *tb)
+{
+ tb->n++;
+}
+
+static inline void task_barrier_rem_task(struct task_barrier *tb)
+{
+ tb->n--;
+}
+
+/*
+ * Lines up all the threads BEFORE the critical point.
+ *
+ * When all thread passed this code the entry barrier is back to locked state.
+ */
+static inline void task_barrier_enter(struct task_barrier *tb)
+{
+ if (atomic_inc_return(&tb->count) == tb->n)
+ task_barrier_signal_turnstile(&tb->enter_turnstile, tb->n);
+
+ down(&tb->enter_turnstile);
+}
+
+/*
+ * Lines up all the threads AFTER the critical point.
+ *
+ * This function is used to avoid any one thread running ahead of the reset if
+ * the barrier is used in a loop (repeatedly) .
+ */
+static inline void task_barrier_exit(struct task_barrier *tb)
+{
+ if (atomic_dec_return(&tb->count) == 0)
+ task_barrier_signal_turnstile(&tb->exit_turnstile, tb->n);
+
+ down(&tb->exit_turnstile);
+}
+
+static inline void task_barrier_full(struct task_barrier *tb)
+{
+ task_barrier_enter(tb);
+ task_barrier_exit(tb);
+}
+
+#endif
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive.
2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-11 20:38 ` Andrey Grodzovsky
-1 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
To: dri-devel, amd-gfx; +Cc: Alexander.Deucher, Le.Ma, Evan.Quan, hawking.zhang
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++++
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++
2 files changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 61d13d8..5cf920d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -261,6 +261,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
INIT_LIST_HEAD(&tmp->device_list);
mutex_init(&tmp->hive_lock);
mutex_init(&tmp->reset_lock);
+ task_barrier_init(&tmp->tb);
if (lock)
mutex_lock(&tmp->hive_lock);
@@ -408,6 +409,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
top_info->num_nodes = count;
hive->number_devices = count;
+ task_barrier_add_task(&hive->tb);
+
if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
/* update node list for other device in the hive */
@@ -470,6 +473,7 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
mutex_destroy(&hive->hive_lock);
mutex_destroy(&hive->reset_lock);
} else {
+ task_barrier_rem_task(&hive->tb);
amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
mutex_unlock(&hive->hive_lock);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index bbf504f..74011fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -22,6 +22,7 @@
#ifndef __AMDGPU_XGMI_H__
#define __AMDGPU_XGMI_H__
+#include <drm/task_barrier.h>
#include "amdgpu_psp.h"
struct amdgpu_hive_info {
@@ -33,6 +34,7 @@ struct amdgpu_hive_info {
struct device_attribute dev_attr;
struct amdgpu_device *adev;
int pstate; /*0 -- low , 1 -- high , -1 unknown*/
+ struct task_barrier tb;
};
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
--
2.7.4
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive.
@ 2019-12-11 20:38 ` Andrey Grodzovsky
0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
To: dri-devel, amd-gfx
Cc: Alexander.Deucher, Le.Ma, Evan.Quan, Andrey Grodzovsky, hawking.zhang
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++++
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++
2 files changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 61d13d8..5cf920d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -261,6 +261,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
INIT_LIST_HEAD(&tmp->device_list);
mutex_init(&tmp->hive_lock);
mutex_init(&tmp->reset_lock);
+ task_barrier_init(&tmp->tb);
if (lock)
mutex_lock(&tmp->hive_lock);
@@ -408,6 +409,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
top_info->num_nodes = count;
hive->number_devices = count;
+ task_barrier_add_task(&hive->tb);
+
if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
/* update node list for other device in the hive */
@@ -470,6 +473,7 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
mutex_destroy(&hive->hive_lock);
mutex_destroy(&hive->reset_lock);
} else {
+ task_barrier_rem_task(&hive->tb);
amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
mutex_unlock(&hive->hive_lock);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index bbf504f..74011fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -22,6 +22,7 @@
#ifndef __AMDGPU_XGMI_H__
#define __AMDGPU_XGMI_H__
+#include <drm/task_barrier.h>
#include "amdgpu_psp.h"
struct amdgpu_hive_info {
@@ -33,6 +34,7 @@ struct amdgpu_hive_info {
struct device_attribute dev_attr;
struct amdgpu_device *adev;
int pstate; /*0 -- low , 1 -- high , -1 unknown*/
+ struct task_barrier tb;
};
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-11 20:38 ` Andrey Grodzovsky
-1 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
To: dri-devel, amd-gfx; +Cc: Alexander.Deucher, Le.Ma, Evan.Quan, hawking.zhang
Use task barrier in XGMI hive to synchronize ASIC resets
across devices in XGMI hive.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42 +++++++++++++++++++++++++-----
1 file changed, 36 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1d19edfa..e4089a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -67,6 +67,7 @@
#include "amdgpu_tmz.h"
#include <linux/suspend.h>
+#include <drm/task_barrier.h>
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -2663,14 +2664,43 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
struct amdgpu_device *adev =
container_of(__work, struct amdgpu_device, xgmi_reset_work);
+ struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
- if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
- adev->asic_reset_res = (adev->in_baco == false) ?
- amdgpu_device_baco_enter(adev->ddev) :
- qamdgpu_device_baco_exit(adev->ddev);
- else
- adev->asic_reset_res = amdgpu_asic_reset(adev);
+ /*
+ * Use task barrier to synchronize all xgmi reset works across the
+ * hive.
+ * task_barrier_enter and task_barrier_exit will block untill all the
+ * threads running the xgmi reset works reach those points. I assume
+ * guarantee of progress here for all the threads as the workqueue code
+ * creates new worker threads as needed by amount of work items in queue
+ * (see worker_thread) and also each thread sleeps in the barrir and by
+ * this yielding the CPU for other work threads to make progress.
+ */
+ if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
+
+ if (hive)
+ task_barrier_enter(&hive->tb);
+
+ adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
+
+ if (adev->asic_reset_res)
+ goto fail;
+
+ if (hive)
+ task_barrier_exit(&hive->tb);
+
+ adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
+
+ if (adev->asic_reset_res)
+ goto fail;
+ } else {
+ if (hive)
+ task_barrier_full(&hive->tb);
+
+ adev->asic_reset_res = amdgpu_asic_reset(adev);
+ }
+fail:
if (adev->asic_reset_res)
DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
adev->asic_reset_res, adev->ddev->unique);
--
2.7.4
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
@ 2019-12-11 20:38 ` Andrey Grodzovsky
0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
To: dri-devel, amd-gfx
Cc: Alexander.Deucher, Le.Ma, Evan.Quan, Andrey Grodzovsky, hawking.zhang
Use task barrier in XGMI hive to synchronize ASIC resets
across devices in XGMI hive.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42 +++++++++++++++++++++++++-----
1 file changed, 36 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1d19edfa..e4089a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -67,6 +67,7 @@
#include "amdgpu_tmz.h"
#include <linux/suspend.h>
+#include <drm/task_barrier.h>
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -2663,14 +2664,43 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
struct amdgpu_device *adev =
container_of(__work, struct amdgpu_device, xgmi_reset_work);
+ struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
- if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
- adev->asic_reset_res = (adev->in_baco == false) ?
- amdgpu_device_baco_enter(adev->ddev) :
- qamdgpu_device_baco_exit(adev->ddev);
- else
- adev->asic_reset_res = amdgpu_asic_reset(adev);
+ /*
+ * Use task barrier to synchronize all xgmi reset works across the
+ * hive.
+ * task_barrier_enter and task_barrier_exit will block untill all the
+ * threads running the xgmi reset works reach those points. I assume
+ * guarantee of progress here for all the threads as the workqueue code
+ * creates new worker threads as needed by amount of work items in queue
+ * (see worker_thread) and also each thread sleeps in the barrir and by
+ * this yielding the CPU for other work threads to make progress.
+ */
+ if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
+
+ if (hive)
+ task_barrier_enter(&hive->tb);
+
+ adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
+
+ if (adev->asic_reset_res)
+ goto fail;
+
+ if (hive)
+ task_barrier_exit(&hive->tb);
+
+ adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
+
+ if (adev->asic_reset_res)
+ goto fail;
+ } else {
+ if (hive)
+ task_barrier_full(&hive->tb);
+
+ adev->asic_reset_res = amdgpu_asic_reset(adev);
+ }
+fail:
if (adev->asic_reset_res)
DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
adev->asic_reset_res, adev->ddev->unique);
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq
2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-11 20:38 ` Andrey Grodzovsky
-1 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
To: dri-devel, amd-gfx; +Cc: Alexander.Deucher, Le.Ma, Evan.Quan, hawking.zhang
This is to avoid queueing jobs to same CPU during XGMI hive reset
because there is a strict timeline for when the reset commands
must reach all the GPUs in the hive.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e4089a0..1518565 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3842,7 +3842,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
/* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+ if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
r = -EALREADY;
} else
r = amdgpu_asic_reset(tmp_adev);
--
2.7.4
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq
@ 2019-12-11 20:38 ` Andrey Grodzovsky
0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
To: dri-devel, amd-gfx
Cc: Alexander.Deucher, Le.Ma, Evan.Quan, Andrey Grodzovsky, hawking.zhang
This is to avoid queueing jobs to same CPU during XGMI hive reset
because there is a strict timeline for when the reset commands
must reach all the GPUs in the hive.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e4089a0..1518565 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3842,7 +3842,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
/* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+ if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
r = -EALREADY;
} else
r = amdgpu_asic_reset(tmp_adev);
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 24+ messages in thread
* RE: [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.
2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-12 4:04 ` Ma, Le
-1 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12 4:04 UTC (permalink / raw)
To: Grodzovsky, Andrey, dri-devel, amd-gfx
Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking
[-- Attachment #1.1: Type: text/plain, Size: 10090 bytes --]
[AMD Official Use Only - Internal Distribution Only]
-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.
In preparation for doing XGMI reset synchronization using task barrier.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 +++++-------------------------
2 files changed, 12 insertions(+), 66 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a78a363..50bab33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1001,8 +1001,6 @@ struct amdgpu_device {
bool pm_sysfs_en;
bool ucode_sysfs_en;
-
- bool in_baco;
};
static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7324a5f..1d19edfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2667,7 +2667,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
adev->asic_reset_res = (adev->in_baco == false) ?
amdgpu_device_baco_enter(adev->ddev) :
- amdgpu_device_baco_exit(adev->ddev);
+ qamdgpu_device_baco_exit(adev->ddev);
[Le]: Typo here. With it fixed, Reviewed-by: Le Ma <Le.Ma@amd.com<mailto:Le.Ma@amd.com>>
Regards,
Ma Le
else
adev->asic_reset_res = amdgpu_asic_reset(adev);
@@ -3796,18 +3796,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
return r;
}
-static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
- struct amdgpu_hive_info *hive,
+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
struct list_head *device_list_handle,
bool *need_full_reset_arg)
{
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset = *need_full_reset_arg, vram_lost = false;
int r = 0;
- int cpu = smp_processor_id();
- bool use_baco =
- (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
- true : false;
/*
* ASIC reset has to be done on all HGMI hive nodes ASAP @@ -3815,62 +3810,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
*/
if (need_full_reset) {
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
- /*
- * For XGMI run all resets in parallel to speed up the
- * process by scheduling the highpri wq on different
- * cpus. For XGMI with baco reset, all nodes must enter
- * baco within close proximity before anyone exit.
- */
+ /* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work_on(cpu, system_highpri_wq,
- &tmp_adev->xgmi_reset_work))
+ if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
r = -EALREADY;
- cpu = cpumask_next(cpu, cpu_online_mask);
} else
r = amdgpu_asic_reset(tmp_adev);
- if (r)
- break;
- }
-
- /* For XGMI wait for all work to complete before proceed */
- if (!r) {
- list_for_each_entry(tmp_adev, device_list_handle,
- gmc.xgmi.head) {
- if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- flush_work(&tmp_adev->xgmi_reset_work);
- r = tmp_adev->asic_reset_res;
- if (r)
- break;
- if (use_baco)
- tmp_adev->in_baco = true;
- }
- }
- }
- /*
- * For XGMI with baco reset, need exit baco phase by scheduling
- * xgmi_reset_work one more time. PSP reset and sGPU skips this
- * phase. Not assume the situation that PSP reset and baco reset
- * coexist within an XGMI hive.
- */
-
- if (!r && use_baco) {
- cpu = smp_processor_id();
- list_for_each_entry(tmp_adev, device_list_handle,
- gmc.xgmi.head) {
- if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work_on(cpu,
- system_highpri_wq,
- &tmp_adev->xgmi_reset_work))
- r = -EALREADY;
- if (r)
- break;
- cpu = cpumask_next(cpu, cpu_online_mask);
- }
+ if (r) {
+ DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+ r, tmp_adev->ddev->unique);
+ break;
}
}
- if (!r && use_baco) {
+ /* For XGMI wait for all PSP resets to complete before proceed */
+ if (!r) {
list_for_each_entry(tmp_adev, device_list_handle,
gmc.xgmi.head) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { @@ -3878,21 +3833,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
r = tmp_adev->asic_reset_res;
if (r)
break;
- tmp_adev->in_baco = false;
}
}
}
-
- if (r) {
- DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
- r, tmp_adev->ddev->unique);
- goto end;
- }
}
if (!r && amdgpu_ras_intr_triggered())
amdgpu_ras_intr_cleared();
+
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (need_full_reset) {
/* post card */
@@ -4181,8 +4130,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (r)
adev->asic_reset_res = r;
} else {
- r = amdgpu_do_asic_reset(adev, hive, device_list_handle,
- &need_full_reset);
+ r = amdgpu_do_asic_reset(hive, device_list_handle,
+&need_full_reset);
if (r && r == -EAGAIN)
goto retry;
}
--
2.7.4
[-- Attachment #1.2: Type: text/html, Size: 40275 bytes --]
[-- Attachment #2: Type: text/plain, Size: 160 bytes --]
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.
@ 2019-12-12 4:04 ` Ma, Le
0 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12 4:04 UTC (permalink / raw)
To: Grodzovsky, Andrey, dri-devel, amd-gfx
Cc: Deucher, Alexander, Grodzovsky, Andrey, Quan, Evan, Zhang, Hawking
[-- Attachment #1.1: Type: text/plain, Size: 10090 bytes --]
[AMD Official Use Only - Internal Distribution Only]
-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.
In preparation for doing XGMI reset synchronization using task barrier.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 +++++-------------------------
2 files changed, 12 insertions(+), 66 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a78a363..50bab33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1001,8 +1001,6 @@ struct amdgpu_device {
bool pm_sysfs_en;
bool ucode_sysfs_en;
-
- bool in_baco;
};
static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7324a5f..1d19edfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2667,7 +2667,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
adev->asic_reset_res = (adev->in_baco == false) ?
amdgpu_device_baco_enter(adev->ddev) :
- amdgpu_device_baco_exit(adev->ddev);
+ qamdgpu_device_baco_exit(adev->ddev);
[Le]: Typo here. With it fixed, Reviewed-by: Le Ma <Le.Ma@amd.com<mailto:Le.Ma@amd.com>>
Regards,
Ma Le
else
adev->asic_reset_res = amdgpu_asic_reset(adev);
@@ -3796,18 +3796,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
return r;
}
-static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
- struct amdgpu_hive_info *hive,
+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
struct list_head *device_list_handle,
bool *need_full_reset_arg)
{
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset = *need_full_reset_arg, vram_lost = false;
int r = 0;
- int cpu = smp_processor_id();
- bool use_baco =
- (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
- true : false;
/*
* ASIC reset has to be done on all HGMI hive nodes ASAP @@ -3815,62 +3810,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
*/
if (need_full_reset) {
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
- /*
- * For XGMI run all resets in parallel to speed up the
- * process by scheduling the highpri wq on different
- * cpus. For XGMI with baco reset, all nodes must enter
- * baco within close proximity before anyone exit.
- */
+ /* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work_on(cpu, system_highpri_wq,
- &tmp_adev->xgmi_reset_work))
+ if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
r = -EALREADY;
- cpu = cpumask_next(cpu, cpu_online_mask);
} else
r = amdgpu_asic_reset(tmp_adev);
- if (r)
- break;
- }
-
- /* For XGMI wait for all work to complete before proceed */
- if (!r) {
- list_for_each_entry(tmp_adev, device_list_handle,
- gmc.xgmi.head) {
- if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- flush_work(&tmp_adev->xgmi_reset_work);
- r = tmp_adev->asic_reset_res;
- if (r)
- break;
- if (use_baco)
- tmp_adev->in_baco = true;
- }
- }
- }
- /*
- * For XGMI with baco reset, need exit baco phase by scheduling
- * xgmi_reset_work one more time. PSP reset and sGPU skips this
- * phase. Not assume the situation that PSP reset and baco reset
- * coexist within an XGMI hive.
- */
-
- if (!r && use_baco) {
- cpu = smp_processor_id();
- list_for_each_entry(tmp_adev, device_list_handle,
- gmc.xgmi.head) {
- if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work_on(cpu,
- system_highpri_wq,
- &tmp_adev->xgmi_reset_work))
- r = -EALREADY;
- if (r)
- break;
- cpu = cpumask_next(cpu, cpu_online_mask);
- }
+ if (r) {
+ DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+ r, tmp_adev->ddev->unique);
+ break;
}
}
- if (!r && use_baco) {
+ /* For XGMI wait for all PSP resets to complete before proceed */
+ if (!r) {
list_for_each_entry(tmp_adev, device_list_handle,
gmc.xgmi.head) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { @@ -3878,21 +3833,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
r = tmp_adev->asic_reset_res;
if (r)
break;
- tmp_adev->in_baco = false;
}
}
}
-
- if (r) {
- DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
- r, tmp_adev->ddev->unique);
- goto end;
- }
}
if (!r && amdgpu_ras_intr_triggered())
amdgpu_ras_intr_cleared();
+
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (need_full_reset) {
/* post card */
@@ -4181,8 +4130,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (r)
adev->asic_reset_res = r;
} else {
- r = amdgpu_do_asic_reset(adev, hive, device_list_handle,
- &need_full_reset);
+ r = amdgpu_do_asic_reset(hive, device_list_handle,
+&need_full_reset);
if (r && r == -EAGAIN)
goto retry;
}
--
2.7.4
[-- Attachment #1.2: Type: text/html, Size: 40275 bytes --]
[-- Attachment #2: Type: text/plain, Size: 154 bytes --]
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-12 4:04 ` Ma, Le
-1 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12 4:04 UTC (permalink / raw)
To: Grodzovsky, Andrey, dri-devel, amd-gfx
Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking
[-- Attachment #1.1: Type: text/plain, Size: 5221 bytes --]
[AMD Official Use Only - Internal Distribution Only]
-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
It is used to synchronize N threads at a rendevouz point before execution of critical code that has to be started by all the threads at approximatly the same time.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com<mailto:andrey.grodzovsky@amd.com>>
---
include/drm/task_barrier.h | 106 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 106 insertions(+)
create mode 100644 include/drm/task_barrier.h
diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h new file mode 100644 index 0000000..81fb0f7
--- /dev/null
+++ b/include/drm/task_barrier.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+obtaining a
+ * copy of this software and associated documentation files (the
+"Software"),
+ * to deal in the Software without restriction, including without
+limitation
+ * the rights to use, copy, modify, merge, publish, distribute,
+sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom
+the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
+DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include <linux/semaphore.h>
+#include <linux/atomic.h>
+
+/*
+ * Reusable 2 PHASE task barrier (randevouz point) implementation for N tasks.
+ * Based on the Little book of sempahores -
+https://greenteapress.com/wp/semaphores/
+ */
+
+
+
+#ifndef DRM_TASK_BARRIER_H_
+#define DRM_TASK_BARRIER_H_
+
[Le]: It might be better to prefix "drm_" to the functions and structure below, even this header file name.
+/*
+ * Represents an instance of a task barrier.
+ */
+struct task_barrier {
+ unsigned int n;
[Le]: We can define it as signed type here for more common use.
+ atomic_t count;
+ struct semaphore enter_turnstile;
+ struct semaphore exit_turnstile;
+};
+
+static inline void task_barrier_signal_turnstile(struct semaphore *turnstile,
+ unsigned int n)
+{
+ int i;
+
+ for (i = 0 ; i < n; i++)
+ up(turnstile);
+}
+
+static inline void task_barrier_init(struct task_barrier *tb) {
+ tb->n = 0;
+ atomic_set(&tb->count, 0);
+ sema_init(&tb->enter_turnstile, 0);
+ sema_init(&tb->exit_turnstile, 0);
+}
+
+static inline void task_barrier_add_task(struct task_barrier *tb) {
+ tb->n++;
+}
+
+static inline void task_barrier_rem_task(struct task_barrier *tb) {
+ tb->n--;
+}
+
+/*
+ * Lines up all the threads BEFORE the critical point.
+ *
+ * When all thread passed this code the entry barrier is back to locked state.
+ */
+static inline void task_barrier_enter(struct task_barrier *tb) {
+ if (atomic_inc_return(&tb->count) == tb->n)
+ task_barrier_signal_turnstile(&tb->enter_turnstile, tb->n);
+
+ down(&tb->enter_turnstile);
+}
+
+/*
+ * Lines up all the threads AFTER the critical point.
+ *
+ * This function is used to avoid any one thread running ahead of the
+reset if
[Le]: No need to mention "reset" here.
With the above addressed, Acked-by: Le Ma Le.Ma@amd.com<mailto:Le.Ma@amd.com>
Regards,
Ma Le
+ * the barrier is used in a loop (repeatedly) .
+ */
+static inline void task_barrier_exit(struct task_barrier *tb) {
+ if (atomic_dec_return(&tb->count) == 0)
+ task_barrier_signal_turnstile(&tb->exit_turnstile, tb->n);
+
+ down(&tb->exit_turnstile);
+}
+
+static inline void task_barrier_full(struct task_barrier *tb) {
+ task_barrier_enter(tb);
+ task_barrier_exit(tb);
+}
+
+#endif
--
2.7.4
[-- Attachment #1.2: Type: text/html, Size: 16222 bytes --]
[-- Attachment #2: Type: text/plain, Size: 160 bytes --]
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
@ 2019-12-12 4:04 ` Ma, Le
0 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12 4:04 UTC (permalink / raw)
To: Grodzovsky, Andrey, dri-devel, amd-gfx
Cc: Deucher, Alexander, Grodzovsky, Andrey, Quan, Evan, Zhang, Hawking
[-- Attachment #1.1: Type: text/plain, Size: 5221 bytes --]
[AMD Official Use Only - Internal Distribution Only]
-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
It is used to synchronize N threads at a rendevouz point before execution of critical code that has to be started by all the threads at approximatly the same time.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com<mailto:andrey.grodzovsky@amd.com>>
---
include/drm/task_barrier.h | 106 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 106 insertions(+)
create mode 100644 include/drm/task_barrier.h
diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h new file mode 100644 index 0000000..81fb0f7
--- /dev/null
+++ b/include/drm/task_barrier.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+obtaining a
+ * copy of this software and associated documentation files (the
+"Software"),
+ * to deal in the Software without restriction, including without
+limitation
+ * the rights to use, copy, modify, merge, publish, distribute,
+sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom
+the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
+DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include <linux/semaphore.h>
+#include <linux/atomic.h>
+
+/*
+ * Reusable 2 PHASE task barrier (randevouz point) implementation for N tasks.
+ * Based on the Little book of sempahores -
+https://greenteapress.com/wp/semaphores/
+ */
+
+
+
+#ifndef DRM_TASK_BARRIER_H_
+#define DRM_TASK_BARRIER_H_
+
[Le]: It might be better to prefix "drm_" to the functions and structure below, even this header file name.
+/*
+ * Represents an instance of a task barrier.
+ */
+struct task_barrier {
+ unsigned int n;
[Le]: We can define it as signed type here for more common use.
+ atomic_t count;
+ struct semaphore enter_turnstile;
+ struct semaphore exit_turnstile;
+};
+
+static inline void task_barrier_signal_turnstile(struct semaphore *turnstile,
+ unsigned int n)
+{
+ int i;
+
+ for (i = 0 ; i < n; i++)
+ up(turnstile);
+}
+
+static inline void task_barrier_init(struct task_barrier *tb) {
+ tb->n = 0;
+ atomic_set(&tb->count, 0);
+ sema_init(&tb->enter_turnstile, 0);
+ sema_init(&tb->exit_turnstile, 0);
+}
+
+static inline void task_barrier_add_task(struct task_barrier *tb) {
+ tb->n++;
+}
+
+static inline void task_barrier_rem_task(struct task_barrier *tb) {
+ tb->n--;
+}
+
+/*
+ * Lines up all the threads BEFORE the critical point.
+ *
+ * When all thread passed this code the entry barrier is back to locked state.
+ */
+static inline void task_barrier_enter(struct task_barrier *tb) {
+ if (atomic_inc_return(&tb->count) == tb->n)
+ task_barrier_signal_turnstile(&tb->enter_turnstile, tb->n);
+
+ down(&tb->enter_turnstile);
+}
+
+/*
+ * Lines up all the threads AFTER the critical point.
+ *
+ * This function is used to avoid any one thread running ahead of the
+reset if
[Le]: No need to mention "reset" here.
With the above addressed, Acked-by: Le Ma Le.Ma@amd.com<mailto:Le.Ma@amd.com>
Regards,
Ma Le
+ * the barrier is used in a loop (repeatedly) .
+ */
+static inline void task_barrier_exit(struct task_barrier *tb) {
+ if (atomic_dec_return(&tb->count) == 0)
+ task_barrier_signal_turnstile(&tb->exit_turnstile, tb->n);
+
+ down(&tb->exit_turnstile);
+}
+
+static inline void task_barrier_full(struct task_barrier *tb) {
+ task_barrier_enter(tb);
+ task_barrier_exit(tb);
+}
+
+#endif
--
2.7.4
[-- Attachment #1.2: Type: text/html, Size: 16222 bytes --]
[-- Attachment #2: Type: text/plain, Size: 154 bytes --]
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive.
2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-12 4:04 ` Ma, Le
-1 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12 4:04 UTC (permalink / raw)
To: Grodzovsky, Andrey, dri-devel, amd-gfx
Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking
[AMD Official Use Only - Internal Distribution Only]
Reviewed-by: Le Ma <Le.Ma@amd.com>
Regards,
Ma Le
-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++
2 files changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 61d13d8..5cf920d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -261,6 +261,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
INIT_LIST_HEAD(&tmp->device_list);
mutex_init(&tmp->hive_lock);
mutex_init(&tmp->reset_lock);
+ task_barrier_init(&tmp->tb);
if (lock)
mutex_lock(&tmp->hive_lock);
@@ -408,6 +409,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
top_info->num_nodes = count;
hive->number_devices = count;
+ task_barrier_add_task(&hive->tb);
+
if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
/* update node list for other device in the hive */ @@ -470,6 +473,7 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
mutex_destroy(&hive->hive_lock);
mutex_destroy(&hive->reset_lock);
} else {
+ task_barrier_rem_task(&hive->tb);
amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
mutex_unlock(&hive->hive_lock);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index bbf504f..74011fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -22,6 +22,7 @@
#ifndef __AMDGPU_XGMI_H__
#define __AMDGPU_XGMI_H__
+#include <drm/task_barrier.h>
#include "amdgpu_psp.h"
struct amdgpu_hive_info {
@@ -33,6 +34,7 @@ struct amdgpu_hive_info {
struct device_attribute dev_attr;
struct amdgpu_device *adev;
int pstate; /*0 -- low , 1 -- high , -1 unknown*/
+ struct task_barrier tb;
};
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
--
2.7.4
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply related [flat|nested] 24+ messages in thread
* RE: [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive.
@ 2019-12-12 4:04 ` Ma, Le
0 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12 4:04 UTC (permalink / raw)
To: Grodzovsky, Andrey, dri-devel, amd-gfx
Cc: Deucher, Alexander, Grodzovsky, Andrey, Quan, Evan, Zhang, Hawking
[AMD Official Use Only - Internal Distribution Only]
Reviewed-by: Le Ma <Le.Ma@amd.com>
Regards,
Ma Le
-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++
2 files changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 61d13d8..5cf920d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -261,6 +261,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
INIT_LIST_HEAD(&tmp->device_list);
mutex_init(&tmp->hive_lock);
mutex_init(&tmp->reset_lock);
+ task_barrier_init(&tmp->tb);
if (lock)
mutex_lock(&tmp->hive_lock);
@@ -408,6 +409,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
top_info->num_nodes = count;
hive->number_devices = count;
+ task_barrier_add_task(&hive->tb);
+
if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
/* update node list for other device in the hive */ @@ -470,6 +473,7 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
mutex_destroy(&hive->hive_lock);
mutex_destroy(&hive->reset_lock);
} else {
+ task_barrier_rem_task(&hive->tb);
amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
mutex_unlock(&hive->hive_lock);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index bbf504f..74011fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -22,6 +22,7 @@
#ifndef __AMDGPU_XGMI_H__
#define __AMDGPU_XGMI_H__
+#include <drm/task_barrier.h>
#include "amdgpu_psp.h"
struct amdgpu_hive_info {
@@ -33,6 +34,7 @@ struct amdgpu_hive_info {
struct device_attribute dev_attr;
struct amdgpu_device *adev;
int pstate; /*0 -- low , 1 -- high , -1 unknown*/
+ struct task_barrier tb;
};
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 24+ messages in thread
* RE: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-12 4:05 ` Ma, Le
-1 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12 4:05 UTC (permalink / raw)
To: Grodzovsky, Andrey, dri-devel, amd-gfx
Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking
[-- Attachment #1.1: Type: text/plain, Size: 4235 bytes --]
[AMD Official Use Only - Internal Distribution Only]
-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
Use task barrier in XGMI hive to synchronize ASIC resets across devices in XGMI hive.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com<mailto:andrey.grodzovsky@amd.com>>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42 +++++++++++++++++++++++++-----
1 file changed, 36 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1d19edfa..e4089a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -67,6 +67,7 @@
#include "amdgpu_tmz.h"
#include <linux/suspend.h>
+#include <drm/task_barrier.h>
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -2663,14 +2664,43 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) {
struct amdgpu_device *adev =
container_of(__work, struct amdgpu_device, xgmi_reset_work);
+ struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
- if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
- adev->asic_reset_res = (adev->in_baco == false) ?
- amdgpu_device_baco_enter(adev->ddev) :
- qamdgpu_device_baco_exit(adev->ddev);
- else
- adev->asic_reset_res = amdgpu_asic_reset(adev);
+ /*
+ * Use task barrier to synchronize all xgmi reset works across the
+ * hive.
+ * task_barrier_enter and task_barrier_exit will block untill all the
+ * threads running the xgmi reset works reach those points. I assume
+ * guarantee of progress here for all the threads as the workqueue code
+ * creates new worker threads as needed by amount of work items in queue
+ * (see worker_thread) and also each thread sleeps in the barrir and by
+ * this yielding the CPU for other work threads to make progress.
+ */
[Le]: This comments can be adjusted since we switch to system_unbound_wq in patch #5.
+ if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
+
+ if (hive)
+ task_barrier_enter(&hive->tb);
[Le]: The multiple hive condition can be checked only once and moved to the location right after the assignment.
+
+ adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
+
+ if (adev->asic_reset_res)
+ goto fail;
+
+ if (hive)
+ task_barrier_exit(&hive->tb);
[Le]: Same as above.
+
+ adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
+
+ if (adev->asic_reset_res)
+ goto fail;
+ } else {
+ if (hive)
+ task_barrier_full(&hive->tb);
[Le]: Same as above.
With above addressed, Reviewed-by: Le Ma <Le.Ma@amd.com<mailto:Le.Ma@amd.com>>
Regards,
Ma Le
+
+ adev->asic_reset_res = amdgpu_asic_reset(adev);
+ }
+fail:
if (adev->asic_reset_res)
DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
adev->asic_reset_res, adev->ddev->unique);
--
2.7.4
[-- Attachment #1.2: Type: text/html, Size: 14702 bytes --]
[-- Attachment #2: Type: text/plain, Size: 160 bytes --]
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
@ 2019-12-12 4:05 ` Ma, Le
0 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12 4:05 UTC (permalink / raw)
To: Grodzovsky, Andrey, dri-devel, amd-gfx
Cc: Deucher, Alexander, Grodzovsky, Andrey, Quan, Evan, Zhang, Hawking
[-- Attachment #1.1: Type: text/plain, Size: 4235 bytes --]
[AMD Official Use Only - Internal Distribution Only]
-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
Use task barrier in XGMI hive to synchronize ASIC resets across devices in XGMI hive.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com<mailto:andrey.grodzovsky@amd.com>>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42 +++++++++++++++++++++++++-----
1 file changed, 36 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1d19edfa..e4089a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -67,6 +67,7 @@
#include "amdgpu_tmz.h"
#include <linux/suspend.h>
+#include <drm/task_barrier.h>
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -2663,14 +2664,43 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) {
struct amdgpu_device *adev =
container_of(__work, struct amdgpu_device, xgmi_reset_work);
+ struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
- if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
- adev->asic_reset_res = (adev->in_baco == false) ?
- amdgpu_device_baco_enter(adev->ddev) :
- qamdgpu_device_baco_exit(adev->ddev);
- else
- adev->asic_reset_res = amdgpu_asic_reset(adev);
+ /*
+ * Use task barrier to synchronize all xgmi reset works across the
+ * hive.
+ * task_barrier_enter and task_barrier_exit will block untill all the
+ * threads running the xgmi reset works reach those points. I assume
+ * guarantee of progress here for all the threads as the workqueue code
+ * creates new worker threads as needed by amount of work items in queue
+ * (see worker_thread) and also each thread sleeps in the barrir and by
+ * this yielding the CPU for other work threads to make progress.
+ */
[Le]: This comments can be adjusted since we switch to system_unbound_wq in patch #5.
+ if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
+
+ if (hive)
+ task_barrier_enter(&hive->tb);
[Le]: The multiple hive condition can be checked only once and moved to the location right after the assignment.
+
+ adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
+
+ if (adev->asic_reset_res)
+ goto fail;
+
+ if (hive)
+ task_barrier_exit(&hive->tb);
[Le]: Same as above.
+
+ adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
+
+ if (adev->asic_reset_res)
+ goto fail;
+ } else {
+ if (hive)
+ task_barrier_full(&hive->tb);
[Le]: Same as above.
With above addressed, Reviewed-by: Le Ma <Le.Ma@amd.com<mailto:Le.Ma@amd.com>>
Regards,
Ma Le
+
+ adev->asic_reset_res = amdgpu_asic_reset(adev);
+ }
+fail:
if (adev->asic_reset_res)
DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
adev->asic_reset_res, adev->ddev->unique);
--
2.7.4
[-- Attachment #1.2: Type: text/html, Size: 14702 bytes --]
[-- Attachment #2: Type: text/plain, Size: 154 bytes --]
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq
2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-12 4:05 ` Ma, Le
-1 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12 4:05 UTC (permalink / raw)
To: Grodzovsky, Andrey, dri-devel, amd-gfx
Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking
[AMD Official Use Only - Internal Distribution Only]
Reviewed-by: Le Ma <Le.Ma@amd.com>
Regards,
Ma Le
-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq
This is to avoid queueing jobs to same CPU during XGMI hive reset because there is a strict timeline for when the reset commands must reach all the GPUs in the hive.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e4089a0..1518565 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3842,7 +3842,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
/* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+ if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
r = -EALREADY;
} else
r = amdgpu_asic_reset(tmp_adev);
--
2.7.4
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply related [flat|nested] 24+ messages in thread
* RE: [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq
@ 2019-12-12 4:05 ` Ma, Le
0 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12 4:05 UTC (permalink / raw)
To: Grodzovsky, Andrey, dri-devel, amd-gfx
Cc: Deucher, Alexander, Grodzovsky, Andrey, Quan, Evan, Zhang, Hawking
[AMD Official Use Only - Internal Distribution Only]
Reviewed-by: Le Ma <Le.Ma@amd.com>
Regards,
Ma Le
-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq
This is to avoid queueing jobs to same CPU during XGMI hive reset because there is a strict timeline for when the reset commands must reach all the GPUs in the hive.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e4089a0..1518565 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3842,7 +3842,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
/* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+ if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
r = -EALREADY;
} else
r = amdgpu_asic_reset(tmp_adev);
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
2019-12-12 4:04 ` Ma, Le
@ 2019-12-12 15:07 ` Andrey Grodzovsky
-1 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-12 15:07 UTC (permalink / raw)
To: Ma, Le, dri-devel, amd-gfx; +Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking
[-- Attachment #1.1: Type: text/plain, Size: 5816 bytes --]
On 12/11/19 11:04 PM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> -----Original Message-----
> From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> Sent: Thursday, December 12, 2019 4:39 AM
> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le
> <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan
> <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Subject: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
>
> It is used to synchronize N threads at a rendevouz point before
> execution of critical code that has to be started by all the threads
> at approximatly the same time.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com
> <mailto:andrey.grodzovsky@amd.com>>
>
> ---
>
> include/drm/task_barrier.h | 106
> +++++++++++++++++++++++++++++++++++++++++++++
>
> 1 file changed, 106 insertions(+)
>
> create mode 100644 include/drm/task_barrier.h
>
> diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h
> new file mode 100644 index 0000000..81fb0f7
>
> --- /dev/null
>
> +++ b/include/drm/task_barrier.h
>
> @@ -0,0 +1,106 @@
>
> +/*
>
> + * Copyright 2019 Advanced Micro Devices, Inc.
>
> + *
>
> + * Permission is hereby granted, free of charge, to any person
>
> +obtaining a
>
> + * copy of this software and associated documentation files (the
>
> +"Software"),
>
> + * to deal in the Software without restriction, including without
>
> +limitation
>
> + * the rights to use, copy, modify, merge, publish, distribute,
>
> +sublicense,
>
> + * and/or sell copies of the Software, and to permit persons to whom
>
> +the
>
> + * Software is furnished to do so, subject to the following conditions:
>
> + *
>
> + * The above copyright notice and this permission notice shall be
>
> +included in
>
> + * all copies or substantial portions of the Software.
>
> + *
>
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>
> +EXPRESS OR
>
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>
> +MERCHANTABILITY,
>
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
>
> +SHALL
>
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
>
> +DAMAGES OR
>
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
>
> +OTHERWISE,
>
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
>
> +OR
>
> + * OTHER DEALINGS IN THE SOFTWARE.
>
> + *
>
> + */
>
> +#include <linux/semaphore.h>
>
> +#include <linux/atomic.h>
>
> +
>
> +/*
>
> + * Reusable 2 PHASE task barrier (randevouz point) implementation for
> N tasks.
>
> + * Based on the Little book of sempahores -
>
> +https://greenteapress.com/wp/semaphores/
>
> + */
>
> +
>
> +
>
> +
>
> +#ifndef DRM_TASK_BARRIER_H_
>
> +#define DRM_TASK_BARRIER_H_
>
> +
>
> [Le]: It might be better to prefix “drm_” to the functions and
> structure below, even this header file name.
>
I am not sure about this - see the example of spsc_queue we added for
GPU scheduler use. I just followed it as an example of where to place
the structure. There is nothing DRM specific about spsc_queue or
task_barrier, they are generic constructs that we place in DRM subsystem
for common use.
> +/*
>
> + * Represents an instance of a task barrier.
>
> + */
>
> +struct task_barrier {
>
> + unsigned int n;
>
> [Le]: We can define it as signed type here for more common use.
>
This is a counter of number of tasks/threads to synchronize in the
barrier it cannot go bellow 0
Andrey
> + atomic_t count;
>
> + struct semaphore enter_turnstile;
>
> + struct semaphore exit_turnstile;
>
> +};
>
> +
>
> +static inline void task_barrier_signal_turnstile(struct semaphore
> *turnstile,
>
> + unsigned int n)
>
> +{
>
> + int i;
>
> +
>
> + for (i = 0 ; i < n; i++)
>
> + up(turnstile);
>
> +}
>
> +
>
> +static inline void task_barrier_init(struct task_barrier *tb) {
>
> + tb->n = 0;
>
> + atomic_set(&tb->count, 0);
>
> + sema_init(&tb->enter_turnstile, 0);
>
> + sema_init(&tb->exit_turnstile, 0);
>
> +}
>
> +
>
> +static inline void task_barrier_add_task(struct task_barrier *tb) {
>
> + tb->n++;
>
> +}
>
> +
>
> +static inline void task_barrier_rem_task(struct task_barrier *tb) {
>
> + tb->n--;
>
> +}
>
> +
>
> +/*
>
> + * Lines up all the threads BEFORE the critical point.
>
> + *
>
> + * When all thread passed this code the entry barrier is back to
> locked state.
>
> + */
>
> +static inline void task_barrier_enter(struct task_barrier *tb) {
>
> + if (atomic_inc_return(&tb->count) == tb->n)
>
> + task_barrier_signal_turnstile(&tb->enter_turnstile, tb->n);
>
> +
>
> + down(&tb->enter_turnstile);
>
> +}
>
> +
>
> +/*
>
> + * Lines up all the threads AFTER the critical point.
>
> + *
>
> + * This function is used to avoid any one thread running ahead of the
>
> +reset if
>
> [Le]: No need to mention “reset” here.
>
> With the above addressed, Acked-by: Le Ma Le.Ma@amd.com
> <mailto:Le.Ma@amd.com>
>
> Regards,
>
> Ma Le
>
> + * the barrier is used in a loop (repeatedly) .
>
> + */
>
> +static inline void task_barrier_exit(struct task_barrier *tb) {
>
> + if (atomic_dec_return(&tb->count) == 0)
>
> + task_barrier_signal_turnstile(&tb->exit_turnstile, tb->n);
>
> +
>
> + down(&tb->exit_turnstile);
>
> +}
>
> +
>
> +static inline void task_barrier_full(struct task_barrier *tb) {
>
> + task_barrier_enter(tb);
>
> + task_barrier_exit(tb);
>
> +}
>
> +
>
> +#endif
>
> --
>
> 2.7.4
>
[-- Attachment #1.2: Type: text/html, Size: 19692 bytes --]
[-- Attachment #2: Type: text/plain, Size: 160 bytes --]
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
@ 2019-12-12 15:07 ` Andrey Grodzovsky
0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-12 15:07 UTC (permalink / raw)
To: Ma, Le, dri-devel, amd-gfx; +Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking
[-- Attachment #1.1: Type: text/plain, Size: 5816 bytes --]
On 12/11/19 11:04 PM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> -----Original Message-----
> From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> Sent: Thursday, December 12, 2019 4:39 AM
> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le
> <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan
> <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Subject: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
>
> It is used to synchronize N threads at a rendevouz point before
> execution of critical code that has to be started by all the threads
> at approximatly the same time.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com
> <mailto:andrey.grodzovsky@amd.com>>
>
> ---
>
> include/drm/task_barrier.h | 106
> +++++++++++++++++++++++++++++++++++++++++++++
>
> 1 file changed, 106 insertions(+)
>
> create mode 100644 include/drm/task_barrier.h
>
> diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h
> new file mode 100644 index 0000000..81fb0f7
>
> --- /dev/null
>
> +++ b/include/drm/task_barrier.h
>
> @@ -0,0 +1,106 @@
>
> +/*
>
> + * Copyright 2019 Advanced Micro Devices, Inc.
>
> + *
>
> + * Permission is hereby granted, free of charge, to any person
>
> +obtaining a
>
> + * copy of this software and associated documentation files (the
>
> +"Software"),
>
> + * to deal in the Software without restriction, including without
>
> +limitation
>
> + * the rights to use, copy, modify, merge, publish, distribute,
>
> +sublicense,
>
> + * and/or sell copies of the Software, and to permit persons to whom
>
> +the
>
> + * Software is furnished to do so, subject to the following conditions:
>
> + *
>
> + * The above copyright notice and this permission notice shall be
>
> +included in
>
> + * all copies or substantial portions of the Software.
>
> + *
>
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>
> +EXPRESS OR
>
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>
> +MERCHANTABILITY,
>
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
>
> +SHALL
>
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
>
> +DAMAGES OR
>
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
>
> +OTHERWISE,
>
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
>
> +OR
>
> + * OTHER DEALINGS IN THE SOFTWARE.
>
> + *
>
> + */
>
> +#include <linux/semaphore.h>
>
> +#include <linux/atomic.h>
>
> +
>
> +/*
>
> + * Reusable 2 PHASE task barrier (randevouz point) implementation for
> N tasks.
>
> + * Based on the Little book of sempahores -
>
> +https://greenteapress.com/wp/semaphores/
>
> + */
>
> +
>
> +
>
> +
>
> +#ifndef DRM_TASK_BARRIER_H_
>
> +#define DRM_TASK_BARRIER_H_
>
> +
>
> [Le]: It might be better to prefix “drm_” to the functions and
> structure below, even this header file name.
>
I am not sure about this - see the example of spsc_queue we added for
GPU scheduler use. I just followed it as an example of where to place
the structure. There is nothing DRM specific about spsc_queue or
task_barrier, they are generic constructs that we place in DRM subsystem
for common use.
> +/*
>
> + * Represents an instance of a task barrier.
>
> + */
>
> +struct task_barrier {
>
> + unsigned int n;
>
> [Le]: We can define it as signed type here for more common use.
>
This is a counter of number of tasks/threads to synchronize in the
barrier it cannot go bellow 0
Andrey
> + atomic_t count;
>
> + struct semaphore enter_turnstile;
>
> + struct semaphore exit_turnstile;
>
> +};
>
> +
>
> +static inline void task_barrier_signal_turnstile(struct semaphore
> *turnstile,
>
> + unsigned int n)
>
> +{
>
> + int i;
>
> +
>
> + for (i = 0 ; i < n; i++)
>
> + up(turnstile);
>
> +}
>
> +
>
> +static inline void task_barrier_init(struct task_barrier *tb) {
>
> + tb->n = 0;
>
> + atomic_set(&tb->count, 0);
>
> + sema_init(&tb->enter_turnstile, 0);
>
> + sema_init(&tb->exit_turnstile, 0);
>
> +}
>
> +
>
> +static inline void task_barrier_add_task(struct task_barrier *tb) {
>
> + tb->n++;
>
> +}
>
> +
>
> +static inline void task_barrier_rem_task(struct task_barrier *tb) {
>
> + tb->n--;
>
> +}
>
> +
>
> +/*
>
> + * Lines up all the threads BEFORE the critical point.
>
> + *
>
> + * When all thread passed this code the entry barrier is back to
> locked state.
>
> + */
>
> +static inline void task_barrier_enter(struct task_barrier *tb) {
>
> + if (atomic_inc_return(&tb->count) == tb->n)
>
> + task_barrier_signal_turnstile(&tb->enter_turnstile, tb->n);
>
> +
>
> + down(&tb->enter_turnstile);
>
> +}
>
> +
>
> +/*
>
> + * Lines up all the threads AFTER the critical point.
>
> + *
>
> + * This function is used to avoid any one thread running ahead of the
>
> +reset if
>
> [Le]: No need to mention “reset” here.
>
> With the above addressed, Acked-by: Le Ma Le.Ma@amd.com
> <mailto:Le.Ma@amd.com>
>
> Regards,
>
> Ma Le
>
> + * the barrier is used in a loop (repeatedly) .
>
> + */
>
> +static inline void task_barrier_exit(struct task_barrier *tb) {
>
> + if (atomic_dec_return(&tb->count) == 0)
>
> + task_barrier_signal_turnstile(&tb->exit_turnstile, tb->n);
>
> +
>
> + down(&tb->exit_turnstile);
>
> +}
>
> +
>
> +static inline void task_barrier_full(struct task_barrier *tb) {
>
> + task_barrier_enter(tb);
>
> + task_barrier_exit(tb);
>
> +}
>
> +
>
> +#endif
>
> --
>
> 2.7.4
>
[-- Attachment #1.2: Type: text/html, Size: 19692 bytes --]
[-- Attachment #2: Type: text/plain, Size: 154 bytes --]
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
2019-12-12 4:05 ` Ma, Le
@ 2019-12-12 15:20 ` Andrey Grodzovsky
-1 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-12 15:20 UTC (permalink / raw)
To: Ma, Le, dri-devel, amd-gfx; +Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking
[-- Attachment #1.1: Type: text/plain, Size: 4659 bytes --]
On 12/11/19 11:05 PM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> -----Original Message-----
> From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> Sent: Thursday, December 12, 2019 4:39 AM
> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le
> <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan
> <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Subject: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset
> synchronization.
>
> Use task barrier in XGMI hive to synchronize ASIC resets across
> devices in XGMI hive.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com
> <mailto:andrey.grodzovsky@amd.com>>
>
> ---
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42
> +++++++++++++++++++++++++-----
>
> 1 file changed, 36 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> index 1d19edfa..e4089a0 100644
>
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> @@ -67,6 +67,7 @@
>
> #include "amdgpu_tmz.h"
>
> #include <linux/suspend.h>
>
> +#include <drm/task_barrier.h>
>
> MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>
> MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>
> @@ -2663,14 +2664,43 @@ static void
> amdgpu_device_xgmi_reset_func(struct work_struct *__work) {
>
> struct amdgpu_device *adev =
>
> container_of(__work, struct amdgpu_device, xgmi_reset_work);
>
> + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
>
> - if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>
> - adev->asic_reset_res = (adev->in_baco == false) ?
>
> - amdgpu_device_baco_enter(adev->ddev) :
>
> - qamdgpu_device_baco_exit(adev->ddev);
>
> - else
>
> - adev->asic_reset_res = amdgpu_asic_reset(adev);
>
> + /*
>
> + * Use task barrier to synchronize all xgmi reset works
> across the
>
> + * hive.
>
> + * task_barrier_enter and task_barrier_exit will block
> untill all the
>
> + * threads running the xgmi reset works reach those points.
> I assume
>
> + * guarantee of progress here for all the threads as the
> workqueue code
>
> + * creates new worker threads as needed by amount of work
> items in queue
>
> + * (see worker_thread) and also each thread sleeps in the
> barrir and by
>
> + * this yielding the CPU for other work threads to make
> progress.
>
> + */
>
> [Le]: This comments can be adjusted since we switch to
> system_unbound_wq in patch #5.
>
> + if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
>
> +
>
> + if (hive)
>
> + task_barrier_enter(&hive->tb);
>
> [Le]: The multiple hive condition can be checked only once and moved
> to the location right after the assignment.
>
Not sure what you meant here but in fact let's note that while in
amdgpu_device_xgmi_reset_func it's a bug for amdgpu_get_xgmi_hive to
return NULL so I think better instead to add WARN_ON(!hive,"...") and
return right at the beginning of the function if indeed hive == NULL
Andrey
> +
>
> + adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
>
> +
>
> + if (adev->asic_reset_res)
>
> + goto fail;
>
> +
>
> + if (hive)
>
> + task_barrier_exit(&hive->tb);
>
> [Le]: Same as above.
>
> +
>
> + adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
>
> +
>
> + if (adev->asic_reset_res)
>
> + goto fail;
>
> + } else {
>
> + if (hive)
>
> + task_barrier_full(&hive->tb);
>
> [Le]: Same as above.
>
> With above addressed, Reviewed-by: Le Ma <Le.Ma@amd.com
> <mailto:Le.Ma@amd.com>>
>
> Regards,
>
> Ma Le
>
> +
>
> + adev->asic_reset_res = amdgpu_asic_reset(adev);
>
> + }
>
> +fail:
>
> if (adev->asic_reset_res)
>
> DRM_WARN("ASIC reset failed with error, %d for
> drm dev, %s",
>
> adev->asic_reset_res, adev->ddev->unique);
>
> --
>
> 2.7.4
>
[-- Attachment #1.2: Type: text/html, Size: 17123 bytes --]
[-- Attachment #2: Type: text/plain, Size: 160 bytes --]
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
@ 2019-12-12 15:20 ` Andrey Grodzovsky
0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-12 15:20 UTC (permalink / raw)
To: Ma, Le, dri-devel, amd-gfx; +Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking
[-- Attachment #1.1: Type: text/plain, Size: 4659 bytes --]
On 12/11/19 11:05 PM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> -----Original Message-----
> From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> Sent: Thursday, December 12, 2019 4:39 AM
> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le
> <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan
> <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Subject: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset
> synchronization.
>
> Use task barrier in XGMI hive to synchronize ASIC resets across
> devices in XGMI hive.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com
> <mailto:andrey.grodzovsky@amd.com>>
>
> ---
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42
> +++++++++++++++++++++++++-----
>
> 1 file changed, 36 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> index 1d19edfa..e4089a0 100644
>
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> @@ -67,6 +67,7 @@
>
> #include "amdgpu_tmz.h"
>
> #include <linux/suspend.h>
>
> +#include <drm/task_barrier.h>
>
> MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>
> MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>
> @@ -2663,14 +2664,43 @@ static void
> amdgpu_device_xgmi_reset_func(struct work_struct *__work) {
>
> struct amdgpu_device *adev =
>
> container_of(__work, struct amdgpu_device, xgmi_reset_work);
>
> + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
>
> - if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>
> - adev->asic_reset_res = (adev->in_baco == false) ?
>
> - amdgpu_device_baco_enter(adev->ddev) :
>
> - qamdgpu_device_baco_exit(adev->ddev);
>
> - else
>
> - adev->asic_reset_res = amdgpu_asic_reset(adev);
>
> + /*
>
> + * Use task barrier to synchronize all xgmi reset works
> across the
>
> + * hive.
>
> + * task_barrier_enter and task_barrier_exit will block
> untill all the
>
> + * threads running the xgmi reset works reach those points.
> I assume
>
> + * guarantee of progress here for all the threads as the
> workqueue code
>
> + * creates new worker threads as needed by amount of work
> items in queue
>
> + * (see worker_thread) and also each thread sleeps in the
> barrir and by
>
> + * this yielding the CPU for other work threads to make
> progress.
>
> + */
>
> [Le]: This comments can be adjusted since we switch to
> system_unbound_wq in patch #5.
>
> + if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
>
> +
>
> + if (hive)
>
> + task_barrier_enter(&hive->tb);
>
> [Le]: The multiple hive condition can be checked only once and moved
> to the location right after the assignment.
>
Not sure what you meant here but in fact let's note that while in
amdgpu_device_xgmi_reset_func it's a bug for amdgpu_get_xgmi_hive to
return NULL so I think better instead to add WARN_ON(!hive,"...") and
return right at the beginning of the function if indeed hive == NULL
Andrey
> +
>
> + adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
>
> +
>
> + if (adev->asic_reset_res)
>
> + goto fail;
>
> +
>
> + if (hive)
>
> + task_barrier_exit(&hive->tb);
>
> [Le]: Same as above.
>
> +
>
> + adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
>
> +
>
> + if (adev->asic_reset_res)
>
> + goto fail;
>
> + } else {
>
> + if (hive)
>
> + task_barrier_full(&hive->tb);
>
> [Le]: Same as above.
>
> With above addressed, Reviewed-by: Le Ma <Le.Ma@amd.com
> <mailto:Le.Ma@amd.com>>
>
> Regards,
>
> Ma Le
>
> +
>
> + adev->asic_reset_res = amdgpu_asic_reset(adev);
>
> + }
>
> +fail:
>
> if (adev->asic_reset_res)
>
> DRM_WARN("ASIC reset failed with error, %d for
> drm dev, %s",
>
> adev->asic_reset_res, adev->ddev->unique);
>
> --
>
> 2.7.4
>
[-- Attachment #1.2: Type: text/html, Size: 17123 bytes --]
[-- Attachment #2: Type: text/plain, Size: 154 bytes --]
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2019-12-12 15:20 UTC | newest]
Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-11 20:38 [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d Andrey Grodzovsky
2019-12-11 20:38 ` Andrey Grodzovsky
2019-12-11 20:38 ` [RESEND PATCH 2/5] drm: Add Reusable task barrier Andrey Grodzovsky
2019-12-11 20:38 ` Andrey Grodzovsky
2019-12-12 4:04 ` Ma, Le
2019-12-12 4:04 ` Ma, Le
2019-12-12 15:07 ` Andrey Grodzovsky
2019-12-12 15:07 ` Andrey Grodzovsky
2019-12-11 20:38 ` [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive Andrey Grodzovsky
2019-12-11 20:38 ` Andrey Grodzovsky
2019-12-12 4:04 ` Ma, Le
2019-12-12 4:04 ` Ma, Le
2019-12-11 20:38 ` [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization Andrey Grodzovsky
2019-12-11 20:38 ` Andrey Grodzovsky
2019-12-12 4:05 ` Ma, Le
2019-12-12 4:05 ` Ma, Le
2019-12-12 15:20 ` Andrey Grodzovsky
2019-12-12 15:20 ` Andrey Grodzovsky
2019-12-11 20:38 ` [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq Andrey Grodzovsky
2019-12-11 20:38 ` Andrey Grodzovsky
2019-12-12 4:05 ` Ma, Le
2019-12-12 4:05 ` Ma, Le
2019-12-12 4:04 ` [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d Ma, Le
2019-12-12 4:04 ` Ma, Le
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.