All of lore.kernel.org
 help / color / mirror / Atom feed
* [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.
@ 2019-12-11 20:38 ` Andrey Grodzovsky
  0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
  To: dri-devel, amd-gfx; +Cc: Alexander.Deucher, Le.Ma, Evan.Quan, hawking.zhang

In preparation for doing XGMI reset synchronization using task barrier.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 +++++-------------------------
 2 files changed, 12 insertions(+), 66 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a78a363..50bab33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1001,8 +1001,6 @@ struct amdgpu_device {
 
 	bool                            pm_sysfs_en;
 	bool                            ucode_sysfs_en;
-
-	bool				in_baco;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7324a5f..1d19edfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2667,7 +2667,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
 		adev->asic_reset_res = (adev->in_baco == false) ?
 				amdgpu_device_baco_enter(adev->ddev) :
-				amdgpu_device_baco_exit(adev->ddev);
+				qamdgpu_device_baco_exit(adev->ddev);
 	else
 		adev->asic_reset_res = amdgpu_asic_reset(adev);
 
@@ -3796,18 +3796,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	return r;
 }
 
-static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
-			       struct amdgpu_hive_info *hive,
+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 			       struct list_head *device_list_handle,
 			       bool *need_full_reset_arg)
 {
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
 	int r = 0;
-	int cpu = smp_processor_id();
-	bool use_baco =
-		(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
-		true : false;
 
 	/*
 	 * ASIC reset has to be done on all HGMI hive nodes ASAP
@@ -3815,62 +3810,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
 	 */
 	if (need_full_reset) {
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-			/*
-			 * For XGMI run all resets in parallel to speed up the
-			 * process by scheduling the highpri wq on different
-			 * cpus. For XGMI with baco reset, all nodes must enter
-			 * baco within close proximity before anyone exit.
-			 */
+			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-				if (!queue_work_on(cpu, system_highpri_wq,
-						   &tmp_adev->xgmi_reset_work))
+				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
-				cpu = cpumask_next(cpu, cpu_online_mask);
 			} else
 				r = amdgpu_asic_reset(tmp_adev);
-			if (r)
-				break;
-		}
-
-		/* For XGMI wait for all work to complete before proceed */
-		if (!r) {
-			list_for_each_entry(tmp_adev, device_list_handle,
-					    gmc.xgmi.head) {
-				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-					flush_work(&tmp_adev->xgmi_reset_work);
-					r = tmp_adev->asic_reset_res;
-					if (r)
-						break;
-					if (use_baco)
-						tmp_adev->in_baco = true;
-				}
-			}
-		}
 
-		/*
-		 * For XGMI with baco reset, need exit baco phase by scheduling
-		 * xgmi_reset_work one more time. PSP reset and sGPU skips this
-		 * phase. Not assume the situation that PSP reset and baco reset
-		 * coexist within an XGMI hive.
-		 */
-
-		if (!r && use_baco) {
-			cpu = smp_processor_id();
-			list_for_each_entry(tmp_adev, device_list_handle,
-					    gmc.xgmi.head) {
-				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-					if (!queue_work_on(cpu,
-						system_highpri_wq,
-						&tmp_adev->xgmi_reset_work))
-						r = -EALREADY;
-					if (r)
-						break;
-					cpu = cpumask_next(cpu, cpu_online_mask);
-				}
+			if (r) {
+				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+					 r, tmp_adev->ddev->unique);
+				break;
 			}
 		}
 
-		if (!r && use_baco) {
+		/* For XGMI wait for all PSP resets to complete before proceed */
+		if (!r) {
 			list_for_each_entry(tmp_adev, device_list_handle,
 					    gmc.xgmi.head) {
 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
@@ -3878,21 +3833,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
 					r = tmp_adev->asic_reset_res;
 					if (r)
 						break;
-					tmp_adev->in_baco = false;
 				}
 			}
 		}
-
-		if (r) {
-			DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
-				 r, tmp_adev->ddev->unique);
-			goto end;
-		}
 	}
 
 	if (!r && amdgpu_ras_intr_triggered())
 		amdgpu_ras_intr_cleared();
 
+
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (need_full_reset) {
 			/* post card */
@@ -4181,8 +4130,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		if (r)
 			adev->asic_reset_res = r;
 	} else {
-		r  = amdgpu_do_asic_reset(adev, hive, device_list_handle,
-					  &need_full_reset);
+		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
 		if (r && r == -EAGAIN)
 			goto retry;
 	}
-- 
2.7.4

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.
@ 2019-12-11 20:38 ` Andrey Grodzovsky
  0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
  To: dri-devel, amd-gfx
  Cc: Alexander.Deucher, Le.Ma, Evan.Quan, Andrey Grodzovsky, hawking.zhang

In preparation for doing XGMI reset synchronization using task barrier.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 +++++-------------------------
 2 files changed, 12 insertions(+), 66 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a78a363..50bab33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1001,8 +1001,6 @@ struct amdgpu_device {
 
 	bool                            pm_sysfs_en;
 	bool                            ucode_sysfs_en;
-
-	bool				in_baco;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7324a5f..1d19edfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2667,7 +2667,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
 		adev->asic_reset_res = (adev->in_baco == false) ?
 				amdgpu_device_baco_enter(adev->ddev) :
-				amdgpu_device_baco_exit(adev->ddev);
+				qamdgpu_device_baco_exit(adev->ddev);
 	else
 		adev->asic_reset_res = amdgpu_asic_reset(adev);
 
@@ -3796,18 +3796,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	return r;
 }
 
-static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
-			       struct amdgpu_hive_info *hive,
+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 			       struct list_head *device_list_handle,
 			       bool *need_full_reset_arg)
 {
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
 	int r = 0;
-	int cpu = smp_processor_id();
-	bool use_baco =
-		(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
-		true : false;
 
 	/*
 	 * ASIC reset has to be done on all HGMI hive nodes ASAP
@@ -3815,62 +3810,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
 	 */
 	if (need_full_reset) {
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-			/*
-			 * For XGMI run all resets in parallel to speed up the
-			 * process by scheduling the highpri wq on different
-			 * cpus. For XGMI with baco reset, all nodes must enter
-			 * baco within close proximity before anyone exit.
-			 */
+			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-				if (!queue_work_on(cpu, system_highpri_wq,
-						   &tmp_adev->xgmi_reset_work))
+				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
-				cpu = cpumask_next(cpu, cpu_online_mask);
 			} else
 				r = amdgpu_asic_reset(tmp_adev);
-			if (r)
-				break;
-		}
-
-		/* For XGMI wait for all work to complete before proceed */
-		if (!r) {
-			list_for_each_entry(tmp_adev, device_list_handle,
-					    gmc.xgmi.head) {
-				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-					flush_work(&tmp_adev->xgmi_reset_work);
-					r = tmp_adev->asic_reset_res;
-					if (r)
-						break;
-					if (use_baco)
-						tmp_adev->in_baco = true;
-				}
-			}
-		}
 
-		/*
-		 * For XGMI with baco reset, need exit baco phase by scheduling
-		 * xgmi_reset_work one more time. PSP reset and sGPU skips this
-		 * phase. Not assume the situation that PSP reset and baco reset
-		 * coexist within an XGMI hive.
-		 */
-
-		if (!r && use_baco) {
-			cpu = smp_processor_id();
-			list_for_each_entry(tmp_adev, device_list_handle,
-					    gmc.xgmi.head) {
-				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-					if (!queue_work_on(cpu,
-						system_highpri_wq,
-						&tmp_adev->xgmi_reset_work))
-						r = -EALREADY;
-					if (r)
-						break;
-					cpu = cpumask_next(cpu, cpu_online_mask);
-				}
+			if (r) {
+				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+					 r, tmp_adev->ddev->unique);
+				break;
 			}
 		}
 
-		if (!r && use_baco) {
+		/* For XGMI wait for all PSP resets to complete before proceed */
+		if (!r) {
 			list_for_each_entry(tmp_adev, device_list_handle,
 					    gmc.xgmi.head) {
 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
@@ -3878,21 +3833,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
 					r = tmp_adev->asic_reset_res;
 					if (r)
 						break;
-					tmp_adev->in_baco = false;
 				}
 			}
 		}
-
-		if (r) {
-			DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
-				 r, tmp_adev->ddev->unique);
-			goto end;
-		}
 	}
 
 	if (!r && amdgpu_ras_intr_triggered())
 		amdgpu_ras_intr_cleared();
 
+
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (need_full_reset) {
 			/* post card */
@@ -4181,8 +4130,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		if (r)
 			adev->asic_reset_res = r;
 	} else {
-		r  = amdgpu_do_asic_reset(adev, hive, device_list_handle,
-					  &need_full_reset);
+		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
 		if (r && r == -EAGAIN)
 			goto retry;
 	}
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [RESEND PATCH 2/5] drm: Add Reusable task barrier.
  2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-11 20:38   ` Andrey Grodzovsky
  -1 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
  To: dri-devel, amd-gfx; +Cc: Alexander.Deucher, Le.Ma, Evan.Quan, hawking.zhang

It is used to synchronize N threads at a rendevouz point before execution
of critical code that has to be started by all the threads at approximatly
the same time.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 include/drm/task_barrier.h | 106 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 include/drm/task_barrier.h

diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h
new file mode 100644
index 0000000..81fb0f7
--- /dev/null
+++ b/include/drm/task_barrier.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include <linux/semaphore.h>
+#include <linux/atomic.h>
+
+/*
+ * Reusable 2 PHASE task barrier (randevouz point) implementation for N tasks.
+ * Based on the Little book of sempahores - https://greenteapress.com/wp/semaphores/
+ */
+
+
+
+#ifndef DRM_TASK_BARRIER_H_
+#define DRM_TASK_BARRIER_H_
+
+/*
+ * Represents an instance of a task barrier.
+ */
+struct task_barrier {
+	unsigned int n;
+	atomic_t count;
+	struct semaphore enter_turnstile;
+	struct semaphore exit_turnstile;
+};
+
+static inline void task_barrier_signal_turnstile(struct semaphore *turnstile,
+						 unsigned int n)
+{
+	int i;
+
+	for (i = 0 ; i < n; i++)
+		up(turnstile);
+}
+
+static inline void task_barrier_init(struct task_barrier *tb)
+{
+	tb->n = 0;
+	atomic_set(&tb->count, 0);
+	sema_init(&tb->enter_turnstile, 0);
+	sema_init(&tb->exit_turnstile, 0);
+}
+
+static inline void task_barrier_add_task(struct task_barrier *tb)
+{
+	tb->n++;
+}
+
+static inline void task_barrier_rem_task(struct task_barrier *tb)
+{
+	tb->n--;
+}
+
+/*
+ * Lines up all the threads BEFORE the critical point.
+ *
+ * When all thread passed this code the entry barrier is back to locked state.
+ */
+static inline void task_barrier_enter(struct task_barrier *tb)
+{
+	if (atomic_inc_return(&tb->count) == tb->n)
+		task_barrier_signal_turnstile(&tb->enter_turnstile, tb->n);
+
+	down(&tb->enter_turnstile);
+}
+
+/*
+ * Lines up all the threads AFTER the critical point.
+ *
+ * This function is used to avoid any one thread running ahead of the reset if
+ * the barrier is used in a loop (repeatedly) .
+ */
+static inline void task_barrier_exit(struct task_barrier *tb)
+{
+	if (atomic_dec_return(&tb->count) == 0)
+		task_barrier_signal_turnstile(&tb->exit_turnstile, tb->n);
+
+	down(&tb->exit_turnstile);
+}
+
+static inline void task_barrier_full(struct task_barrier *tb)
+{
+	task_barrier_enter(tb);
+	task_barrier_exit(tb);
+}
+
+#endif
-- 
2.7.4

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [RESEND PATCH 2/5] drm: Add Reusable task barrier.
@ 2019-12-11 20:38   ` Andrey Grodzovsky
  0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
  To: dri-devel, amd-gfx
  Cc: Alexander.Deucher, Le.Ma, Evan.Quan, Andrey Grodzovsky, hawking.zhang

It is used to synchronize N threads at a rendevouz point before execution
of critical code that has to be started by all the threads at approximatly
the same time.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 include/drm/task_barrier.h | 106 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 include/drm/task_barrier.h

diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h
new file mode 100644
index 0000000..81fb0f7
--- /dev/null
+++ b/include/drm/task_barrier.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include <linux/semaphore.h>
+#include <linux/atomic.h>
+
+/*
+ * Reusable 2 PHASE task barrier (randevouz point) implementation for N tasks.
+ * Based on the Little book of sempahores - https://greenteapress.com/wp/semaphores/
+ */
+
+
+
+#ifndef DRM_TASK_BARRIER_H_
+#define DRM_TASK_BARRIER_H_
+
+/*
+ * Represents an instance of a task barrier.
+ */
+struct task_barrier {
+	unsigned int n;
+	atomic_t count;
+	struct semaphore enter_turnstile;
+	struct semaphore exit_turnstile;
+};
+
+static inline void task_barrier_signal_turnstile(struct semaphore *turnstile,
+						 unsigned int n)
+{
+	int i;
+
+	for (i = 0 ; i < n; i++)
+		up(turnstile);
+}
+
+static inline void task_barrier_init(struct task_barrier *tb)
+{
+	tb->n = 0;
+	atomic_set(&tb->count, 0);
+	sema_init(&tb->enter_turnstile, 0);
+	sema_init(&tb->exit_turnstile, 0);
+}
+
+static inline void task_barrier_add_task(struct task_barrier *tb)
+{
+	tb->n++;
+}
+
+static inline void task_barrier_rem_task(struct task_barrier *tb)
+{
+	tb->n--;
+}
+
+/*
+ * Lines up all the threads BEFORE the critical point.
+ *
+ * When all thread passed this code the entry barrier is back to locked state.
+ */
+static inline void task_barrier_enter(struct task_barrier *tb)
+{
+	if (atomic_inc_return(&tb->count) == tb->n)
+		task_barrier_signal_turnstile(&tb->enter_turnstile, tb->n);
+
+	down(&tb->enter_turnstile);
+}
+
+/*
+ * Lines up all the threads AFTER the critical point.
+ *
+ * This function is used to avoid any one thread running ahead of the reset if
+ * the barrier is used in a loop (repeatedly) .
+ */
+static inline void task_barrier_exit(struct task_barrier *tb)
+{
+	if (atomic_dec_return(&tb->count) == 0)
+		task_barrier_signal_turnstile(&tb->exit_turnstile, tb->n);
+
+	down(&tb->exit_turnstile);
+}
+
+static inline void task_barrier_full(struct task_barrier *tb)
+{
+	task_barrier_enter(tb);
+	task_barrier_exit(tb);
+}
+
+#endif
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive.
  2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-11 20:38   ` Andrey Grodzovsky
  -1 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
  To: dri-devel, amd-gfx; +Cc: Alexander.Deucher, Le.Ma, Evan.Quan, hawking.zhang

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 61d13d8..5cf920d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -261,6 +261,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 	INIT_LIST_HEAD(&tmp->device_list);
 	mutex_init(&tmp->hive_lock);
 	mutex_init(&tmp->reset_lock);
+	task_barrier_init(&tmp->tb);
 
 	if (lock)
 		mutex_lock(&tmp->hive_lock);
@@ -408,6 +409,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 	top_info->num_nodes = count;
 	hive->number_devices = count;
 
+	task_barrier_add_task(&hive->tb);
+
 	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			/* update node list for other device in the hive */
@@ -470,6 +473,7 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 		mutex_destroy(&hive->hive_lock);
 		mutex_destroy(&hive->reset_lock);
 	} else {
+		task_barrier_rem_task(&hive->tb);
 		amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
 		mutex_unlock(&hive->hive_lock);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index bbf504f..74011fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -22,6 +22,7 @@
 #ifndef __AMDGPU_XGMI_H__
 #define __AMDGPU_XGMI_H__
 
+#include <drm/task_barrier.h>
 #include "amdgpu_psp.h"
 
 struct amdgpu_hive_info {
@@ -33,6 +34,7 @@ struct amdgpu_hive_info {
 	struct device_attribute dev_attr;
 	struct amdgpu_device *adev;
 	int pstate; /*0 -- low , 1 -- high , -1 unknown*/
+	struct task_barrier tb;
 };
 
 struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
-- 
2.7.4

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive.
@ 2019-12-11 20:38   ` Andrey Grodzovsky
  0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
  To: dri-devel, amd-gfx
  Cc: Alexander.Deucher, Le.Ma, Evan.Quan, Andrey Grodzovsky, hawking.zhang

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 61d13d8..5cf920d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -261,6 +261,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 	INIT_LIST_HEAD(&tmp->device_list);
 	mutex_init(&tmp->hive_lock);
 	mutex_init(&tmp->reset_lock);
+	task_barrier_init(&tmp->tb);
 
 	if (lock)
 		mutex_lock(&tmp->hive_lock);
@@ -408,6 +409,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 	top_info->num_nodes = count;
 	hive->number_devices = count;
 
+	task_barrier_add_task(&hive->tb);
+
 	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			/* update node list for other device in the hive */
@@ -470,6 +473,7 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 		mutex_destroy(&hive->hive_lock);
 		mutex_destroy(&hive->reset_lock);
 	} else {
+		task_barrier_rem_task(&hive->tb);
 		amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
 		mutex_unlock(&hive->hive_lock);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index bbf504f..74011fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -22,6 +22,7 @@
 #ifndef __AMDGPU_XGMI_H__
 #define __AMDGPU_XGMI_H__
 
+#include <drm/task_barrier.h>
 #include "amdgpu_psp.h"
 
 struct amdgpu_hive_info {
@@ -33,6 +34,7 @@ struct amdgpu_hive_info {
 	struct device_attribute dev_attr;
 	struct amdgpu_device *adev;
 	int pstate; /*0 -- low , 1 -- high , -1 unknown*/
+	struct task_barrier tb;
 };
 
 struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
  2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-11 20:38   ` Andrey Grodzovsky
  -1 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
  To: dri-devel, amd-gfx; +Cc: Alexander.Deucher, Le.Ma, Evan.Quan, hawking.zhang

Use task barrier in XGMI hive to synchronize ASIC resets
across devices in XGMI hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42 +++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1d19edfa..e4089a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -67,6 +67,7 @@
 #include "amdgpu_tmz.h"
 
 #include <linux/suspend.h>
+#include <drm/task_barrier.h>
 
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -2663,14 +2664,43 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 {
 	struct amdgpu_device *adev =
 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
 
-	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
-		adev->asic_reset_res = (adev->in_baco == false) ?
-				amdgpu_device_baco_enter(adev->ddev) :
-				qamdgpu_device_baco_exit(adev->ddev);
-	else
-		adev->asic_reset_res = amdgpu_asic_reset(adev);
+	/*
+	 * Use task barrier to synchronize all xgmi reset works across the
+	 * hive.
+	 * task_barrier_enter and task_barrier_exit will block untill all the
+	 * threads running the xgmi reset works reach those points. I assume
+	 * guarantee of progress here for all the threads as the workqueue code
+	 * creates new worker threads as needed by amount of work items in queue
+	 * (see worker_thread) and also each thread sleeps in the barrir and by
+	 * this yielding the CPU for other work threads to make progress.
+	 */
+	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
+
+		if (hive)
+			task_barrier_enter(&hive->tb);
+
+		adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
+
+		if (adev->asic_reset_res)
+			goto fail;
+
+		if (hive)
+			task_barrier_exit(&hive->tb);
+
+		adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
+
+		if (adev->asic_reset_res)
+			goto fail;
+	} else {
+		if (hive)
+			task_barrier_full(&hive->tb);
+
+		adev->asic_reset_res =  amdgpu_asic_reset(adev);
+	}
 
+fail:
 	if (adev->asic_reset_res)
 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 			 adev->asic_reset_res, adev->ddev->unique);
-- 
2.7.4

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
@ 2019-12-11 20:38   ` Andrey Grodzovsky
  0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
  To: dri-devel, amd-gfx
  Cc: Alexander.Deucher, Le.Ma, Evan.Quan, Andrey Grodzovsky, hawking.zhang

Use task barrier in XGMI hive to synchronize ASIC resets
across devices in XGMI hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42 +++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1d19edfa..e4089a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -67,6 +67,7 @@
 #include "amdgpu_tmz.h"
 
 #include <linux/suspend.h>
+#include <drm/task_barrier.h>
 
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -2663,14 +2664,43 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 {
 	struct amdgpu_device *adev =
 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
 
-	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
-		adev->asic_reset_res = (adev->in_baco == false) ?
-				amdgpu_device_baco_enter(adev->ddev) :
-				qamdgpu_device_baco_exit(adev->ddev);
-	else
-		adev->asic_reset_res = amdgpu_asic_reset(adev);
+	/*
+	 * Use task barrier to synchronize all xgmi reset works across the
+	 * hive.
+	 * task_barrier_enter and task_barrier_exit will block untill all the
+	 * threads running the xgmi reset works reach those points. I assume
+	 * guarantee of progress here for all the threads as the workqueue code
+	 * creates new worker threads as needed by amount of work items in queue
+	 * (see worker_thread) and also each thread sleeps in the barrir and by
+	 * this yielding the CPU for other work threads to make progress.
+	 */
+	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
+
+		if (hive)
+			task_barrier_enter(&hive->tb);
+
+		adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
+
+		if (adev->asic_reset_res)
+			goto fail;
+
+		if (hive)
+			task_barrier_exit(&hive->tb);
+
+		adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
+
+		if (adev->asic_reset_res)
+			goto fail;
+	} else {
+		if (hive)
+			task_barrier_full(&hive->tb);
+
+		adev->asic_reset_res =  amdgpu_asic_reset(adev);
+	}
 
+fail:
 	if (adev->asic_reset_res)
 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 			 adev->asic_reset_res, adev->ddev->unique);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq
  2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-11 20:38   ` Andrey Grodzovsky
  -1 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
  To: dri-devel, amd-gfx; +Cc: Alexander.Deucher, Le.Ma, Evan.Quan, hawking.zhang

This is to avoid queueing jobs to same CPU during XGMI hive reset
because there is a strict timeline for when the reset commands
must reach all the GPUs in the hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e4089a0..1518565 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3842,7 +3842,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
 			} else
 				r = amdgpu_asic_reset(tmp_adev);
-- 
2.7.4

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq
@ 2019-12-11 20:38   ` Andrey Grodzovsky
  0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-11 20:38 UTC (permalink / raw)
  To: dri-devel, amd-gfx
  Cc: Alexander.Deucher, Le.Ma, Evan.Quan, Andrey Grodzovsky, hawking.zhang

This is to avoid queueing jobs to same CPU during XGMI hive reset
because there is a strict timeline for when the reset commands
must reach all the GPUs in the hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e4089a0..1518565 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3842,7 +3842,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
 			} else
 				r = amdgpu_asic_reset(tmp_adev);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* RE: [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.
  2019-12-11 20:38 ` Andrey Grodzovsky
@ 2019-12-12  4:04   ` Ma, Le
  -1 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12  4:04 UTC (permalink / raw)
  To: Grodzovsky, Andrey, dri-devel, amd-gfx
  Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 10090 bytes --]

[AMD Official Use Only - Internal Distribution Only]




-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.



In preparation for doing XGMI reset synchronization using task barrier.



Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

---

drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 -

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 +++++-------------------------

2 files changed, 12 insertions(+), 66 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index a78a363..50bab33 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

@@ -1001,8 +1001,6 @@ struct amdgpu_device {



            bool                            pm_sysfs_en;

           bool                            ucode_sysfs_en;

-

-           bool                                        in_baco;

};



 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 7324a5f..1d19edfa 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

@@ -2667,7 +2667,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

           if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

                       adev->asic_reset_res = (adev->in_baco == false) ?

                                               amdgpu_device_baco_enter(adev->ddev) :

-                                               amdgpu_device_baco_exit(adev->ddev);

+                                              qamdgpu_device_baco_exit(adev->ddev);

[Le]: Typo here. With it fixed, Reviewed-by: Le Ma <Le.Ma@amd.com<mailto:Le.Ma@amd.com>>



Regards,

Ma Le

           else

                       adev->asic_reset_res = amdgpu_asic_reset(adev);



@@ -3796,18 +3796,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,

           return r;

}



-static int amdgpu_do_asic_reset(struct amdgpu_device *adev,

-                                          struct amdgpu_hive_info *hive,

+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

                                          struct list_head *device_list_handle,

                                          bool *need_full_reset_arg)

{

           struct amdgpu_device *tmp_adev = NULL;

           bool need_full_reset = *need_full_reset_arg, vram_lost = false;

           int r = 0;

-           int cpu = smp_processor_id();

-           bool use_baco =

-                       (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?

-                       true : false;



            /*

            * ASIC reset has to be done on all HGMI hive nodes ASAP @@ -3815,62 +3810,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,

            */

           if (need_full_reset) {

                       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

-                                   /*

-                                   * For XGMI run all resets in parallel to speed up the

-                                   * process by scheduling the highpri wq on different

-                                   * cpus. For XGMI with baco reset, all nodes must enter

-                                   * baco within close proximity before anyone exit.

-                                   */

+                                  /* For XGMI run all resets in parallel to speed up the process */

                                   if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {

-                                               if (!queue_work_on(cpu, system_highpri_wq,

-                                                                          &tmp_adev->xgmi_reset_work))

+                                              if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))

                                                           r = -EALREADY;

-                                               cpu = cpumask_next(cpu, cpu_online_mask);

                                   } else

                                               r = amdgpu_asic_reset(tmp_adev);

-                                   if (r)

-                                               break;

-                       }

-

-                       /* For XGMI wait for all work to complete before proceed */

-                       if (!r) {

-                                   list_for_each_entry(tmp_adev, device_list_handle,

-                                                               gmc.xgmi.head) {

-                                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {

-                                                           flush_work(&tmp_adev->xgmi_reset_work);

-                                                           r = tmp_adev->asic_reset_res;

-                                                           if (r)

-                                                                       break;

-                                                           if (use_baco)

-                                                                       tmp_adev->in_baco = true;

-                                               }

-                                   }

-                       }



-                       /*

-                       * For XGMI with baco reset, need exit baco phase by scheduling

-                       * xgmi_reset_work one more time. PSP reset and sGPU skips this

-                       * phase. Not assume the situation that PSP reset and baco reset

-                       * coexist within an XGMI hive.

-                       */

-

-                       if (!r && use_baco) {

-                                   cpu = smp_processor_id();

-                                   list_for_each_entry(tmp_adev, device_list_handle,

-                                                               gmc.xgmi.head) {

-                                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {

-                                                           if (!queue_work_on(cpu,

-                                                                       system_highpri_wq,

-                                                                       &tmp_adev->xgmi_reset_work))

-                                                                       r = -EALREADY;

-                                                           if (r)

-                                                                       break;

-                                                           cpu = cpumask_next(cpu, cpu_online_mask);

-                                               }

+                                  if (r) {

+                                              DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

+                                                          r, tmp_adev->ddev->unique);

+                                              break;

                                   }

                       }



-                       if (!r && use_baco) {

+                      /* For XGMI wait for all PSP resets to complete before proceed */

+                      if (!r) {

                                   list_for_each_entry(tmp_adev, device_list_handle,

                                                               gmc.xgmi.head) {

                                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { @@ -3878,21 +3833,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,

                                                           r = tmp_adev->asic_reset_res;

                                                           if (r)

                                                                       break;

-                                                           tmp_adev->in_baco = false;

                                               }

                                   }

                       }

-

-                       if (r) {

-                                   DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

-                                               r, tmp_adev->ddev->unique);

-                                   goto end;

-                       }

           }



            if (!r && amdgpu_ras_intr_triggered())

                       amdgpu_ras_intr_cleared();



+

           list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

                       if (need_full_reset) {

                                   /* post card */

@@ -4181,8 +4130,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

                       if (r)

                                   adev->asic_reset_res = r;

           } else {

-                       r  = amdgpu_do_asic_reset(adev, hive, device_list_handle,

-                                                             &need_full_reset);

+                      r  = amdgpu_do_asic_reset(hive, device_list_handle,

+&need_full_reset);

                       if (r && r == -EAGAIN)

                                   goto retry;

           }

--

2.7.4



[-- Attachment #1.2: Type: text/html, Size: 40275 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.
@ 2019-12-12  4:04   ` Ma, Le
  0 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12  4:04 UTC (permalink / raw)
  To: Grodzovsky, Andrey, dri-devel, amd-gfx
  Cc: Deucher, Alexander, Grodzovsky, Andrey, Quan, Evan, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 10090 bytes --]

[AMD Official Use Only - Internal Distribution Only]




-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.



In preparation for doing XGMI reset synchronization using task barrier.



Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

---

drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 -

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 +++++-------------------------

2 files changed, 12 insertions(+), 66 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index a78a363..50bab33 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

@@ -1001,8 +1001,6 @@ struct amdgpu_device {



            bool                            pm_sysfs_en;

           bool                            ucode_sysfs_en;

-

-           bool                                        in_baco;

};



 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 7324a5f..1d19edfa 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

@@ -2667,7 +2667,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

           if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

                       adev->asic_reset_res = (adev->in_baco == false) ?

                                               amdgpu_device_baco_enter(adev->ddev) :

-                                               amdgpu_device_baco_exit(adev->ddev);

+                                              qamdgpu_device_baco_exit(adev->ddev);

[Le]: Typo here. With it fixed, Reviewed-by: Le Ma <Le.Ma@amd.com<mailto:Le.Ma@amd.com>>



Regards,

Ma Le

           else

                       adev->asic_reset_res = amdgpu_asic_reset(adev);



@@ -3796,18 +3796,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,

           return r;

}



-static int amdgpu_do_asic_reset(struct amdgpu_device *adev,

-                                          struct amdgpu_hive_info *hive,

+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

                                          struct list_head *device_list_handle,

                                          bool *need_full_reset_arg)

{

           struct amdgpu_device *tmp_adev = NULL;

           bool need_full_reset = *need_full_reset_arg, vram_lost = false;

           int r = 0;

-           int cpu = smp_processor_id();

-           bool use_baco =

-                       (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?

-                       true : false;



            /*

            * ASIC reset has to be done on all HGMI hive nodes ASAP @@ -3815,62 +3810,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,

            */

           if (need_full_reset) {

                       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

-                                   /*

-                                   * For XGMI run all resets in parallel to speed up the

-                                   * process by scheduling the highpri wq on different

-                                   * cpus. For XGMI with baco reset, all nodes must enter

-                                   * baco within close proximity before anyone exit.

-                                   */

+                                  /* For XGMI run all resets in parallel to speed up the process */

                                   if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {

-                                               if (!queue_work_on(cpu, system_highpri_wq,

-                                                                          &tmp_adev->xgmi_reset_work))

+                                              if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))

                                                           r = -EALREADY;

-                                               cpu = cpumask_next(cpu, cpu_online_mask);

                                   } else

                                               r = amdgpu_asic_reset(tmp_adev);

-                                   if (r)

-                                               break;

-                       }

-

-                       /* For XGMI wait for all work to complete before proceed */

-                       if (!r) {

-                                   list_for_each_entry(tmp_adev, device_list_handle,

-                                                               gmc.xgmi.head) {

-                                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {

-                                                           flush_work(&tmp_adev->xgmi_reset_work);

-                                                           r = tmp_adev->asic_reset_res;

-                                                           if (r)

-                                                                       break;

-                                                           if (use_baco)

-                                                                       tmp_adev->in_baco = true;

-                                               }

-                                   }

-                       }



-                       /*

-                       * For XGMI with baco reset, need exit baco phase by scheduling

-                       * xgmi_reset_work one more time. PSP reset and sGPU skips this

-                       * phase. Not assume the situation that PSP reset and baco reset

-                       * coexist within an XGMI hive.

-                       */

-

-                       if (!r && use_baco) {

-                                   cpu = smp_processor_id();

-                                   list_for_each_entry(tmp_adev, device_list_handle,

-                                                               gmc.xgmi.head) {

-                                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {

-                                                           if (!queue_work_on(cpu,

-                                                                       system_highpri_wq,

-                                                                       &tmp_adev->xgmi_reset_work))

-                                                                       r = -EALREADY;

-                                                           if (r)

-                                                                       break;

-                                                           cpu = cpumask_next(cpu, cpu_online_mask);

-                                               }

+                                  if (r) {

+                                              DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

+                                                          r, tmp_adev->ddev->unique);

+                                              break;

                                   }

                       }



-                       if (!r && use_baco) {

+                      /* For XGMI wait for all PSP resets to complete before proceed */

+                      if (!r) {

                                   list_for_each_entry(tmp_adev, device_list_handle,

                                                               gmc.xgmi.head) {

                                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { @@ -3878,21 +3833,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,

                                                           r = tmp_adev->asic_reset_res;

                                                           if (r)

                                                                       break;

-                                                           tmp_adev->in_baco = false;

                                               }

                                   }

                       }

-

-                       if (r) {

-                                   DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

-                                               r, tmp_adev->ddev->unique);

-                                   goto end;

-                       }

           }



            if (!r && amdgpu_ras_intr_triggered())

                       amdgpu_ras_intr_cleared();



+

           list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

                       if (need_full_reset) {

                                   /* post card */

@@ -4181,8 +4130,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

                       if (r)

                                   adev->asic_reset_res = r;

           } else {

-                       r  = amdgpu_do_asic_reset(adev, hive, device_list_handle,

-                                                             &need_full_reset);

+                      r  = amdgpu_do_asic_reset(hive, device_list_handle,

+&need_full_reset);

                       if (r && r == -EAGAIN)

                                   goto retry;

           }

--

2.7.4



[-- Attachment #1.2: Type: text/html, Size: 40275 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
  2019-12-11 20:38   ` Andrey Grodzovsky
@ 2019-12-12  4:04     ` Ma, Le
  -1 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12  4:04 UTC (permalink / raw)
  To: Grodzovsky, Andrey, dri-devel, amd-gfx
  Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 5221 bytes --]

[AMD Official Use Only - Internal Distribution Only]






-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 2/5] drm: Add Reusable task barrier.



It is used to synchronize N threads at a rendevouz point before execution of critical code that has to be started by all the threads at approximatly the same time.



Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com<mailto:andrey.grodzovsky@amd.com>>

---

include/drm/task_barrier.h | 106 +++++++++++++++++++++++++++++++++++++++++++++

1 file changed, 106 insertions(+)

create mode 100644 include/drm/task_barrier.h



diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h new file mode 100644 index 0000000..81fb0f7

--- /dev/null

+++ b/include/drm/task_barrier.h

@@ -0,0 +1,106 @@

+/*

+ * Copyright 2019 Advanced Micro Devices, Inc.

+ *

+ * Permission is hereby granted, free of charge, to any person

+obtaining a

+ * copy of this software and associated documentation files (the

+"Software"),

+ * to deal in the Software without restriction, including without

+limitation

+ * the rights to use, copy, modify, merge, publish, distribute,

+sublicense,

+ * and/or sell copies of the Software, and to permit persons to whom

+the

+ * Software is furnished to do so, subject to the following conditions:

+ *

+ * The above copyright notice and this permission notice shall be

+included in

+ * all copies or substantial portions of the Software.

+ *

+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,

+EXPRESS OR

+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF

+MERCHANTABILITY,

+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT

+SHALL

+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,

+DAMAGES OR

+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR

+OTHERWISE,

+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE

+OR

+ * OTHER DEALINGS IN THE SOFTWARE.

+ *

+ */

+#include <linux/semaphore.h>

+#include <linux/atomic.h>

+

+/*

+ * Reusable 2 PHASE task barrier (randevouz point) implementation for N tasks.

+ * Based on the Little book of sempahores -

+https://greenteapress.com/wp/semaphores/

+ */

+

+

+

+#ifndef DRM_TASK_BARRIER_H_

+#define DRM_TASK_BARRIER_H_

+



[Le]: It might be better to prefix "drm_" to the functions and structure below, even this header file name.



+/*

+ * Represents an instance of a task barrier.

+ */

+struct task_barrier {

+          unsigned int n;

[Le]: We can define it as signed type here for more common use.

+          atomic_t count;

+          struct semaphore enter_turnstile;

+          struct semaphore exit_turnstile;

+};

+

+static inline void task_barrier_signal_turnstile(struct semaphore *turnstile,

+                                                                      unsigned int n)

+{

+          int i;

+

+          for (i = 0 ; i < n; i++)

+                      up(turnstile);

+}

+

+static inline void task_barrier_init(struct task_barrier *tb) {

+          tb->n = 0;

+          atomic_set(&tb->count, 0);

+          sema_init(&tb->enter_turnstile, 0);

+          sema_init(&tb->exit_turnstile, 0);

+}

+

+static inline void task_barrier_add_task(struct task_barrier *tb) {

+          tb->n++;

+}

+

+static inline void task_barrier_rem_task(struct task_barrier *tb) {

+          tb->n--;

+}

+

+/*

+ * Lines up all the threads BEFORE the critical point.

+ *

+ * When all thread passed this code the entry barrier is back to locked state.

+ */

+static inline void task_barrier_enter(struct task_barrier *tb) {

+          if (atomic_inc_return(&tb->count) == tb->n)

+                      task_barrier_signal_turnstile(&tb->enter_turnstile, tb->n);

+

+          down(&tb->enter_turnstile);

+}

+

+/*

+ * Lines up all the threads AFTER the critical point.

+ *

+ * This function is used to avoid any one thread running ahead of the

+reset if

[Le]: No need to mention "reset" here.



With the above addressed, Acked-by: Le Ma Le.Ma@amd.com<mailto:Le.Ma@amd.com>



Regards,

Ma Le

+ * the barrier is used in a loop (repeatedly) .

+ */

+static inline void task_barrier_exit(struct task_barrier *tb) {

+          if (atomic_dec_return(&tb->count) == 0)

+                      task_barrier_signal_turnstile(&tb->exit_turnstile, tb->n);

+

+          down(&tb->exit_turnstile);

+}

+

+static inline void task_barrier_full(struct task_barrier *tb) {

+          task_barrier_enter(tb);

+          task_barrier_exit(tb);

+}

+

+#endif

--

2.7.4



[-- Attachment #1.2: Type: text/html, Size: 16222 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
@ 2019-12-12  4:04     ` Ma, Le
  0 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12  4:04 UTC (permalink / raw)
  To: Grodzovsky, Andrey, dri-devel, amd-gfx
  Cc: Deucher, Alexander, Grodzovsky, Andrey, Quan, Evan, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 5221 bytes --]

[AMD Official Use Only - Internal Distribution Only]






-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 2/5] drm: Add Reusable task barrier.



It is used to synchronize N threads at a rendevouz point before execution of critical code that has to be started by all the threads at approximatly the same time.



Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com<mailto:andrey.grodzovsky@amd.com>>

---

include/drm/task_barrier.h | 106 +++++++++++++++++++++++++++++++++++++++++++++

1 file changed, 106 insertions(+)

create mode 100644 include/drm/task_barrier.h



diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h new file mode 100644 index 0000000..81fb0f7

--- /dev/null

+++ b/include/drm/task_barrier.h

@@ -0,0 +1,106 @@

+/*

+ * Copyright 2019 Advanced Micro Devices, Inc.

+ *

+ * Permission is hereby granted, free of charge, to any person

+obtaining a

+ * copy of this software and associated documentation files (the

+"Software"),

+ * to deal in the Software without restriction, including without

+limitation

+ * the rights to use, copy, modify, merge, publish, distribute,

+sublicense,

+ * and/or sell copies of the Software, and to permit persons to whom

+the

+ * Software is furnished to do so, subject to the following conditions:

+ *

+ * The above copyright notice and this permission notice shall be

+included in

+ * all copies or substantial portions of the Software.

+ *

+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,

+EXPRESS OR

+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF

+MERCHANTABILITY,

+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT

+SHALL

+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,

+DAMAGES OR

+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR

+OTHERWISE,

+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE

+OR

+ * OTHER DEALINGS IN THE SOFTWARE.

+ *

+ */

+#include <linux/semaphore.h>

+#include <linux/atomic.h>

+

+/*

+ * Reusable 2 PHASE task barrier (randevouz point) implementation for N tasks.

+ * Based on the Little book of sempahores -

+https://greenteapress.com/wp/semaphores/

+ */

+

+

+

+#ifndef DRM_TASK_BARRIER_H_

+#define DRM_TASK_BARRIER_H_

+



[Le]: It might be better to prefix "drm_" to the functions and structure below, even this header file name.



+/*

+ * Represents an instance of a task barrier.

+ */

+struct task_barrier {

+          unsigned int n;

[Le]: We can define it as signed type here for more common use.

+          atomic_t count;

+          struct semaphore enter_turnstile;

+          struct semaphore exit_turnstile;

+};

+

+static inline void task_barrier_signal_turnstile(struct semaphore *turnstile,

+                                                                      unsigned int n)

+{

+          int i;

+

+          for (i = 0 ; i < n; i++)

+                      up(turnstile);

+}

+

+static inline void task_barrier_init(struct task_barrier *tb) {

+          tb->n = 0;

+          atomic_set(&tb->count, 0);

+          sema_init(&tb->enter_turnstile, 0);

+          sema_init(&tb->exit_turnstile, 0);

+}

+

+static inline void task_barrier_add_task(struct task_barrier *tb) {

+          tb->n++;

+}

+

+static inline void task_barrier_rem_task(struct task_barrier *tb) {

+          tb->n--;

+}

+

+/*

+ * Lines up all the threads BEFORE the critical point.

+ *

+ * When all thread passed this code the entry barrier is back to locked state.

+ */

+static inline void task_barrier_enter(struct task_barrier *tb) {

+          if (atomic_inc_return(&tb->count) == tb->n)

+                      task_barrier_signal_turnstile(&tb->enter_turnstile, tb->n);

+

+          down(&tb->enter_turnstile);

+}

+

+/*

+ * Lines up all the threads AFTER the critical point.

+ *

+ * This function is used to avoid any one thread running ahead of the

+reset if

[Le]: No need to mention "reset" here.



With the above addressed, Acked-by: Le Ma Le.Ma@amd.com<mailto:Le.Ma@amd.com>



Regards,

Ma Le

+ * the barrier is used in a loop (repeatedly) .

+ */

+static inline void task_barrier_exit(struct task_barrier *tb) {

+          if (atomic_dec_return(&tb->count) == 0)

+                      task_barrier_signal_turnstile(&tb->exit_turnstile, tb->n);

+

+          down(&tb->exit_turnstile);

+}

+

+static inline void task_barrier_full(struct task_barrier *tb) {

+          task_barrier_enter(tb);

+          task_barrier_exit(tb);

+}

+

+#endif

--

2.7.4



[-- Attachment #1.2: Type: text/html, Size: 16222 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive.
  2019-12-11 20:38   ` Andrey Grodzovsky
@ 2019-12-12  4:04     ` Ma, Le
  -1 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12  4:04 UTC (permalink / raw)
  To: Grodzovsky, Andrey, dri-devel, amd-gfx
  Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking

[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Le Ma <Le.Ma@amd.com>

Regards,
Ma Le

-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com> 
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++++  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 61d13d8..5cf920d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -261,6 +261,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 	INIT_LIST_HEAD(&tmp->device_list);
 	mutex_init(&tmp->hive_lock);
 	mutex_init(&tmp->reset_lock);
+	task_barrier_init(&tmp->tb);
 
 	if (lock)
 		mutex_lock(&tmp->hive_lock);
@@ -408,6 +409,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 	top_info->num_nodes = count;
 	hive->number_devices = count;
 
+	task_barrier_add_task(&hive->tb);
+
 	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			/* update node list for other device in the hive */ @@ -470,6 +473,7 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 		mutex_destroy(&hive->hive_lock);
 		mutex_destroy(&hive->reset_lock);
 	} else {
+		task_barrier_rem_task(&hive->tb);
 		amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
 		mutex_unlock(&hive->hive_lock);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index bbf504f..74011fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -22,6 +22,7 @@
 #ifndef __AMDGPU_XGMI_H__
 #define __AMDGPU_XGMI_H__
 
+#include <drm/task_barrier.h>
 #include "amdgpu_psp.h"
 
 struct amdgpu_hive_info {
@@ -33,6 +34,7 @@ struct amdgpu_hive_info {
 	struct device_attribute dev_attr;
 	struct amdgpu_device *adev;
 	int pstate; /*0 -- low , 1 -- high , -1 unknown*/
+	struct task_barrier tb;
 };
 
 struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
--
2.7.4
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* RE: [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive.
@ 2019-12-12  4:04     ` Ma, Le
  0 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12  4:04 UTC (permalink / raw)
  To: Grodzovsky, Andrey, dri-devel, amd-gfx
  Cc: Deucher, Alexander, Grodzovsky, Andrey, Quan, Evan, Zhang, Hawking

[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Le Ma <Le.Ma@amd.com>

Regards,
Ma Le

-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com> 
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++++  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 61d13d8..5cf920d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -261,6 +261,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 	INIT_LIST_HEAD(&tmp->device_list);
 	mutex_init(&tmp->hive_lock);
 	mutex_init(&tmp->reset_lock);
+	task_barrier_init(&tmp->tb);
 
 	if (lock)
 		mutex_lock(&tmp->hive_lock);
@@ -408,6 +409,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 	top_info->num_nodes = count;
 	hive->number_devices = count;
 
+	task_barrier_add_task(&hive->tb);
+
 	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			/* update node list for other device in the hive */ @@ -470,6 +473,7 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 		mutex_destroy(&hive->hive_lock);
 		mutex_destroy(&hive->reset_lock);
 	} else {
+		task_barrier_rem_task(&hive->tb);
 		amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
 		mutex_unlock(&hive->hive_lock);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index bbf504f..74011fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -22,6 +22,7 @@
 #ifndef __AMDGPU_XGMI_H__
 #define __AMDGPU_XGMI_H__
 
+#include <drm/task_barrier.h>
 #include "amdgpu_psp.h"
 
 struct amdgpu_hive_info {
@@ -33,6 +34,7 @@ struct amdgpu_hive_info {
 	struct device_attribute dev_attr;
 	struct amdgpu_device *adev;
 	int pstate; /*0 -- low , 1 -- high , -1 unknown*/
+	struct task_barrier tb;
 };
 
 struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* RE: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
  2019-12-11 20:38   ` Andrey Grodzovsky
@ 2019-12-12  4:05     ` Ma, Le
  -1 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12  4:05 UTC (permalink / raw)
  To: Grodzovsky, Andrey, dri-devel, amd-gfx
  Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 4235 bytes --]

[AMD Official Use Only - Internal Distribution Only]






-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.



Use task barrier in XGMI hive to synchronize ASIC resets across devices in XGMI hive.



Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com<mailto:andrey.grodzovsky@amd.com>>

---

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42 +++++++++++++++++++++++++-----

1 file changed, 36 insertions(+), 6 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 1d19edfa..e4089a0 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

@@ -67,6 +67,7 @@

#include "amdgpu_tmz.h"



 #include <linux/suspend.h>

+#include <drm/task_barrier.h>



 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");

MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");

@@ -2663,14 +2664,43 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)  {

           struct amdgpu_device *adev =

                       container_of(__work, struct amdgpu_device, xgmi_reset_work);

+          struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);



-           if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

-                       adev->asic_reset_res = (adev->in_baco == false) ?

-                                               amdgpu_device_baco_enter(adev->ddev) :

-                                               qamdgpu_device_baco_exit(adev->ddev);

-           else

-                       adev->asic_reset_res = amdgpu_asic_reset(adev);

+          /*

+          * Use task barrier to synchronize all xgmi reset works across the

+          * hive.

+          * task_barrier_enter and task_barrier_exit will block untill all the

+          * threads running the xgmi reset works reach those points. I assume

+          * guarantee of progress here for all the threads as the workqueue code

+          * creates new worker threads as needed by amount of work items in queue

+          * (see worker_thread) and also each thread sleeps in the barrir and by

+          * this yielding the CPU for other work threads to make progress.

+          */

[Le]: This comments can be adjusted since we switch to system_unbound_wq in patch #5.

+          if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

+

+                      if (hive)

+                                  task_barrier_enter(&hive->tb);

[Le]: The multiple hive condition can be checked only once and moved to the location right after the assignment.

+

+                      adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);

+

+                      if (adev->asic_reset_res)

+                                  goto fail;

+

+                      if (hive)

+                                  task_barrier_exit(&hive->tb);

[Le]: Same as above.

+

+                      adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);

+

+                      if (adev->asic_reset_res)

+                                  goto fail;

+          } else {

+                      if (hive)

+                                  task_barrier_full(&hive->tb);

[Le]: Same as above.



With above addressed, Reviewed-by: Le Ma <Le.Ma@amd.com<mailto:Le.Ma@amd.com>>



Regards,

Ma Le

+

+                      adev->asic_reset_res =  amdgpu_asic_reset(adev);

+          }



+fail:

           if (adev->asic_reset_res)

                       DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

                                    adev->asic_reset_res, adev->ddev->unique);

--

2.7.4



[-- Attachment #1.2: Type: text/html, Size: 14702 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
@ 2019-12-12  4:05     ` Ma, Le
  0 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12  4:05 UTC (permalink / raw)
  To: Grodzovsky, Andrey, dri-devel, amd-gfx
  Cc: Deucher, Alexander, Grodzovsky, Andrey, Quan, Evan, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 4235 bytes --]

[AMD Official Use Only - Internal Distribution Only]






-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.



Use task barrier in XGMI hive to synchronize ASIC resets across devices in XGMI hive.



Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com<mailto:andrey.grodzovsky@amd.com>>

---

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42 +++++++++++++++++++++++++-----

1 file changed, 36 insertions(+), 6 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 1d19edfa..e4089a0 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

@@ -67,6 +67,7 @@

#include "amdgpu_tmz.h"



 #include <linux/suspend.h>

+#include <drm/task_barrier.h>



 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");

MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");

@@ -2663,14 +2664,43 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)  {

           struct amdgpu_device *adev =

                       container_of(__work, struct amdgpu_device, xgmi_reset_work);

+          struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);



-           if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

-                       adev->asic_reset_res = (adev->in_baco == false) ?

-                                               amdgpu_device_baco_enter(adev->ddev) :

-                                               qamdgpu_device_baco_exit(adev->ddev);

-           else

-                       adev->asic_reset_res = amdgpu_asic_reset(adev);

+          /*

+          * Use task barrier to synchronize all xgmi reset works across the

+          * hive.

+          * task_barrier_enter and task_barrier_exit will block untill all the

+          * threads running the xgmi reset works reach those points. I assume

+          * guarantee of progress here for all the threads as the workqueue code

+          * creates new worker threads as needed by amount of work items in queue

+          * (see worker_thread) and also each thread sleeps in the barrir and by

+          * this yielding the CPU for other work threads to make progress.

+          */

[Le]: This comments can be adjusted since we switch to system_unbound_wq in patch #5.

+          if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

+

+                      if (hive)

+                                  task_barrier_enter(&hive->tb);

[Le]: The multiple hive condition can be checked only once and moved to the location right after the assignment.

+

+                      adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);

+

+                      if (adev->asic_reset_res)

+                                  goto fail;

+

+                      if (hive)

+                                  task_barrier_exit(&hive->tb);

[Le]: Same as above.

+

+                      adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);

+

+                      if (adev->asic_reset_res)

+                                  goto fail;

+          } else {

+                      if (hive)

+                                  task_barrier_full(&hive->tb);

[Le]: Same as above.



With above addressed, Reviewed-by: Le Ma <Le.Ma@amd.com<mailto:Le.Ma@amd.com>>



Regards,

Ma Le

+

+                      adev->asic_reset_res =  amdgpu_asic_reset(adev);

+          }



+fail:

           if (adev->asic_reset_res)

                       DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

                                    adev->asic_reset_res, adev->ddev->unique);

--

2.7.4



[-- Attachment #1.2: Type: text/html, Size: 14702 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq
  2019-12-11 20:38   ` Andrey Grodzovsky
@ 2019-12-12  4:05     ` Ma, Le
  -1 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12  4:05 UTC (permalink / raw)
  To: Grodzovsky, Andrey, dri-devel, amd-gfx
  Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking

[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Le Ma <Le.Ma@amd.com>

Regards,
Ma Le

-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com> 
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq

This is to avoid queueing jobs to same CPU during XGMI hive reset because there is a strict timeline for when the reset commands must reach all the GPUs in the hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e4089a0..1518565 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3842,7 +3842,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
 			} else
 				r = amdgpu_asic_reset(tmp_adev);
--
2.7.4
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* RE: [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq
@ 2019-12-12  4:05     ` Ma, Le
  0 siblings, 0 replies; 24+ messages in thread
From: Ma, Le @ 2019-12-12  4:05 UTC (permalink / raw)
  To: Grodzovsky, Andrey, dri-devel, amd-gfx
  Cc: Deucher, Alexander, Grodzovsky, Andrey, Quan, Evan, Zhang, Hawking

[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Le Ma <Le.Ma@amd.com>

Regards,
Ma Le

-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com> 
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq

This is to avoid queueing jobs to same CPU during XGMI hive reset because there is a strict timeline for when the reset commands must reach all the GPUs in the hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e4089a0..1518565 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3842,7 +3842,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
 			} else
 				r = amdgpu_asic_reset(tmp_adev);
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
  2019-12-12  4:04     ` Ma, Le
@ 2019-12-12 15:07       ` Andrey Grodzovsky
  -1 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-12 15:07 UTC (permalink / raw)
  To: Ma, Le, dri-devel, amd-gfx; +Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 5816 bytes --]


On 12/11/19 11:04 PM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> -----Original Message-----
> From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> Sent: Thursday, December 12, 2019 4:39 AM
> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le 
> <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan 
> <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Subject: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
>
> It is used to synchronize N threads at a rendevouz point before 
> execution of critical code that has to be started by all the threads 
> at approximatly the same time.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com 
> <mailto:andrey.grodzovsky@amd.com>>
>
> ---
>
> include/drm/task_barrier.h | 106 
> +++++++++++++++++++++++++++++++++++++++++++++
>
> 1 file changed, 106 insertions(+)
>
> create mode 100644 include/drm/task_barrier.h
>
> diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h 
> new file mode 100644 index 0000000..81fb0f7
>
> --- /dev/null
>
> +++ b/include/drm/task_barrier.h
>
> @@ -0,0 +1,106 @@
>
> +/*
>
> + * Copyright 2019 Advanced Micro Devices, Inc.
>
> + *
>
> + * Permission is hereby granted, free of charge, to any person
>
> +obtaining a
>
> + * copy of this software and associated documentation files (the
>
> +"Software"),
>
> + * to deal in the Software without restriction, including without
>
> +limitation
>
> + * the rights to use, copy, modify, merge, publish, distribute,
>
> +sublicense,
>
> + * and/or sell copies of the Software, and to permit persons to whom
>
> +the
>
> + * Software is furnished to do so, subject to the following conditions:
>
> + *
>
> + * The above copyright notice and this permission notice shall be
>
> +included in
>
> + * all copies or substantial portions of the Software.
>
> + *
>
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>
> +EXPRESS OR
>
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>
> +MERCHANTABILITY,
>
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
>
> +SHALL
>
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
>
> +DAMAGES OR
>
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
>
> +OTHERWISE,
>
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
>
> +OR
>
> + * OTHER DEALINGS IN THE SOFTWARE.
>
> + *
>
> + */
>
> +#include <linux/semaphore.h>
>
> +#include <linux/atomic.h>
>
> +
>
> +/*
>
> + * Reusable 2 PHASE task barrier (randevouz point) implementation for 
> N tasks.
>
> + * Based on the Little book of sempahores -
>
> +https://greenteapress.com/wp/semaphores/
>
> + */
>
> +
>
> +
>
> +
>
> +#ifndef DRM_TASK_BARRIER_H_
>
> +#define DRM_TASK_BARRIER_H_
>
> +
>
> [Le]: It might be better to prefix “drm_” to the functions and 
> structure below, even this header file name.
>

I am not sure about this - see the example of spsc_queue we added for 
GPU scheduler use. I just followed it as an example of where to place 
the structure. There is nothing DRM specific about spsc_queue or 
task_barrier, they are generic constructs that we place in DRM subsystem 
for common use.


> +/*
>
> + * Represents an instance of a task barrier.
>
> + */
>
> +struct task_barrier {
>
> +          unsigned int n;
>
> [Le]: We can define it as signed type here for more common use.
>

This is a counter of number of tasks/threads to synchronize in the 
barrier it cannot go bellow 0

Andrey


> +          atomic_t count;
>
> +          struct semaphore enter_turnstile;
>
> +          struct semaphore exit_turnstile;
>
> +};
>
> +
>
> +static inline void task_barrier_signal_turnstile(struct semaphore 
> *turnstile,
>
> + unsigned int n)
>
> +{
>
> +          int i;
>
> +
>
> +          for (i = 0 ; i < n; i++)
>
> +                      up(turnstile);
>
> +}
>
> +
>
> +static inline void task_barrier_init(struct task_barrier *tb) {
>
> +          tb->n = 0;
>
> +          atomic_set(&tb->count, 0);
>
> + sema_init(&tb->enter_turnstile, 0);
>
> + sema_init(&tb->exit_turnstile, 0);
>
> +}
>
> +
>
> +static inline void task_barrier_add_task(struct task_barrier *tb) {
>
> +          tb->n++;
>
> +}
>
> +
>
> +static inline void task_barrier_rem_task(struct task_barrier *tb) {
>
> +          tb->n--;
>
> +}
>
> +
>
> +/*
>
> + * Lines up all the threads BEFORE the critical point.
>
> + *
>
> + * When all thread passed this code the entry barrier is back to 
> locked state.
>
> + */
>
> +static inline void task_barrier_enter(struct task_barrier *tb) {
>
> +          if (atomic_inc_return(&tb->count) == tb->n)
>
> + task_barrier_signal_turnstile(&tb->enter_turnstile, tb->n);
>
> +
>
> + down(&tb->enter_turnstile);
>
> +}
>
> +
>
> +/*
>
> + * Lines up all the threads AFTER the critical point.
>
> + *
>
> + * This function is used to avoid any one thread running ahead of the
>
> +reset if
>
> [Le]: No need to mention “reset” here.
>
> With the above addressed, Acked-by: Le Ma Le.Ma@amd.com 
> <mailto:Le.Ma@amd.com>
>
> Regards,
>
> Ma Le
>
> + * the barrier is used in a loop (repeatedly) .
>
> + */
>
> +static inline void task_barrier_exit(struct task_barrier *tb) {
>
> +          if (atomic_dec_return(&tb->count) == 0)
>
> + task_barrier_signal_turnstile(&tb->exit_turnstile, tb->n);
>
> +
>
> + down(&tb->exit_turnstile);
>
> +}
>
> +
>
> +static inline void task_barrier_full(struct task_barrier *tb) {
>
> +          task_barrier_enter(tb);
>
> +          task_barrier_exit(tb);
>
> +}
>
> +
>
> +#endif
>
> --
>
> 2.7.4
>

[-- Attachment #1.2: Type: text/html, Size: 19692 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
@ 2019-12-12 15:07       ` Andrey Grodzovsky
  0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-12 15:07 UTC (permalink / raw)
  To: Ma, Le, dri-devel, amd-gfx; +Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 5816 bytes --]


On 12/11/19 11:04 PM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> -----Original Message-----
> From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> Sent: Thursday, December 12, 2019 4:39 AM
> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le 
> <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan 
> <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Subject: [RESEND PATCH 2/5] drm: Add Reusable task barrier.
>
> It is used to synchronize N threads at a rendevouz point before 
> execution of critical code that has to be started by all the threads 
> at approximatly the same time.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com 
> <mailto:andrey.grodzovsky@amd.com>>
>
> ---
>
> include/drm/task_barrier.h | 106 
> +++++++++++++++++++++++++++++++++++++++++++++
>
> 1 file changed, 106 insertions(+)
>
> create mode 100644 include/drm/task_barrier.h
>
> diff --git a/include/drm/task_barrier.h b/include/drm/task_barrier.h 
> new file mode 100644 index 0000000..81fb0f7
>
> --- /dev/null
>
> +++ b/include/drm/task_barrier.h
>
> @@ -0,0 +1,106 @@
>
> +/*
>
> + * Copyright 2019 Advanced Micro Devices, Inc.
>
> + *
>
> + * Permission is hereby granted, free of charge, to any person
>
> +obtaining a
>
> + * copy of this software and associated documentation files (the
>
> +"Software"),
>
> + * to deal in the Software without restriction, including without
>
> +limitation
>
> + * the rights to use, copy, modify, merge, publish, distribute,
>
> +sublicense,
>
> + * and/or sell copies of the Software, and to permit persons to whom
>
> +the
>
> + * Software is furnished to do so, subject to the following conditions:
>
> + *
>
> + * The above copyright notice and this permission notice shall be
>
> +included in
>
> + * all copies or substantial portions of the Software.
>
> + *
>
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>
> +EXPRESS OR
>
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>
> +MERCHANTABILITY,
>
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
>
> +SHALL
>
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
>
> +DAMAGES OR
>
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
>
> +OTHERWISE,
>
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
>
> +OR
>
> + * OTHER DEALINGS IN THE SOFTWARE.
>
> + *
>
> + */
>
> +#include <linux/semaphore.h>
>
> +#include <linux/atomic.h>
>
> +
>
> +/*
>
> + * Reusable 2 PHASE task barrier (randevouz point) implementation for 
> N tasks.
>
> + * Based on the Little book of sempahores -
>
> +https://greenteapress.com/wp/semaphores/
>
> + */
>
> +
>
> +
>
> +
>
> +#ifndef DRM_TASK_BARRIER_H_
>
> +#define DRM_TASK_BARRIER_H_
>
> +
>
> [Le]: It might be better to prefix “drm_” to the functions and 
> structure below, even this header file name.
>

I am not sure about this - see the example of spsc_queue we added for 
GPU scheduler use. I just followed it as an example of where to place 
the structure. There is nothing DRM specific about spsc_queue or 
task_barrier, they are generic constructs that we place in DRM subsystem 
for common use.


> +/*
>
> + * Represents an instance of a task barrier.
>
> + */
>
> +struct task_barrier {
>
> +          unsigned int n;
>
> [Le]: We can define it as signed type here for more common use.
>

This is a counter of number of tasks/threads to synchronize in the 
barrier it cannot go bellow 0

Andrey


> +          atomic_t count;
>
> +          struct semaphore enter_turnstile;
>
> +          struct semaphore exit_turnstile;
>
> +};
>
> +
>
> +static inline void task_barrier_signal_turnstile(struct semaphore 
> *turnstile,
>
> + unsigned int n)
>
> +{
>
> +          int i;
>
> +
>
> +          for (i = 0 ; i < n; i++)
>
> +                      up(turnstile);
>
> +}
>
> +
>
> +static inline void task_barrier_init(struct task_barrier *tb) {
>
> +          tb->n = 0;
>
> +          atomic_set(&tb->count, 0);
>
> + sema_init(&tb->enter_turnstile, 0);
>
> + sema_init(&tb->exit_turnstile, 0);
>
> +}
>
> +
>
> +static inline void task_barrier_add_task(struct task_barrier *tb) {
>
> +          tb->n++;
>
> +}
>
> +
>
> +static inline void task_barrier_rem_task(struct task_barrier *tb) {
>
> +          tb->n--;
>
> +}
>
> +
>
> +/*
>
> + * Lines up all the threads BEFORE the critical point.
>
> + *
>
> + * When all thread passed this code the entry barrier is back to 
> locked state.
>
> + */
>
> +static inline void task_barrier_enter(struct task_barrier *tb) {
>
> +          if (atomic_inc_return(&tb->count) == tb->n)
>
> + task_barrier_signal_turnstile(&tb->enter_turnstile, tb->n);
>
> +
>
> + down(&tb->enter_turnstile);
>
> +}
>
> +
>
> +/*
>
> + * Lines up all the threads AFTER the critical point.
>
> + *
>
> + * This function is used to avoid any one thread running ahead of the
>
> +reset if
>
> [Le]: No need to mention “reset” here.
>
> With the above addressed, Acked-by: Le Ma Le.Ma@amd.com 
> <mailto:Le.Ma@amd.com>
>
> Regards,
>
> Ma Le
>
> + * the barrier is used in a loop (repeatedly) .
>
> + */
>
> +static inline void task_barrier_exit(struct task_barrier *tb) {
>
> +          if (atomic_dec_return(&tb->count) == 0)
>
> + task_barrier_signal_turnstile(&tb->exit_turnstile, tb->n);
>
> +
>
> + down(&tb->exit_turnstile);
>
> +}
>
> +
>
> +static inline void task_barrier_full(struct task_barrier *tb) {
>
> +          task_barrier_enter(tb);
>
> +          task_barrier_exit(tb);
>
> +}
>
> +
>
> +#endif
>
> --
>
> 2.7.4
>

[-- Attachment #1.2: Type: text/html, Size: 19692 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
  2019-12-12  4:05     ` Ma, Le
@ 2019-12-12 15:20       ` Andrey Grodzovsky
  -1 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-12 15:20 UTC (permalink / raw)
  To: Ma, Le, dri-devel, amd-gfx; +Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 4659 bytes --]


On 12/11/19 11:05 PM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> -----Original Message-----
> From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> Sent: Thursday, December 12, 2019 4:39 AM
> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le 
> <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan 
> <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Subject: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset 
> synchronization.
>
> Use task barrier in XGMI hive to synchronize ASIC resets across 
> devices in XGMI hive.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com 
> <mailto:andrey.grodzovsky@amd.com>>
>
> ---
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42 
> +++++++++++++++++++++++++-----
>
> 1 file changed, 36 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> index 1d19edfa..e4089a0 100644
>
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> @@ -67,6 +67,7 @@
>
> #include "amdgpu_tmz.h"
>
>  #include <linux/suspend.h>
>
> +#include <drm/task_barrier.h>
>
>  MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>
> MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>
> @@ -2663,14 +2664,43 @@ static void 
> amdgpu_device_xgmi_reset_func(struct work_struct *__work)  {
>
>            struct amdgpu_device *adev =
>
> container_of(__work, struct amdgpu_device, xgmi_reset_work);
>
> +          struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
>
> -           if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>
> - adev->asic_reset_res = (adev->in_baco == false) ?
>
> - amdgpu_device_baco_enter(adev->ddev) :
>
> - qamdgpu_device_baco_exit(adev->ddev);
>
> -           else
>
> - adev->asic_reset_res = amdgpu_asic_reset(adev);
>
> +          /*
>
> +          * Use task barrier to synchronize all xgmi reset works 
> across the
>
> +          * hive.
>
> +          * task_barrier_enter and task_barrier_exit will block 
> untill all the
>
> +          * threads running the xgmi reset works reach those points. 
> I assume
>
> +          * guarantee of progress here for all the threads as the 
> workqueue code
>
> +          * creates new worker threads as needed by amount of work 
> items in queue
>
> +          * (see worker_thread) and also each thread sleeps in the 
> barrir and by
>
> +          * this yielding the CPU for other work threads to make 
> progress.
>
> +          */
>
> [Le]: This comments can be adjusted since we switch to 
> system_unbound_wq in patch #5.
>
> +          if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
>
> +
>
> +                      if (hive)
>
> + task_barrier_enter(&hive->tb);
>
> [Le]: The multiple hive condition can be checked only once and moved 
> to the location right after the assignment.
>

Not sure what you meant here but in fact let's note that while in 
amdgpu_device_xgmi_reset_func it's a bug for amdgpu_get_xgmi_hive to 
return NULL so I think better instead to add WARN_ON(!hive,"...") and 
return right at the beginning of the function if indeed hive == NULL

Andrey


> +
>
> + adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
>
> +
>
> +                      if (adev->asic_reset_res)
>
> +                                  goto fail;
>
> +
>
> +                      if (hive)
>
> + task_barrier_exit(&hive->tb);
>
> [Le]: Same as above.
>
> +
>
> + adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
>
> +
>
> +                      if (adev->asic_reset_res)
>
> +                                  goto fail;
>
> +          } else {
>
> +                      if (hive)
>
> + task_barrier_full(&hive->tb);
>
> [Le]: Same as above.
>
> With above addressed, Reviewed-by: Le Ma <Le.Ma@amd.com 
> <mailto:Le.Ma@amd.com>>
>
> Regards,
>
> Ma Le
>
> +
>
> + adev->asic_reset_res =  amdgpu_asic_reset(adev);
>
> +          }
>
> +fail:
>
>            if (adev->asic_reset_res)
>
>                        DRM_WARN("ASIC reset failed with error, %d for 
> drm dev, %s",
>
>  adev->asic_reset_res, adev->ddev->unique);
>
> --
>
> 2.7.4
>

[-- Attachment #1.2: Type: text/html, Size: 17123 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.
@ 2019-12-12 15:20       ` Andrey Grodzovsky
  0 siblings, 0 replies; 24+ messages in thread
From: Andrey Grodzovsky @ 2019-12-12 15:20 UTC (permalink / raw)
  To: Ma, Le, dri-devel, amd-gfx; +Cc: Deucher, Alexander, Quan, Evan, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 4659 bytes --]


On 12/11/19 11:05 PM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> -----Original Message-----
> From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> Sent: Thursday, December 12, 2019 4:39 AM
> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le 
> <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan 
> <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Subject: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset 
> synchronization.
>
> Use task barrier in XGMI hive to synchronize ASIC resets across 
> devices in XGMI hive.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com 
> <mailto:andrey.grodzovsky@amd.com>>
>
> ---
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42 
> +++++++++++++++++++++++++-----
>
> 1 file changed, 36 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> index 1d19edfa..e4089a0 100644
>
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> @@ -67,6 +67,7 @@
>
> #include "amdgpu_tmz.h"
>
>  #include <linux/suspend.h>
>
> +#include <drm/task_barrier.h>
>
>  MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>
> MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>
> @@ -2663,14 +2664,43 @@ static void 
> amdgpu_device_xgmi_reset_func(struct work_struct *__work)  {
>
>            struct amdgpu_device *adev =
>
> container_of(__work, struct amdgpu_device, xgmi_reset_work);
>
> +          struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
>
> -           if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>
> - adev->asic_reset_res = (adev->in_baco == false) ?
>
> - amdgpu_device_baco_enter(adev->ddev) :
>
> - qamdgpu_device_baco_exit(adev->ddev);
>
> -           else
>
> - adev->asic_reset_res = amdgpu_asic_reset(adev);
>
> +          /*
>
> +          * Use task barrier to synchronize all xgmi reset works 
> across the
>
> +          * hive.
>
> +          * task_barrier_enter and task_barrier_exit will block 
> untill all the
>
> +          * threads running the xgmi reset works reach those points. 
> I assume
>
> +          * guarantee of progress here for all the threads as the 
> workqueue code
>
> +          * creates new worker threads as needed by amount of work 
> items in queue
>
> +          * (see worker_thread) and also each thread sleeps in the 
> barrir and by
>
> +          * this yielding the CPU for other work threads to make 
> progress.
>
> +          */
>
> [Le]: This comments can be adjusted since we switch to 
> system_unbound_wq in patch #5.
>
> +          if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
>
> +
>
> +                      if (hive)
>
> + task_barrier_enter(&hive->tb);
>
> [Le]: The multiple hive condition can be checked only once and moved 
> to the location right after the assignment.
>

Not sure what you meant here but in fact let's note that while in 
amdgpu_device_xgmi_reset_func it's a bug for amdgpu_get_xgmi_hive to 
return NULL so I think better instead to add WARN_ON(!hive,"...") and 
return right at the beginning of the function if indeed hive == NULL

Andrey


> +
>
> + adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
>
> +
>
> +                      if (adev->asic_reset_res)
>
> +                                  goto fail;
>
> +
>
> +                      if (hive)
>
> + task_barrier_exit(&hive->tb);
>
> [Le]: Same as above.
>
> +
>
> + adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
>
> +
>
> +                      if (adev->asic_reset_res)
>
> +                                  goto fail;
>
> +          } else {
>
> +                      if (hive)
>
> + task_barrier_full(&hive->tb);
>
> [Le]: Same as above.
>
> With above addressed, Reviewed-by: Le Ma <Le.Ma@amd.com 
> <mailto:Le.Ma@amd.com>>
>
> Regards,
>
> Ma Le
>
> +
>
> + adev->asic_reset_res =  amdgpu_asic_reset(adev);
>
> +          }
>
> +fail:
>
>            if (adev->asic_reset_res)
>
>                        DRM_WARN("ASIC reset failed with error, %d for 
> drm dev, %s",
>
>  adev->asic_reset_res, adev->ddev->unique);
>
> --
>
> 2.7.4
>

[-- Attachment #1.2: Type: text/html, Size: 17123 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2019-12-12 15:20 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-11 20:38 [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d Andrey Grodzovsky
2019-12-11 20:38 ` Andrey Grodzovsky
2019-12-11 20:38 ` [RESEND PATCH 2/5] drm: Add Reusable task barrier Andrey Grodzovsky
2019-12-11 20:38   ` Andrey Grodzovsky
2019-12-12  4:04   ` Ma, Le
2019-12-12  4:04     ` Ma, Le
2019-12-12 15:07     ` Andrey Grodzovsky
2019-12-12 15:07       ` Andrey Grodzovsky
2019-12-11 20:38 ` [RESEND PATCH 3/5] drm/amdgpu: Add task barrier to XGMI hive Andrey Grodzovsky
2019-12-11 20:38   ` Andrey Grodzovsky
2019-12-12  4:04   ` Ma, Le
2019-12-12  4:04     ` Ma, Le
2019-12-11 20:38 ` [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization Andrey Grodzovsky
2019-12-11 20:38   ` Andrey Grodzovsky
2019-12-12  4:05   ` Ma, Le
2019-12-12  4:05     ` Ma, Le
2019-12-12 15:20     ` Andrey Grodzovsky
2019-12-12 15:20       ` Andrey Grodzovsky
2019-12-11 20:38 ` [RESEND PATCH 5/5] drm/amdgpu: Switch from system_highpri_wq to system_unbound_wq Andrey Grodzovsky
2019-12-11 20:38   ` Andrey Grodzovsky
2019-12-12  4:05   ` Ma, Le
2019-12-12  4:05     ` Ma, Le
2019-12-12  4:04 ` [RESEND PATCH 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d Ma, Le
2019-12-12  4:04   ` Ma, Le

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.