All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8)
@ 2022-10-18  9:08 jiadong.zhu
  2022-10-18  9:08 ` [PATCH 2/5] drm/amdgpu: Add software ring callbacks for gfx9 (v8) jiadong.zhu
                   ` (6 more replies)
  0 siblings, 7 replies; 29+ messages in thread
From: jiadong.zhu @ 2022-10-18  9:08 UTC (permalink / raw)
  To: amd-gfx
  Cc: Andrey Grodzovsky, Michel Dänzer, Huang Rui, Luben Tuikov,
	Jiadong.Zhu, Christian Koenig

From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>

The software ring is created to support priority context while there is only
one hardware queue for gfx.

Every software ring has its fence driver and could be used as an ordinary ring
for the GPU scheduler.
Multiple software rings are bound to a real ring with the ring muxer. The
packages committed on the software ring are copied to the real ring.

v2: Use array to store software ring entry.
v3: Remove unnecessary prints.
v4: Remove amdgpu_ring_sw_init/fini functions,
using gtt for sw ring buffer for later dma copy
optimization.
v5: Allocate ring entry dynamically in the muxer.
v6: Update comments for the ring muxer.
v7: Modify for function naming.
v8: Combine software ring functions into amdgpu_ring_mux.c

Cc: Christian Koenig <Christian.Koenig@amd.com>
Cc: Luben Tuikov <Luben.Tuikov@amd.com>
Cc: Andrey Grodzovsky  <Andrey.Grodzovsky@amd.com>
Cc: Michel Dänzer <michel@daenzer.net>
Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
Acked-by: Huang Rui <ray.huang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/Makefile          |   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h      |   3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   4 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 217 +++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  76 +++++++
 5 files changed, 302 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index 3e0e2eb7e235..add7da53950c 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -58,7 +58,8 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
 	amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
 	amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
 	amdgpu_fw_attestation.o amdgpu_securedisplay.o \
-	amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o
+	amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
+	amdgpu_ring_mux.o
 
 amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 53526ffb2ce1..9996dadb39f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -33,6 +33,7 @@
 #include "amdgpu_imu.h"
 #include "soc15.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_ring_mux.h"
 
 /* GFX current status */
 #define AMDGPU_GFX_NORMAL_MODE			0x00000000L
@@ -346,6 +347,8 @@ struct amdgpu_gfx {
 	struct amdgpu_gfx_ras		*ras;
 
 	bool				is_poweron;
+
+	struct amdgpu_ring_mux          muxer;
 };
 
 #define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs->get_gpu_clock_counter((adev))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 7d89a52091c0..40b1277b4f0c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -278,6 +278,10 @@ struct amdgpu_ring {
 	bool			is_mes_queue;
 	uint32_t		hw_queue_id;
 	struct amdgpu_mes_ctx_data *mes_ctx;
+
+	bool            is_sw_ring;
+	unsigned int    entry_index;
+
 };
 
 #define amdgpu_ring_parse_cs(r, p, job, ib) ((r)->funcs->parse_cs((p), (job), (ib)))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
new file mode 100644
index 000000000000..43cab8a37754
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include <linux/slab.h>
+#include <drm/drm_print.h>
+
+#include "amdgpu_ring_mux.h"
+#include "amdgpu_ring.h"
+#include "amdgpu.h"
+
+#define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
+
+int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
+			 unsigned int entry_size)
+{
+	mux->real_ring = ring;
+	mux->num_ring_entries = 0;
+	mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
+	if (!mux->ring_entry)
+		return -ENOMEM;
+
+	mux->ring_entry_size = entry_size;
+	spin_lock_init(&mux->lock);
+
+	return 0;
+}
+
+void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
+{
+	kfree(mux->ring_entry);
+	mux->ring_entry = NULL;
+	mux->num_ring_entries = 0;
+	mux->ring_entry_size = 0;
+}
+
+int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	struct amdgpu_mux_entry *e;
+
+	if (mux->num_ring_entries >= mux->ring_entry_size) {
+		DRM_ERROR("add sw ring exceeding max entry size\n");
+		return -ENOENT;
+	}
+
+	e = &mux->ring_entry[mux->num_ring_entries];
+	ring->entry_index = mux->num_ring_entries;
+	e->ring = ring;
+
+	mux->num_ring_entries += 1;
+	return 0;
+}
+
+static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
+								struct amdgpu_ring *ring)
+{
+	return ring->entry_index < mux->ring_entry_size ?
+			&mux->ring_entry[ring->entry_index] : NULL;
+}
+
+/* copy packages on sw ring range[begin, end) */
+static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
+						  struct amdgpu_ring *ring,
+						  u64 s_start, u64 s_end)
+{
+	u64 start, end;
+	struct amdgpu_ring *real_ring = mux->real_ring;
+
+	start = s_start & ring->buf_mask;
+	end = s_end & ring->buf_mask;
+
+	if (start == end) {
+		DRM_ERROR("no more data copied from sw ring\n");
+		return;
+	}
+	if (start > end) {
+		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
+		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
+					   (ring->ring_size >> 2) - start);
+		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
+	} else {
+		amdgpu_ring_alloc(real_ring, end - start);
+		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
+	}
+}
+
+void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
+{
+	struct amdgpu_mux_entry *e;
+
+	e = amdgpu_ring_mux_sw_entry(mux, ring);
+	if (!e) {
+		DRM_ERROR("cannot find entry for sw ring\n");
+		return;
+	}
+
+	spin_lock(&mux->lock);
+	e->sw_cptr = e->sw_wptr;
+	e->sw_wptr = wptr;
+	e->start_ptr_in_hw_ring = mux->real_ring->wptr;
+
+	amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
+	e->end_ptr_in_hw_ring = mux->real_ring->wptr;
+	amdgpu_ring_commit(mux->real_ring);
+
+	spin_unlock(&mux->lock);
+}
+
+u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	struct amdgpu_mux_entry *e;
+
+	e = amdgpu_ring_mux_sw_entry(mux, ring);
+	if (!e) {
+		DRM_ERROR("cannot find entry for sw ring\n");
+		return 0;
+	}
+
+	return e->sw_wptr;
+}
+
+/*
+ * The return value of the readptr is not precise while the other rings could
+ * write data onto the real ring buffer.After overwriting on the real ring, we
+ * can not decide if our packages have been excuted or not read yet. However,
+ * this function is only called by the tools such as umr to collect the latest
+ * packages for the hang analysis. We assume the hang happens near our latest
+ * submit. Thus we could use the following logic to give the clue:
+ * If the readptr is between start and end, then we return the copy pointer
+ * plus the distance from start to readptr. If the readptr is before start, we
+ * return the copy pointer. Lastly, if the readptr is past end, we return the
+ * write pointer.
+ */
+u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	struct amdgpu_mux_entry *e;
+	u64 readp, offset, start, end;
+
+	e = amdgpu_ring_mux_sw_entry(mux, ring);
+	if (!e) {
+		DRM_ERROR("no sw entry found!\n");
+		return 0;
+	}
+
+	readp = amdgpu_ring_get_rptr(mux->real_ring);
+
+	start = e->start_ptr_in_hw_ring & mux->real_ring->buf_mask;
+	end = e->end_ptr_in_hw_ring & mux->real_ring->buf_mask;
+	if (start > end) {
+		if (readp <= end)
+			readp += mux->real_ring->ring_size >> 2;
+		end += mux->real_ring->ring_size >> 2;
+	}
+
+	if (start <= readp && readp <= end) {
+		offset = readp - start;
+		e->sw_rptr = (e->sw_cptr + offset) & ring->buf_mask;
+	} else if (readp < start) {
+		e->sw_rptr = e->sw_cptr;
+	} else {
+		/* end < readptr */
+		e->sw_rptr = e->sw_wptr;
+	}
+
+	return e->sw_rptr;
+}
+
+u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
+
+	WARN_ON(!ring->is_sw_ring);
+	return amdgpu_ring_mux_get_rptr(mux, ring);
+}
+
+u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
+
+	WARN_ON(!ring->is_sw_ring);
+	return amdgpu_ring_mux_get_wptr(mux, ring);
+}
+
+void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
+
+	WARN_ON(!ring->is_sw_ring);
+	amdgpu_ring_mux_set_wptr(mux, ring, ring->wptr);
+}
+
+/* Override insert_nop to prevent emitting nops to the software rings */
+void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
+{
+	WARN_ON(!ring->is_sw_ring);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
new file mode 100644
index 000000000000..d91629589577
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __AMDGPU_RING_MUX__
+#define __AMDGPU_RING_MUX__
+
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include "amdgpu_ring.h"
+
+struct amdgpu_ring;
+/**
+ * struct amdgpu_mux_entry - the entry recording software rings copying information.
+ * @ring: the pointer to the software ring.
+ * @start_ptr_in_hw_ring: last start location copied to in the hardware ring.
+ * @end_ptr_in_hw_ring: last end location copied to in the hardware ring.
+ * @sw_cptr: the position of the copy pointer in the sw ring.
+ * @sw_rptr: the read pointer in software ring.
+ * @sw_wptr: the write pointer in software ring.
+ */
+struct amdgpu_mux_entry {
+	struct                  amdgpu_ring *ring;
+	u64                     start_ptr_in_hw_ring;
+	u64                     end_ptr_in_hw_ring;
+	u64                     sw_cptr;
+	u64                     sw_rptr;
+	u64                     sw_wptr;
+};
+
+struct amdgpu_ring_mux {
+	struct amdgpu_ring      *real_ring;
+
+	struct amdgpu_mux_entry *ring_entry;
+	unsigned int            num_ring_entries;
+	unsigned int            ring_entry_size;
+	/*the lock for copy data from different software rings*/
+	spinlock_t              lock;
+};
+
+int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
+			 unsigned int entry_size);
+void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux);
+int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
+void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr);
+u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
+u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
+
+u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring);
+u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring);
+void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring);
+
+void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
+void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring);
+void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring);
+
+#endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 2/5] drm/amdgpu: Add software ring callbacks for gfx9 (v8)
  2022-10-18  9:08 [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8) jiadong.zhu
@ 2022-10-18  9:08 ` jiadong.zhu
  2022-10-18  9:08 ` [PATCH 3/5] drm/amdgpu: Modify unmap_queue format for gfx9 (v4) jiadong.zhu
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 29+ messages in thread
From: jiadong.zhu @ 2022-10-18  9:08 UTC (permalink / raw)
  To: amd-gfx
  Cc: Andrey Grodzovsky, Michel Dänzer, Huang Rui, Luben Tuikov,
	Likun Gao, Jiadong.Zhu, Christian Koenig

From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>

Set ring functions with software ring callbacks on gfx9.

The software ring could be tested by debugfs_test_ib case.

v2: Set sw_ring 2 to enable software ring by default.
v3: Remove the parameter for software ring enablement.
v4: Use amdgpu_ring_init/fini for software rings.
v5: Update for code format. Fix conflict.
v6: Remove unnecessary checks and enable software ring on gfx9 by default.
v7: Use static array for software ring names and priorities.
v8: Stop creating software rings if no gfx ring existed.

Cc: Christian Koenig <Christian.Koenig@amd.com>
Cc: Luben Tuikov <Luben.Tuikov@amd.com>
Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
Cc: Michel Dänzer <michel@daenzer.net>
Cc: Likun Gao <Likun.Gao@amd.com>
Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
Acked-by: Luben Tuikov <luben.tuikov@amd.com>
Acked-by: Huang Rui <ray.huang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h      |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c |  20 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |   2 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c        | 113 ++++++++++++++++++-
 5 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 9996dadb39f7..4fdfc3ec134a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -348,6 +348,7 @@ struct amdgpu_gfx {
 
 	bool				is_poweron;
 
+	struct amdgpu_ring		sw_gfx_ring[AMDGPU_MAX_SW_GFX_RINGS];
 	struct amdgpu_ring_mux          muxer;
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 40b1277b4f0c..f08ee1ac281c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -39,6 +39,7 @@ struct amdgpu_vm;
 #define AMDGPU_MAX_RINGS		28
 #define AMDGPU_MAX_HWIP_RINGS		8
 #define AMDGPU_MAX_GFX_RINGS		2
+#define AMDGPU_MAX_SW_GFX_RINGS         2
 #define AMDGPU_MAX_COMPUTE_RINGS	8
 #define AMDGPU_MAX_VCE_RINGS		3
 #define AMDGPU_MAX_UVD_ENC_RINGS	2
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
index 43cab8a37754..2e64ffccc030 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
@@ -29,6 +29,14 @@
 
 #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
 
+static const struct ring_info {
+	unsigned int hw_pio;
+	const char *ring_name;
+} sw_ring_info[] = {
+	{ AMDGPU_RING_PRIO_DEFAULT, "gfx_low"},
+	{ AMDGPU_RING_PRIO_2, "gfx_high"},
+};
+
 int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
 			 unsigned int entry_size)
 {
@@ -215,3 +223,15 @@ void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
 	WARN_ON(!ring->is_sw_ring);
 }
+
+const char *amdgpu_sw_ring_name(int idx)
+{
+	return idx < ARRAY_SIZE(sw_ring_info) ?
+		sw_ring_info[idx].ring_name : NULL;
+}
+
+unsigned int amdgpu_sw_ring_priority(int idx)
+{
+	return idx < ARRAY_SIZE(sw_ring_info) ?
+		sw_ring_info[idx].hw_pio : AMDGPU_RING_PRIO_DEFAULT;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
index d91629589577..28399f4b0e5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
@@ -73,4 +73,6 @@ void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
 void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring);
 void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring);
 
+const char *amdgpu_sw_ring_name(int idx);
+unsigned int amdgpu_sw_ring_priority(int idx);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 6b609f33261f..8d4fbc9e3fc0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -47,6 +47,7 @@
 
 #include "amdgpu_ras.h"
 
+#include "amdgpu_ring_mux.h"
 #include "gfx_v9_4.h"
 #include "gfx_v9_0.h"
 #include "gfx_v9_4_2.h"
@@ -56,6 +57,7 @@
 #include "asic_reg/gc/gc_9_0_default.h"
 
 #define GFX9_NUM_GFX_RINGS     1
+#define GFX9_NUM_SW_GFX_RINGS  2
 #define GFX9_MEC_HPD_SIZE 4096
 #define RLCG_UCODE_LOADING_START_ADDRESS 0x00002000L
 #define RLC_SAVE_RESTORE_ADDR_STARTING_OFFSET 0x00000000L
@@ -2273,6 +2275,7 @@ static int gfx_v9_0_sw_init(void *handle)
 	struct amdgpu_ring *ring;
 	struct amdgpu_kiq *kiq;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+	unsigned int hw_prio;
 
 	switch (adev->ip_versions[GC_HWIP][0]) {
 	case IP_VERSION(9, 0, 1):
@@ -2356,6 +2359,9 @@ static int gfx_v9_0_sw_init(void *handle)
 			sprintf(ring->name, "gfx_%d", i);
 		ring->use_doorbell = true;
 		ring->doorbell_index = adev->doorbell_index.gfx_ring0 << 1;
+
+		/* disable scheduler on the real ring */
+		ring->no_scheduler = true;
 		r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq,
 				     AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP,
 				     AMDGPU_RING_PRIO_DEFAULT, NULL);
@@ -2363,6 +2369,41 @@ static int gfx_v9_0_sw_init(void *handle)
 			return r;
 	}
 
+	/* set up the software rings */
+	if (adev->gfx.num_gfx_rings) {
+		for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++) {
+			ring = &adev->gfx.sw_gfx_ring[i];
+			ring->ring_obj = NULL;
+			sprintf(ring->name, amdgpu_sw_ring_name(i));
+			ring->use_doorbell = true;
+			ring->doorbell_index = adev->doorbell_index.gfx_ring0 << 1;
+			ring->is_sw_ring = true;
+			hw_prio = amdgpu_sw_ring_priority(i);
+			r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq,
+					     AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP, hw_prio,
+					     NULL);
+			if (r)
+				return r;
+			ring->wptr = 0;
+		}
+
+		/* init the muxer and add software rings */
+		r = amdgpu_ring_mux_init(&adev->gfx.muxer, &adev->gfx.gfx_ring[0],
+					 GFX9_NUM_SW_GFX_RINGS);
+		if (r) {
+			DRM_ERROR("amdgpu_ring_mux_init failed(%d)\n", r);
+			return r;
+		}
+		for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++) {
+			r = amdgpu_ring_mux_add_sw_ring(&adev->gfx.muxer,
+							&adev->gfx.sw_gfx_ring[i]);
+			if (r) {
+				DRM_ERROR("amdgpu_ring_mux_add_sw_ring failed(%d)\n", r);
+				return r;
+			}
+		}
+	}
+
 	/* set up the compute queues - allocate horizontally across pipes */
 	ring_id = 0;
 	for (i = 0; i < adev->gfx.mec.num_mec; ++i) {
@@ -2413,6 +2454,12 @@ static int gfx_v9_0_sw_fini(void *handle)
 	int i;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	if (adev->gfx.num_gfx_rings) {
+		for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
+			amdgpu_ring_fini(&adev->gfx.sw_gfx_ring[i]);
+		amdgpu_ring_mux_fini(&adev->gfx.muxer);
+	}
+
 	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
 		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
 	for (i = 0; i < adev->gfx.num_compute_rings; i++)
@@ -5877,7 +5924,11 @@ static int gfx_v9_0_eop_irq(struct amdgpu_device *adev,
 
 	switch (me_id) {
 	case 0:
-		amdgpu_fence_process(&adev->gfx.gfx_ring[0]);
+		/* Fence signals are handled on the software rings*/
+		if (adev->gfx.num_gfx_rings) {
+			for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
+				amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
+		}
 		break;
 	case 1:
 	case 2:
@@ -6882,6 +6933,61 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
 };
 
+static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
+	.type = AMDGPU_RING_TYPE_GFX,
+	.align_mask = 0xff,
+	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
+	.support_64bit_ptrs = true,
+	.secure_submission_supported = true,
+	.vmhub = AMDGPU_GFXHUB_0,
+	.get_rptr = amdgpu_sw_ring_get_rptr_gfx,
+	.get_wptr = amdgpu_sw_ring_get_wptr_gfx,
+	.set_wptr = amdgpu_sw_ring_set_wptr_gfx,
+	.emit_frame_size = /* totally 242 maximum if 16 IBs */
+		5 +  /* COND_EXEC */
+		7 +  /* PIPELINE_SYNC */
+		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		2 + /* VM_FLUSH */
+		8 +  /* FENCE for VM_FLUSH */
+		20 + /* GDS switch */
+		4 + /* double SWITCH_BUFFER,
+		     * the first COND_EXEC jump to the place just
+		     * prior to this double SWITCH_BUFFER
+		     */
+		5 + /* COND_EXEC */
+		7 +	 /*	HDP_flush */
+		4 +	 /*	VGT_flush */
+		14 + /*	CE_META */
+		31 + /*	DE_META */
+		3 + /* CNTX_CTRL */
+		5 + /* HDP_INVL */
+		8 + 8 + /* FENCE x2 */
+		2 + /* SWITCH_BUFFER */
+		7, /* gfx_v9_0_emit_mem_sync */
+	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_gfx */
+	.emit_ib = gfx_v9_0_ring_emit_ib_gfx,
+	.emit_fence = gfx_v9_0_ring_emit_fence,
+	.emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
+	.emit_vm_flush = gfx_v9_0_ring_emit_vm_flush,
+	.emit_gds_switch = gfx_v9_0_ring_emit_gds_switch,
+	.emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
+	.test_ring = gfx_v9_0_ring_test_ring,
+	.test_ib = gfx_v9_0_ring_test_ib,
+	.insert_nop = amdgpu_sw_ring_insert_nop,
+	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_switch_buffer = gfx_v9_ring_emit_sb,
+	.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
+	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
+	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
+	.emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
+	.emit_wreg = gfx_v9_0_ring_emit_wreg,
+	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
+	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
+	.soft_recovery = gfx_v9_0_ring_soft_recovery,
+	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
+};
+
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.type = AMDGPU_RING_TYPE_COMPUTE,
 	.align_mask = 0xff,
@@ -6959,6 +7065,11 @@ static void gfx_v9_0_set_ring_funcs(struct amdgpu_device *adev)
 	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
 		adev->gfx.gfx_ring[i].funcs = &gfx_v9_0_ring_funcs_gfx;
 
+	if (adev->gfx.num_gfx_rings) {
+		for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
+			adev->gfx.sw_gfx_ring[i].funcs = &gfx_v9_0_sw_ring_funcs_gfx;
+	}
+
 	for (i = 0; i < adev->gfx.num_compute_rings; i++)
 		adev->gfx.compute_ring[i].funcs = &gfx_v9_0_ring_funcs_compute;
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 3/5] drm/amdgpu: Modify unmap_queue format for gfx9 (v4)
  2022-10-18  9:08 [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8) jiadong.zhu
  2022-10-18  9:08 ` [PATCH 2/5] drm/amdgpu: Add software ring callbacks for gfx9 (v8) jiadong.zhu
@ 2022-10-18  9:08 ` jiadong.zhu
  2022-11-22  5:46   ` Luben Tuikov
  2022-10-18  9:08 ` [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8) jiadong.zhu
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 29+ messages in thread
From: jiadong.zhu @ 2022-10-18  9:08 UTC (permalink / raw)
  To: amd-gfx; +Cc: Michel Dänzer, Jiadong.Zhu, Huang Rui, Christian König

From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>

1. Modify the unmap_queue package on gfx9. Add trailing fence to track the
   preemption done.
2. Modify emit_ce_meta emit_de_meta functions for the resumed ibs.

v2: Restyle code not to use ternary operator.
v3: Modify code format.
v4: Enable Mid-Command Buffer Preemption for gfx9 by default.

Cc: Christian Koenig <Christian.Koenig@amd.com>
Cc: Michel Dänzer <michel@daenzer.net>
Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Acked-by: Huang Rui <ray.huang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 181 +++++++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/soc15d.h      |   2 +
 3 files changed, 155 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index f08ee1ac281c..e90d327a589e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -60,6 +60,7 @@ enum amdgpu_ring_priority_level {
 #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
 #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
 #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
+#define AMDGPU_FENCE_FLAG_EXEC          (1 << 3)
 
 #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 8d4fbc9e3fc0..01ec0551d26a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -753,7 +753,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev);
 static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
 				struct amdgpu_cu_info *cu_info);
 static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);
-static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
+static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume);
 static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
 static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
 					  void *ras_error_status);
@@ -826,9 +826,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring *kiq_ring,
 			PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
 
 	if (action == PREEMPT_QUEUES_NO_UNMAP) {
-		amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
-		amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
-		amdgpu_ring_write(kiq_ring, seq);
+		amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & ring->buf_mask));
+		amdgpu_ring_write(kiq_ring, 0);
+		amdgpu_ring_write(kiq_ring, 0);
+
 	} else {
 		amdgpu_ring_write(kiq_ring, 0);
 		amdgpu_ring_write(kiq_ring, 0);
@@ -5369,11 +5370,17 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 
 	control |= ib->length_dw | (vmid << 24);
 
-	if (amdgpu_sriov_vf(ring->adev) && (ib->flags & AMDGPU_IB_FLAG_PREEMPT)) {
+	if (ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
 		control |= INDIRECT_BUFFER_PRE_ENB(1);
 
+		if (flags & AMDGPU_IB_PREEMPTED)
+			control |= INDIRECT_BUFFER_PRE_RESUME(1);
+
 		if (!(ib->flags & AMDGPU_IB_FLAG_CE) && vmid)
-			gfx_v9_0_ring_emit_de_meta(ring);
+			gfx_v9_0_ring_emit_de_meta(ring,
+						   (!amdgpu_sriov_vf(ring->adev) &&
+						   flags & AMDGPU_IB_PREEMPTED) ?
+						   true : false);
 	}
 
 	amdgpu_ring_write(ring, header);
@@ -5428,17 +5435,23 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
 	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
 	bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
 	bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
+	bool exec = flags & AMDGPU_FENCE_FLAG_EXEC;
+	uint32_t dw2 = 0;
 
 	/* RELEASE_MEM - flush caches, send int */
 	amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
-	amdgpu_ring_write(ring, ((writeback ? (EOP_TC_WB_ACTION_EN |
-					       EOP_TC_NC_ACTION_EN) :
-					      (EOP_TCL1_ACTION_EN |
-					       EOP_TC_ACTION_EN |
-					       EOP_TC_WB_ACTION_EN |
-					       EOP_TC_MD_ACTION_EN)) |
-				 EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
-				 EVENT_INDEX(5)));
+
+	if (writeback) {
+		dw2 = EOP_TC_WB_ACTION_EN | EOP_TC_NC_ACTION_EN;
+	} else {
+		dw2 = EOP_TCL1_ACTION_EN | EOP_TC_ACTION_EN |
+				EOP_TC_WB_ACTION_EN | EOP_TC_MD_ACTION_EN;
+	}
+	dw2 |= EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
+	if (exec)
+		dw2 |= EOP_EXEC;
+
+	amdgpu_ring_write(ring, dw2);
 	amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
 
 	/*
@@ -5543,33 +5556,135 @@ static void gfx_v9_ring_emit_sb(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, 0);
 }
 
-static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring)
+static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool resume)
 {
+	struct amdgpu_device *adev = ring->adev;
 	struct v9_ce_ib_state ce_payload = {0};
-	uint64_t csa_addr;
+	uint64_t offset, ce_payload_gpu_addr;
+	void *ce_payload_cpu_addr;
 	int cnt;
 
 	cnt = (sizeof(ce_payload) >> 2) + 4 - 2;
-	csa_addr = amdgpu_csa_vaddr(ring->adev);
+
+	if (ring->is_mes_queue) {
+		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
+				  gfx[0].gfx_meta_data) +
+			offsetof(struct v9_gfx_meta_data, ce_payload);
+		ce_payload_gpu_addr =
+			amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
+		ce_payload_cpu_addr =
+			amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
+	} else {
+		offset = offsetof(struct v9_gfx_meta_data, ce_payload);
+		ce_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
+		ce_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
+	}
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, cnt));
 	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(2) |
 				 WRITE_DATA_DST_SEL(8) |
 				 WR_CONFIRM) |
 				 WRITE_DATA_CACHE_POLICY(0));
-	amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
-	amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
-	amdgpu_ring_write_multiple(ring, (void *)&ce_payload, sizeof(ce_payload) >> 2);
+	amdgpu_ring_write(ring, lower_32_bits(ce_payload_gpu_addr));
+	amdgpu_ring_write(ring, upper_32_bits(ce_payload_gpu_addr));
+
+	if (resume)
+		amdgpu_ring_write_multiple(ring, ce_payload_cpu_addr,
+					   sizeof(ce_payload) >> 2);
+	else
+		amdgpu_ring_write_multiple(ring, (void *)&ce_payload,
+					   sizeof(ce_payload) >> 2);
+}
+
+static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
+{
+	int i, r = 0;
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
+	struct amdgpu_ring *kiq_ring = &kiq->ring;
+	unsigned long flags;
+
+	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
+		return -EINVAL;
+
+	spin_lock_irqsave(&kiq->ring_lock, flags);
+
+	if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
+		spin_unlock_irqrestore(&kiq->ring_lock, flags);
+		return -ENOMEM;
+	}
+
+	/* assert preemption condition */
+	amdgpu_ring_set_preempt_cond_exec(ring, false);
+
+	ring->trail_seq += 1;
+	amdgpu_ring_alloc(ring, 13);
+	gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
+				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
+	/*reset the CP_VMID_PREEMPT after trailing fence*/
+	amdgpu_ring_emit_wreg(ring,
+			      SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT),
+			      0x0);
+
+	/* assert IB preemption, emit the trailing fence */
+	kiq->pmf->kiq_unmap_queues(kiq_ring, ring, PREEMPT_QUEUES_NO_UNMAP,
+				   ring->trail_fence_gpu_addr,
+				   ring->trail_seq);
+
+	amdgpu_ring_commit(kiq_ring);
+	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+
+	/* poll the trailing fence */
+	for (i = 0; i < adev->usec_timeout; i++) {
+		if (ring->trail_seq ==
+			le32_to_cpu(*ring->trail_fence_cpu_addr))
+			break;
+		udelay(1);
+	}
+
+	if (i >= adev->usec_timeout) {
+		r = -EINVAL;
+		DRM_ERROR("ring %d failed to preempt ib\n", ring->idx);
+	}
+
+	amdgpu_ring_commit(ring);
+
+	/* deassert preemption condition */
+	amdgpu_ring_set_preempt_cond_exec(ring, true);
+	return r;
 }
 
-static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
+static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume)
 {
+	struct amdgpu_device *adev = ring->adev;
 	struct v9_de_ib_state de_payload = {0};
-	uint64_t csa_addr, gds_addr;
+	uint64_t offset, gds_addr, de_payload_gpu_addr;
+	void *de_payload_cpu_addr;
 	int cnt;
 
-	csa_addr = amdgpu_csa_vaddr(ring->adev);
-	gds_addr = csa_addr + 4096;
+	if (ring->is_mes_queue) {
+		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
+				  gfx[0].gfx_meta_data) +
+			offsetof(struct v9_gfx_meta_data, de_payload);
+		de_payload_gpu_addr =
+			amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
+		de_payload_cpu_addr =
+			amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
+
+		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
+				  gfx[0].gds_backup) +
+			offsetof(struct v9_gfx_meta_data, de_payload);
+		gds_addr = amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
+	} else {
+		offset = offsetof(struct v9_gfx_meta_data, de_payload);
+		de_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
+		de_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
+
+		gds_addr = ALIGN(amdgpu_csa_vaddr(ring->adev) +
+				 AMDGPU_CSA_SIZE - adev->gds.gds_size,
+				 PAGE_SIZE);
+	}
+
 	de_payload.gds_backup_addrlo = lower_32_bits(gds_addr);
 	de_payload.gds_backup_addrhi = upper_32_bits(gds_addr);
 
@@ -5579,9 +5694,15 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
 				 WRITE_DATA_DST_SEL(8) |
 				 WR_CONFIRM) |
 				 WRITE_DATA_CACHE_POLICY(0));
-	amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
-	amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
-	amdgpu_ring_write_multiple(ring, (void *)&de_payload, sizeof(de_payload) >> 2);
+	amdgpu_ring_write(ring, lower_32_bits(de_payload_gpu_addr));
+	amdgpu_ring_write(ring, upper_32_bits(de_payload_gpu_addr));
+
+	if (resume)
+		amdgpu_ring_write_multiple(ring, de_payload_cpu_addr,
+					   sizeof(de_payload) >> 2);
+	else
+		amdgpu_ring_write_multiple(ring, (void *)&de_payload,
+					   sizeof(de_payload) >> 2);
 }
 
 static void gfx_v9_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
@@ -5597,8 +5718,9 @@ static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
 {
 	uint32_t dw2 = 0;
 
-	if (amdgpu_sriov_vf(ring->adev))
-		gfx_v9_0_ring_emit_ce_meta(ring);
+	gfx_v9_0_ring_emit_ce_meta(ring,
+				   (!amdgpu_sriov_vf(ring->adev) &&
+				   flags & AMDGPU_IB_PREEMPTED) ? true : false);
 
 	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
 	if (flags & AMDGPU_HAVE_CTX_SWITCH) {
@@ -6925,6 +7047,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
 	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
 	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
+	.preempt_ib = gfx_v9_0_ring_preempt_ib,
 	.emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
index 799925d22fc8..2357ff39323f 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
@@ -162,6 +162,7 @@
 		 * 2 - Bypass
 		 */
 #define     INDIRECT_BUFFER_PRE_ENB(x)		 ((x) << 21)
+#define     INDIRECT_BUFFER_PRE_RESUME(x)               ((x) << 30)
 #define	PACKET3_COPY_DATA				0x40
 #define	PACKET3_PFP_SYNC_ME				0x42
 #define	PACKET3_COND_WRITE				0x45
@@ -184,6 +185,7 @@
 #define		EOP_TC_ACTION_EN                        (1 << 17) /* L2 */
 #define		EOP_TC_NC_ACTION_EN			(1 << 19)
 #define		EOP_TC_MD_ACTION_EN			(1 << 21) /* L2 metadata */
+#define		EOP_EXEC				(1 << 28) /* For Trailing Fence */
 
 #define		DATA_SEL(x)                             ((x) << 29)
 		/* 0 - discard
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-10-18  9:08 [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8) jiadong.zhu
  2022-10-18  9:08 ` [PATCH 2/5] drm/amdgpu: Add software ring callbacks for gfx9 (v8) jiadong.zhu
  2022-10-18  9:08 ` [PATCH 3/5] drm/amdgpu: Modify unmap_queue format for gfx9 (v4) jiadong.zhu
@ 2022-10-18  9:08 ` jiadong.zhu
  2022-10-31 12:01   ` Michel Dänzer
  2022-11-10 19:27   ` Luben Tuikov
  2022-10-18  9:08 ` [PATCH 5/5] drm/amdgpu: Improve the software ring priority scheduler jiadong.zhu
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 29+ messages in thread
From: jiadong.zhu @ 2022-10-18  9:08 UTC (permalink / raw)
  To: amd-gfx
  Cc: Andrey Grodzovsky, Michel Dänzer, Huang Rui, Luben Tuikov,
	Jiadong.Zhu, Christian Koenig

From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>

Trigger Mid-Command Buffer Preemption according to the priority of the software
rings and the hw fence signalling condition.

The muxer saves the locations of the indirect buffer frames from the software
ring together with the fence sequence number in its fifo queue, and pops out
those records when the fences are signalled. The locations are used to resubmit
packages in preemption scenarios by coping the chunks from the software ring.

v2: Update comment style.
v3: Fix conflict caused by previous modifications.
v4: Remove unnecessary prints.
v5: Fix corner cases for resubmission cases.
v6: Refactor functions for resubmission, calling fence_process in irq handler.
v7: Solve conflict for removing amdgpu_sw_ring.c.
v8: Add time threshold to judge if preemption request is needed.

Cc: Christian Koenig <Christian.Koenig@amd.com>
Cc: Luben Tuikov <Luben.Tuikov@amd.com>
Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
Cc: Michel Dänzer <michel@daenzer.net>
Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
Acked-by: Luben Tuikov <luben.tuikov@amd.com>
Acked-by: Huang Rui <ray.huang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c    |  53 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c       |   2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c     |  12 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   6 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 353 +++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  29 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c       |   2 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c        |   7 +-
 8 files changed, 420 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 790f7bfdc654..470448bc1ebb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -54,6 +54,7 @@ struct amdgpu_fence {
 
 	/* RB, DMA, etc. */
 	struct amdgpu_ring		*ring;
+	ktime_t				start_timestamp;
 };
 
 static struct kmem_cache *amdgpu_fence_slab;
@@ -199,6 +200,7 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
 	rcu_assign_pointer(*ptr, dma_fence_get(fence));
 
 	*f = fence;
+	to_amdgpu_fence(fence)->start_timestamp = ktime_get();
 
 	return 0;
 }
@@ -400,6 +402,57 @@ unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring)
 	return lower_32_bits(emitted);
 }
 
+/**
+ * amdgpu_fence_last_unsignaled_time_us - the time fence emited till now
+ * @ring: ring the fence is associated with
+ *
+ * Find the earlist fence unsignaled till now, calculate the time delta
+ * between the time fence emitted and now.
+ */
+u64 amdgpu_fence_last_unsignaled_time_us(struct amdgpu_ring *ring)
+{
+	struct amdgpu_fence_driver *drv = &ring->fence_drv;
+	struct dma_fence *fence;
+	uint32_t last_seq, sync_seq;
+
+	last_seq = atomic_read(&ring->fence_drv.last_seq);
+	sync_seq = READ_ONCE(ring->fence_drv.sync_seq);
+	if (last_seq == sync_seq)
+		return 0;
+
+	++last_seq;
+	last_seq &= drv->num_fences_mask;
+	fence = drv->fences[last_seq];
+	if (!fence)
+		return 0;
+
+	return ktime_us_delta(ktime_get(),
+		to_amdgpu_fence(fence)->start_timestamp);
+}
+
+/**
+ * amdgpu_fence_update_start_timestamp - update the timestamp of the fence
+ * @ring: ring the fence is associated with
+ * @seq: the fence seq number to update.
+ * @timestamp: the start timestamp to update.
+ *
+ * The function called at the time the fence and related ib is about to
+ * resubmit to gpu in MCBP scenario. Thus we do not consider race condition
+ * with amdgpu_fence_process to modify the same fence.
+ */
+void amdgpu_fence_update_start_timestamp(struct amdgpu_ring *ring, uint32_t seq, ktime_t timestamp)
+{
+	struct amdgpu_fence_driver *drv = &ring->fence_drv;
+	struct dma_fence *fence;
+
+	seq &= drv->num_fences_mask;
+	fence = drv->fences[seq];
+	if (!fence)
+		return;
+
+	to_amdgpu_fence(fence)->start_timestamp = timestamp;
+}
+
 /**
  * amdgpu_fence_driver_start_ring - make the fence driver
  * ready for use on the requested ring.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 258cffe3c06a..af86d87e2f3b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -211,6 +211,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		}
 	}
 
+	amdgpu_ring_ib_begin(ring);
 	if (job && ring->funcs->init_cond_exec)
 		patch_offset = amdgpu_ring_init_cond_exec(ring);
 
@@ -285,6 +286,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 	    ring->hw_prio == AMDGPU_GFX_PIPE_PRIO_HIGH)
 		ring->funcs->emit_wave_limit(ring, false);
 
+	amdgpu_ring_ib_end(ring);
 	amdgpu_ring_commit(ring);
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 13db99d653bd..1f15f9242e99 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -569,3 +569,15 @@ int amdgpu_ring_init_mqd(struct amdgpu_ring *ring)
 
 	return mqd_mgr->init_mqd(adev, ring->mqd_ptr, &prop);
 }
+
+void amdgpu_ring_ib_begin(struct amdgpu_ring *ring)
+{
+	if (ring->is_sw_ring)
+		amdgpu_sw_ring_ib_begin(ring);
+}
+
+void amdgpu_ring_ib_end(struct amdgpu_ring *ring)
+{
+	if (ring->is_sw_ring)
+		amdgpu_sw_ring_ib_end(ring);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index e90d327a589e..8f82fd0d47c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -145,6 +145,9 @@ signed long amdgpu_fence_wait_polling(struct amdgpu_ring *ring,
 				      uint32_t wait_seq,
 				      signed long timeout);
 unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring);
+u64 amdgpu_fence_last_unsignaled_time_us(struct amdgpu_ring *ring);
+void amdgpu_fence_update_start_timestamp(struct amdgpu_ring *ring, uint32_t seq,
+					 ktime_t timestamp);
 
 /*
  * Rings.
@@ -312,6 +315,9 @@ struct amdgpu_ring {
 #define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r)
 
 int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
+void amdgpu_ring_ib_begin(struct amdgpu_ring *ring);
+void amdgpu_ring_ib_end(struct amdgpu_ring *ring);
+
 void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
 void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
 void amdgpu_ring_commit(struct amdgpu_ring *ring);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
index 2e64ffccc030..41b057b9358e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
@@ -28,6 +28,7 @@
 #include "amdgpu.h"
 
 #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
+#define AMDGPU_MAX_LAST_UNSIGNALED_THRESHOLD_US 10000
 
 static const struct ring_info {
 	unsigned int hw_pio;
@@ -37,23 +38,145 @@ static const struct ring_info {
 	{ AMDGPU_RING_PRIO_2, "gfx_high"},
 };
 
+static struct kmem_cache *amdgpu_mux_chunk_slab;
+
+static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
+								struct amdgpu_ring *ring)
+{
+	return ring->entry_index < mux->ring_entry_size ?
+			&mux->ring_entry[ring->entry_index] : NULL;
+}
+
+/* copy packages on sw ring range[begin, end) */
+static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
+						  struct amdgpu_ring *ring,
+						  u64 s_start, u64 s_end)
+{
+	u64 start, end;
+	struct amdgpu_ring *real_ring = mux->real_ring;
+
+	start = s_start & ring->buf_mask;
+	end = s_end & ring->buf_mask;
+
+	if (start == end) {
+		DRM_ERROR("no more data copied from sw ring\n");
+		return;
+	}
+	if (start > end) {
+		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
+		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
+					   (ring->ring_size >> 2) - start);
+		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
+	} else {
+		amdgpu_ring_alloc(real_ring, end - start);
+		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
+	}
+}
+
+static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux)
+{
+	struct amdgpu_mux_entry *e = NULL;
+	struct amdgpu_mux_chunk *chunk;
+	uint32_t seq, last_seq;
+	int i;
+
+	/*find low priority entries:*/
+	if (!mux->s_resubmit)
+		return;
+
+	for (i = 0; i < mux->num_ring_entries; i++) {
+		if (mux->ring_entry[i].ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
+			e = &mux->ring_entry[i];
+			break;
+		}
+	}
+
+	if (!e) {
+		DRM_ERROR("%s no low priority ring found\n", __func__);
+		return;
+	}
+
+	last_seq = atomic_read(&e->ring->fence_drv.last_seq);
+	seq = mux->seqno_to_resubmit;
+	if (last_seq < seq) {
+		/*resubmit all the fences between (last_seq, seq]*/
+		list_for_each_entry(chunk, &e->list, entry) {
+			if (chunk->sync_seq > last_seq && chunk->sync_seq <= seq) {
+				amdgpu_fence_update_start_timestamp(e->ring,
+								    chunk->sync_seq,
+								    ktime_get());
+				amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, e->ring,
+								      chunk->start,
+								      chunk->end);
+				mux->wptr_resubmit = chunk->end;
+				amdgpu_ring_commit(mux->real_ring);
+			}
+		}
+	}
+
+	del_timer(&mux->resubmit_timer);
+	mux->s_resubmit = false;
+}
+
+static void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux)
+{
+	mod_timer(&mux->resubmit_timer, jiffies + AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT);
+}
+
+static void amdgpu_mux_resubmit_fallback(struct timer_list *t)
+{
+	struct amdgpu_ring_mux *mux = from_timer(mux, t, resubmit_timer);
+
+	if (!spin_trylock(&mux->lock)) {
+		amdgpu_ring_mux_schedule_resubmit(mux);
+		DRM_ERROR("reschedule resubmit\n");
+		return;
+	}
+	amdgpu_mux_resubmit_chunks(mux);
+	spin_unlock(&mux->lock);
+}
+
 int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
 			 unsigned int entry_size)
 {
 	mux->real_ring = ring;
 	mux->num_ring_entries = 0;
+
 	mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
 	if (!mux->ring_entry)
 		return -ENOMEM;
 
 	mux->ring_entry_size = entry_size;
+	mux->s_resubmit = false;
+
+	amdgpu_mux_chunk_slab = kmem_cache_create("amdgpu_mux_chunk",
+						  sizeof(struct amdgpu_mux_chunk), 0,
+						  SLAB_HWCACHE_ALIGN, NULL);
+	if (!amdgpu_mux_chunk_slab) {
+		DRM_ERROR("create amdgpu_mux_chunk cache failed\n");
+		return -ENOMEM;
+	}
+
 	spin_lock_init(&mux->lock);
+	timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback, 0);
 
 	return 0;
 }
 
 void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
 {
+	struct amdgpu_mux_entry *e;
+	struct amdgpu_mux_chunk *chunk, *chunk2;
+	int i;
+
+	for (i = 0; i < mux->num_ring_entries; i++) {
+		e = &mux->ring_entry[i];
+		list_for_each_entry_safe(chunk, chunk2, &e->list, entry) {
+			list_del(&chunk->entry);
+			kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
+		}
+	}
+	kmem_cache_destroy(amdgpu_mux_chunk_slab);
 	kfree(mux->ring_entry);
 	mux->ring_entry = NULL;
 	mux->num_ring_entries = 0;
@@ -73,62 +196,48 @@ int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring
 	ring->entry_index = mux->num_ring_entries;
 	e->ring = ring;
 
+	INIT_LIST_HEAD(&e->list);
 	mux->num_ring_entries += 1;
 	return 0;
 }
 
-static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
-								struct amdgpu_ring *ring)
-{
-	return ring->entry_index < mux->ring_entry_size ?
-			&mux->ring_entry[ring->entry_index] : NULL;
-}
-
-/* copy packages on sw ring range[begin, end) */
-static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
-						  struct amdgpu_ring *ring,
-						  u64 s_start, u64 s_end)
-{
-	u64 start, end;
-	struct amdgpu_ring *real_ring = mux->real_ring;
-
-	start = s_start & ring->buf_mask;
-	end = s_end & ring->buf_mask;
-
-	if (start == end) {
-		DRM_ERROR("no more data copied from sw ring\n");
-		return;
-	}
-	if (start > end) {
-		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
-		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
-					   (ring->ring_size >> 2) - start);
-		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
-	} else {
-		amdgpu_ring_alloc(real_ring, end - start);
-		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
-	}
-}
-
 void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
 {
 	struct amdgpu_mux_entry *e;
 
+	spin_lock(&mux->lock);
+
+	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
+		amdgpu_mux_resubmit_chunks(mux);
+
 	e = amdgpu_ring_mux_sw_entry(mux, ring);
 	if (!e) {
 		DRM_ERROR("cannot find entry for sw ring\n");
+		spin_unlock(&mux->lock);
+		return;
+	}
+
+	/* We could skip this set wptr as preemption in process. */
+	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && mux->pending_trailing_fence_signaled) {
+		spin_unlock(&mux->lock);
 		return;
 	}
 
-	spin_lock(&mux->lock);
 	e->sw_cptr = e->sw_wptr;
+	/* Update cptr if the package already copied in resubmit functions */
+	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && e->sw_cptr < mux->wptr_resubmit)
+		e->sw_cptr = mux->wptr_resubmit;
 	e->sw_wptr = wptr;
 	e->start_ptr_in_hw_ring = mux->real_ring->wptr;
 
-	amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
-	e->end_ptr_in_hw_ring = mux->real_ring->wptr;
-	amdgpu_ring_commit(mux->real_ring);
-
+	/* Skip copying for the packages already resubmitted.*/
+	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT || mux->wptr_resubmit < wptr) {
+		amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
+		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
+		amdgpu_ring_commit(mux->real_ring);
+	} else {
+		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
+	}
 	spin_unlock(&mux->lock);
 }
 
@@ -145,7 +254,7 @@ u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ri
 	return e->sw_wptr;
 }
 
-/*
+/**
  * The return value of the readptr is not precise while the other rings could
  * write data onto the real ring buffer.After overwriting on the real ring, we
  * can not decide if our packages have been excuted or not read yet. However,
@@ -235,3 +344,169 @@ unsigned int amdgpu_sw_ring_priority(int idx)
 	return idx < ARRAY_SIZE(sw_ring_info) ?
 		sw_ring_info[idx].hw_pio : AMDGPU_RING_PRIO_DEFAULT;
 }
+
+/*Scan on low prio rings to have unsignaled fence and high ring has no fence.*/
+int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux)
+{
+	struct amdgpu_ring *ring;
+	int i, need_preempt;
+
+	need_preempt = 0;
+	for (i = 0; i < mux->num_ring_entries; i++) {
+		ring = mux->ring_entry[i].ring;
+		if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT &&
+		    amdgpu_fence_count_emitted(ring) > 0)
+			return 0;
+		if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT &&
+		    amdgpu_fence_last_unsignaled_time_us(ring) >
+		    AMDGPU_MAX_LAST_UNSIGNALED_THRESHOLD_US)
+			need_preempt = 1;
+	}
+	return need_preempt && !mux->s_resubmit;
+}
+
+/* Trigger Mid-Command Buffer Preemption (MCBP) and find if we need to resubmit. */
+int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux)
+{
+	int r;
+
+	spin_lock(&mux->lock);
+	mux->pending_trailing_fence_signaled = true;
+	r = amdgpu_ring_preempt_ib(mux->real_ring);
+	spin_unlock(&mux->lock);
+	return r;
+}
+
+void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
+
+	WARN_ON(!ring->is_sw_ring);
+	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
+		if (amdgpu_mcbp_scan(mux) > 0)
+			amdgpu_mcbp_trigger_preempt(mux);
+		return;
+	}
+
+	amdgpu_ring_mux_start_ib(mux, ring);
+}
+
+void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
+
+	WARN_ON(!ring->is_sw_ring);
+	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
+		return;
+	amdgpu_ring_mux_end_ib(mux, ring);
+}
+
+void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	struct amdgpu_mux_entry *e;
+	struct amdgpu_mux_chunk *chunk;
+
+	spin_lock(&mux->lock);
+	amdgpu_mux_resubmit_chunks(mux);
+	spin_unlock(&mux->lock);
+
+	e = amdgpu_ring_mux_sw_entry(mux, ring);
+	if (!e) {
+		DRM_ERROR("cannot find entry!\n");
+		return;
+	}
+
+	chunk = kmem_cache_alloc(amdgpu_mux_chunk_slab, GFP_KERNEL);
+	if (!chunk) {
+		DRM_ERROR("alloc amdgpu_mux_chunk_slab failed\n");
+		return;
+	}
+
+	chunk->start = ring->wptr;
+	list_add_tail(&chunk->entry, &e->list);
+}
+
+static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	uint32_t last_seq, size = 0;
+	struct amdgpu_mux_entry *e;
+	struct amdgpu_mux_chunk *chunk, *tmp;
+
+	e = amdgpu_ring_mux_sw_entry(mux, ring);
+	if (!e) {
+		DRM_ERROR("cannot find entry!\n");
+		return;
+	}
+
+	last_seq = atomic_read(&ring->fence_drv.last_seq);
+
+	list_for_each_entry_safe(chunk, tmp, &e->list, entry) {
+		if (chunk->sync_seq <= last_seq) {
+			list_del(&chunk->entry);
+			kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
+		} else {
+			size++;
+		}
+	}
+}
+
+void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	struct amdgpu_mux_entry *e;
+	struct amdgpu_mux_chunk *chunk;
+
+	e = amdgpu_ring_mux_sw_entry(mux, ring);
+	if (!e) {
+		DRM_ERROR("cannot find entry!\n");
+		return;
+	}
+
+	chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);
+	if (!chunk) {
+		DRM_ERROR("cannot find chunk!\n");
+		return;
+	}
+
+	chunk->end = ring->wptr;
+	chunk->sync_seq = READ_ONCE(ring->fence_drv.sync_seq);
+
+	scan_and_remove_signaled_chunk(mux, ring);
+}
+
+bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux)
+{
+	struct amdgpu_mux_entry *e;
+	struct amdgpu_ring *ring = NULL;
+	int i;
+
+	if (!mux->pending_trailing_fence_signaled)
+		return false;
+
+	if (mux->real_ring->trail_seq != le32_to_cpu(*mux->real_ring->trail_fence_cpu_addr))
+		return false;
+
+	for (i = 0; i < mux->num_ring_entries; i++) {
+		e = &mux->ring_entry[i];
+		if (e->ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
+			ring = e->ring;
+			break;
+		}
+	}
+
+	if (!ring) {
+		DRM_ERROR("cannot find low priority ring\n");
+		return false;
+	}
+
+	amdgpu_fence_process(ring);
+	if (amdgpu_fence_count_emitted(ring) > 0) {
+		mux->s_resubmit = true;
+		mux->seqno_to_resubmit = ring->fence_drv.sync_seq;
+		amdgpu_ring_mux_schedule_resubmit(mux);
+	}
+
+	mux->pending_trailing_fence_signaled = false;
+	return true;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
index 28399f4b0e5c..aa758ebc86ae 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
@@ -29,6 +29,7 @@
 #include "amdgpu_ring.h"
 
 struct amdgpu_ring;
+
 /**
  * struct amdgpu_mux_entry - the entry recording software rings copying information.
  * @ring: the pointer to the software ring.
@@ -37,6 +38,7 @@ struct amdgpu_ring;
  * @sw_cptr: the position of the copy pointer in the sw ring.
  * @sw_rptr: the read pointer in software ring.
  * @sw_wptr: the write pointer in software ring.
+ * @list: list head for amdgpu_mux_chunk
  */
 struct amdgpu_mux_entry {
 	struct                  amdgpu_ring *ring;
@@ -45,6 +47,7 @@ struct amdgpu_mux_entry {
 	u64                     sw_cptr;
 	u64                     sw_rptr;
 	u64                     sw_wptr;
+	struct list_head        list;
 };
 
 struct amdgpu_ring_mux {
@@ -55,6 +58,26 @@ struct amdgpu_ring_mux {
 	unsigned int            ring_entry_size;
 	/*the lock for copy data from different software rings*/
 	spinlock_t              lock;
+	bool                    s_resubmit;
+	uint32_t                seqno_to_resubmit;
+	u64                     wptr_resubmit;
+	struct timer_list       resubmit_timer;
+
+	bool                    pending_trailing_fence_signaled;
+};
+
+/**
+ * struct amdgpu_mux_chunk - save the location of indirect buffer's package on softare rings.
+ * @entry: the list entry.
+ * @sync_seq: the fence seqno related with the saved IB.
+ * @start:- start location on the software ring.
+ * @end:- end location on the software ring.
+ */
+struct amdgpu_mux_chunk {
+	struct list_head        entry;
+	uint32_t                sync_seq;
+	u64                     start;
+	u64                     end;
 };
 
 int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
@@ -64,15 +87,17 @@ int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring
 void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr);
 u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
 u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
+void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
+void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
+bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux);
 
 u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring);
 u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring);
 void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring);
-
 void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
 void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring);
 void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring);
-
 const char *amdgpu_sw_ring_name(int idx);
 unsigned int amdgpu_sw_ring_priority(int idx);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 9596c22fded6..b7e94553f4fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -601,6 +601,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
 		return 0;
 
+	amdgpu_ring_ib_begin(ring);
 	if (ring->funcs->init_cond_exec)
 		patch_offset = amdgpu_ring_init_cond_exec(ring);
 
@@ -661,6 +662,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 		amdgpu_ring_emit_switch_buffer(ring);
 		amdgpu_ring_emit_switch_buffer(ring);
 	}
+	amdgpu_ring_ib_end(ring);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 01ec0551d26a..42bda7766e11 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -5620,7 +5620,7 @@ static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
 	ring->trail_seq += 1;
 	amdgpu_ring_alloc(ring, 13);
 	gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
-				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
+				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC | AMDGPU_FENCE_FLAG_INT);
 	/*reset the CP_VMID_PREEMPT after trailing fence*/
 	amdgpu_ring_emit_wreg(ring,
 			      SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT),
@@ -6046,8 +6046,9 @@ static int gfx_v9_0_eop_irq(struct amdgpu_device *adev,
 
 	switch (me_id) {
 	case 0:
-		/* Fence signals are handled on the software rings*/
-		if (adev->gfx.num_gfx_rings) {
+		if (adev->gfx.num_gfx_rings &&
+		    !amdgpu_mcbp_handle_trailing_fence_irq(&adev->gfx.muxer)) {
+			/* Fence signals are handled on the software rings*/
 			for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
 				amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
 		}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 5/5] drm/amdgpu: Improve the software ring priority scheduler
  2022-10-18  9:08 [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8) jiadong.zhu
                   ` (2 preceding siblings ...)
  2022-10-18  9:08 ` [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8) jiadong.zhu
@ 2022-10-18  9:08 ` jiadong.zhu
  2022-10-18 11:24   ` Christian König
  2022-10-19 15:14 ` [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8) Luben Tuikov
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 29+ messages in thread
From: jiadong.zhu @ 2022-10-18  9:08 UTC (permalink / raw)
  To: amd-gfx; +Cc: Michel Dänzer, Jiadong Zhu, Christian Koenig

From: Jiadong Zhu <Jiadong.Zhu@amd.com>

Using the drm scheduler, the software rings' scheduling threads with different
priorities have the same opportunity to get the spinlock and copy its packages
into the real ring. Though preemption may happen for the low priority ring, it
will catch up with the high priority ring by copying more data (the resubmit
package and the current ibs) on the next calling of set_wptr. As a result, the
priority is not quite effective.

Here are some details to improve the priority of software rings at the bottom
of drm scheduler by slowing down the low priority thread with following
strategy.
1. If all the high priority fences are signaled which indicates gpu is idle
   while there are low priority packages to submit, no delay happens.
2. When there are unsignaled fences on high priority rings, we account for the
   portion of the ibs sent from the low priority ring. If the portion exceeds
   a certain threshold(eg, 30%), a timeout wait happens on low priority
   threads till more high priority ibs submitted.
3. The mechanism is started when the first time mcbp triggered, ended when all
   the high priority fences are signaled.

Cc: Christian Koenig <Christian.Koenig@amd.com>
Cc: Michel Dänzer <michel@daenzer.net>
Signed-off-by: Jiadong Zhu <Jiadong.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 93 ++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  3 +
 2 files changed, 90 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
index 41b057b9358e..eac89094f1d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
@@ -27,7 +27,13 @@
 #include "amdgpu_ring.h"
 #include "amdgpu.h"
 
+/* The jiffies fallback resubmission happens */
 #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
+
+/* Maximum waiting jiffies on low priority ring thread */
+#define AMDGPU_MUX_DELAY_JIFFIES_TIMEOUT (HZ / 10)
+
+/* The time threshold of unsignaled fence that trigger mcbp */
 #define AMDGPU_MAX_LAST_UNSIGNALED_THRESHOLD_US 10000
 
 static const struct ring_info {
@@ -47,6 +53,69 @@ static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ri
 			&mux->ring_entry[ring->entry_index] : NULL;
 }
 
+static uint32_t ring_priority_to_ratio_pct(unsigned int hw_prio)
+{
+	uint32_t ratio;
+
+	switch (hw_prio) {
+	case AMDGPU_RING_PRIO_DEFAULT:
+		ratio = 30;
+		break;
+	case AMDGPU_RING_PRIO_2:
+		ratio = 100;
+		break;
+	default:
+		ratio = 100;
+	}
+	return ratio;
+}
+
+static void reset_wcnt_on_all_rings(struct amdgpu_ring_mux *mux)
+{
+	int i;
+
+	for (i = 0; i < mux->num_ring_entries; i++)
+		mux->ring_entry[i].w_cnt = 0;
+}
+
+/**
+ * Decide if the low priority ring task should be delayed when there are high
+ * priority ibs ongoing. If all the high priority fences are signaled at that
+ * time, gpu is idle, we do not need to wait. Otherwise we calculate the
+ * percentage of portions copying ibs on the current ring and compare with the
+ * threshold according to the priority.
+ */
+static bool proceed_on_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	struct amdgpu_ring *high_prio_ring;
+	u64 current_cnt, total_cnt = 0;
+	int i;
+
+	for (i = 0; i < mux->num_ring_entries; i++) {
+		if (mux->ring_entry[i].ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
+			high_prio_ring = mux->ring_entry[i].ring;
+			break;
+		}
+	}
+
+	/*All high priority fences signaled indicates gpu is idle.*/
+	if (amdgpu_fence_count_emitted(high_prio_ring) == 0) {
+		reset_wcnt_on_all_rings(mux);
+		return true;
+	}
+
+	for (i = 0; i < mux->num_ring_entries; i++) {
+		if (mux->ring_entry[i].ring->hw_prio == ring->hw_prio)
+			current_cnt = mux->ring_entry[i].w_cnt;
+		total_cnt += mux->ring_entry[i].w_cnt;
+	}
+
+	if (total_cnt == 0)
+		return true;
+
+	return ring_priority_to_ratio_pct(ring->hw_prio) > current_cnt * 100 / total_cnt;
+}
+
 /* copy packages on sw ring range[begin, end) */
 static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
 						  struct amdgpu_ring *ring,
@@ -73,6 +142,13 @@ static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
 	}
 }
 
+/* delay low priotiry task depending on high priority rings fence signal condition*/
+static void wait_on_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	wait_event_interruptible_timeout(mux->wait, proceed_on_ring(mux, ring),
+					 AMDGPU_MUX_DELAY_JIFFIES_TIMEOUT);
+}
+
 static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux)
 {
 	struct amdgpu_mux_entry *e = NULL;
@@ -126,7 +202,6 @@ static void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux)
 static void amdgpu_mux_resubmit_fallback(struct timer_list *t)
 {
 	struct amdgpu_ring_mux *mux = from_timer(mux, t, resubmit_timer);
-
 	if (!spin_trylock(&mux->lock)) {
 		amdgpu_ring_mux_schedule_resubmit(mux);
 		DRM_ERROR("reschedule resubmit\n");
@@ -158,6 +233,7 @@ int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
 	}
 
 	spin_lock_init(&mux->lock);
+	init_waitqueue_head(&mux->wait);
 	timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback, 0);
 
 	return 0;
@@ -205,8 +281,10 @@ void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
 {
 	struct amdgpu_mux_entry *e;
 
-	spin_lock(&mux->lock);
+	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && !proceed_on_ring(mux, ring))
+		wait_on_ring(mux, ring);
 
+	spin_lock(&mux->lock);
 	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
 		amdgpu_mux_resubmit_chunks(mux);
 
@@ -238,7 +316,12 @@ void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
 	} else {
 		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
 	}
+
+	e->w_cnt++;
 	spin_unlock(&mux->lock);
+
+	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
+		wake_up_interruptible(&mux->wait);
 }
 
 u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
@@ -373,7 +456,9 @@ int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux)
 	spin_lock(&mux->lock);
 	mux->pending_trailing_fence_signaled = true;
 	r = amdgpu_ring_preempt_ib(mux->real_ring);
+	reset_wcnt_on_all_rings(mux);
 	spin_unlock(&mux->lock);
+
 	return r;
 }
 
@@ -408,10 +493,6 @@ void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
 	struct amdgpu_mux_entry *e;
 	struct amdgpu_mux_chunk *chunk;
 
-	spin_lock(&mux->lock);
-	amdgpu_mux_resubmit_chunks(mux);
-	spin_unlock(&mux->lock);
-
 	e = amdgpu_ring_mux_sw_entry(mux, ring);
 	if (!e) {
 		DRM_ERROR("cannot find entry!\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
index aa758ebc86ae..a99647e33c9e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
@@ -39,6 +39,7 @@ struct amdgpu_ring;
  * @sw_rptr: the read pointer in software ring.
  * @sw_wptr: the write pointer in software ring.
  * @list: list head for amdgpu_mux_chunk
+ * @w_cnt: the write count of the current ring.
  */
 struct amdgpu_mux_entry {
 	struct                  amdgpu_ring *ring;
@@ -48,6 +49,7 @@ struct amdgpu_mux_entry {
 	u64                     sw_rptr;
 	u64                     sw_wptr;
 	struct list_head        list;
+	u64                     w_cnt;
 };
 
 struct amdgpu_ring_mux {
@@ -64,6 +66,7 @@ struct amdgpu_ring_mux {
 	struct timer_list       resubmit_timer;
 
 	bool                    pending_trailing_fence_signaled;
+	wait_queue_head_t       wait;
 };
 
 /**
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH 5/5] drm/amdgpu: Improve the software ring priority scheduler
  2022-10-18  9:08 ` [PATCH 5/5] drm/amdgpu: Improve the software ring priority scheduler jiadong.zhu
@ 2022-10-18 11:24   ` Christian König
  0 siblings, 0 replies; 29+ messages in thread
From: Christian König @ 2022-10-18 11:24 UTC (permalink / raw)
  To: jiadong.zhu, amd-gfx; +Cc: Michel Dänzer

Am 18.10.22 um 11:08 schrieb jiadong.zhu@amd.com:
> From: Jiadong Zhu <Jiadong.Zhu@amd.com>
>
> Using the drm scheduler, the software rings' scheduling threads with different
> priorities have the same opportunity to get the spinlock and copy its packages
> into the real ring. Though preemption may happen for the low priority ring, it
> will catch up with the high priority ring by copying more data (the resubmit
> package and the current ibs) on the next calling of set_wptr. As a result, the
> priority is not quite effective.
>
> Here are some details to improve the priority of software rings at the bottom
> of drm scheduler by slowing down the low priority thread with following
> strategy.
> 1. If all the high priority fences are signaled which indicates gpu is idle
>     while there are low priority packages to submit, no delay happens.
> 2. When there are unsignaled fences on high priority rings, we account for the
>     portion of the ibs sent from the low priority ring. If the portion exceeds
>     a certain threshold(eg, 30%), a timeout wait happens on low priority
>     threads till more high priority ibs submitted.
> 3. The mechanism is started when the first time mcbp triggered, ended when all
>     the high priority fences are signaled.

This is a really big NAK for this approach since it will break GPU reset 
and can lead to deadlocks. You can't sleep in any of the callbacks 
waiting for the hardware to do something.

What we could do instead is to insert a dependency for the low priority 
ring. E.g. in amdgpu_job_dependency() return the latest high priority 
fence whenever a low priority job is about to be scheduled.

Regards,
Christian.

>
> Cc: Christian Koenig <Christian.Koenig@amd.com>
> Cc: Michel Dänzer <michel@daenzer.net>
> Signed-off-by: Jiadong Zhu <Jiadong.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 93 ++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  3 +
>   2 files changed, 90 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> index 41b057b9358e..eac89094f1d1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> @@ -27,7 +27,13 @@
>   #include "amdgpu_ring.h"
>   #include "amdgpu.h"
>   
> +/* The jiffies fallback resubmission happens */
>   #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
> +
> +/* Maximum waiting jiffies on low priority ring thread */
> +#define AMDGPU_MUX_DELAY_JIFFIES_TIMEOUT (HZ / 10)
> +
> +/* The time threshold of unsignaled fence that trigger mcbp */
>   #define AMDGPU_MAX_LAST_UNSIGNALED_THRESHOLD_US 10000
>   
>   static const struct ring_info {
> @@ -47,6 +53,69 @@ static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ri
>   			&mux->ring_entry[ring->entry_index] : NULL;
>   }
>   
> +static uint32_t ring_priority_to_ratio_pct(unsigned int hw_prio)
> +{
> +	uint32_t ratio;
> +
> +	switch (hw_prio) {
> +	case AMDGPU_RING_PRIO_DEFAULT:
> +		ratio = 30;
> +		break;
> +	case AMDGPU_RING_PRIO_2:
> +		ratio = 100;
> +		break;
> +	default:
> +		ratio = 100;
> +	}
> +	return ratio;
> +}
> +
> +static void reset_wcnt_on_all_rings(struct amdgpu_ring_mux *mux)
> +{
> +	int i;
> +
> +	for (i = 0; i < mux->num_ring_entries; i++)
> +		mux->ring_entry[i].w_cnt = 0;
> +}
> +
> +/**
> + * Decide if the low priority ring task should be delayed when there are high
> + * priority ibs ongoing. If all the high priority fences are signaled at that
> + * time, gpu is idle, we do not need to wait. Otherwise we calculate the
> + * percentage of portions copying ibs on the current ring and compare with the
> + * threshold according to the priority.
> + */
> +static bool proceed_on_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_ring *high_prio_ring;
> +	u64 current_cnt, total_cnt = 0;
> +	int i;
> +
> +	for (i = 0; i < mux->num_ring_entries; i++) {
> +		if (mux->ring_entry[i].ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
> +			high_prio_ring = mux->ring_entry[i].ring;
> +			break;
> +		}
> +	}
> +
> +	/*All high priority fences signaled indicates gpu is idle.*/
> +	if (amdgpu_fence_count_emitted(high_prio_ring) == 0) {
> +		reset_wcnt_on_all_rings(mux);
> +		return true;
> +	}
> +
> +	for (i = 0; i < mux->num_ring_entries; i++) {
> +		if (mux->ring_entry[i].ring->hw_prio == ring->hw_prio)
> +			current_cnt = mux->ring_entry[i].w_cnt;
> +		total_cnt += mux->ring_entry[i].w_cnt;
> +	}
> +
> +	if (total_cnt == 0)
> +		return true;
> +
> +	return ring_priority_to_ratio_pct(ring->hw_prio) > current_cnt * 100 / total_cnt;
> +}
> +
>   /* copy packages on sw ring range[begin, end) */
>   static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
>   						  struct amdgpu_ring *ring,
> @@ -73,6 +142,13 @@ static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
>   	}
>   }
>   
> +/* delay low priotiry task depending on high priority rings fence signal condition*/
> +static void wait_on_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	wait_event_interruptible_timeout(mux->wait, proceed_on_ring(mux, ring),
> +					 AMDGPU_MUX_DELAY_JIFFIES_TIMEOUT);
> +}
> +
>   static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux)
>   {
>   	struct amdgpu_mux_entry *e = NULL;
> @@ -126,7 +202,6 @@ static void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux)
>   static void amdgpu_mux_resubmit_fallback(struct timer_list *t)
>   {
>   	struct amdgpu_ring_mux *mux = from_timer(mux, t, resubmit_timer);
> -
>   	if (!spin_trylock(&mux->lock)) {
>   		amdgpu_ring_mux_schedule_resubmit(mux);
>   		DRM_ERROR("reschedule resubmit\n");
> @@ -158,6 +233,7 @@ int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
>   	}
>   
>   	spin_lock_init(&mux->lock);
> +	init_waitqueue_head(&mux->wait);
>   	timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback, 0);
>   
>   	return 0;
> @@ -205,8 +281,10 @@ void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
>   {
>   	struct amdgpu_mux_entry *e;
>   
> -	spin_lock(&mux->lock);
> +	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && !proceed_on_ring(mux, ring))
> +		wait_on_ring(mux, ring);
>   
> +	spin_lock(&mux->lock);
>   	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
>   		amdgpu_mux_resubmit_chunks(mux);
>   
> @@ -238,7 +316,12 @@ void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
>   	} else {
>   		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
>   	}
> +
> +	e->w_cnt++;
>   	spin_unlock(&mux->lock);
> +
> +	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
> +		wake_up_interruptible(&mux->wait);
>   }
>   
>   u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> @@ -373,7 +456,9 @@ int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux)
>   	spin_lock(&mux->lock);
>   	mux->pending_trailing_fence_signaled = true;
>   	r = amdgpu_ring_preempt_ib(mux->real_ring);
> +	reset_wcnt_on_all_rings(mux);
>   	spin_unlock(&mux->lock);
> +
>   	return r;
>   }
>   
> @@ -408,10 +493,6 @@ void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
>   	struct amdgpu_mux_entry *e;
>   	struct amdgpu_mux_chunk *chunk;
>   
> -	spin_lock(&mux->lock);
> -	amdgpu_mux_resubmit_chunks(mux);
> -	spin_unlock(&mux->lock);
> -
>   	e = amdgpu_ring_mux_sw_entry(mux, ring);
>   	if (!e) {
>   		DRM_ERROR("cannot find entry!\n");
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> index aa758ebc86ae..a99647e33c9e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> @@ -39,6 +39,7 @@ struct amdgpu_ring;
>    * @sw_rptr: the read pointer in software ring.
>    * @sw_wptr: the write pointer in software ring.
>    * @list: list head for amdgpu_mux_chunk
> + * @w_cnt: the write count of the current ring.
>    */
>   struct amdgpu_mux_entry {
>   	struct                  amdgpu_ring *ring;
> @@ -48,6 +49,7 @@ struct amdgpu_mux_entry {
>   	u64                     sw_rptr;
>   	u64                     sw_wptr;
>   	struct list_head        list;
> +	u64                     w_cnt;
>   };
>   
>   struct amdgpu_ring_mux {
> @@ -64,6 +66,7 @@ struct amdgpu_ring_mux {
>   	struct timer_list       resubmit_timer;
>   
>   	bool                    pending_trailing_fence_signaled;
> +	wait_queue_head_t       wait;
>   };
>   
>   /**


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8)
  2022-10-18  9:08 [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8) jiadong.zhu
                   ` (3 preceding siblings ...)
  2022-10-18  9:08 ` [PATCH 5/5] drm/amdgpu: Improve the software ring priority scheduler jiadong.zhu
@ 2022-10-19 15:14 ` Luben Tuikov
  2022-10-20 14:49 ` Michel Dänzer
  2022-11-22  5:33 ` Luben Tuikov
  6 siblings, 0 replies; 29+ messages in thread
From: Luben Tuikov @ 2022-10-19 15:14 UTC (permalink / raw)
  To: jiadong.zhu, amd-gfx
  Cc: Andrey Grodzovsky, Michel Dänzer, Huang Rui, Christian Koenig

Inlined:

On 2022-10-18 05:08, jiadong.zhu@amd.com wrote:
> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
> 
> The software ring is created to support priority context while there is only
> one hardware queue for gfx.
> 
> Every software ring has its fence driver and could be used as an ordinary ring
> for the GPU scheduler.
> Multiple software rings are bound to a real ring with the ring muxer. The
> packages committed on the software ring are copied to the real ring.
> 
> v2: Use array to store software ring entry.
> v3: Remove unnecessary prints.
> v4: Remove amdgpu_ring_sw_init/fini functions,
> using gtt for sw ring buffer for later dma copy
> optimization.
> v5: Allocate ring entry dynamically in the muxer.
> v6: Update comments for the ring muxer.
> v7: Modify for function naming.
> v8: Combine software ring functions into amdgpu_ring_mux.c
> 
> Cc: Christian Koenig <Christian.Koenig@amd.com>
> Cc: Luben Tuikov <Luben.Tuikov@amd.com>
> Cc: Andrey Grodzovsky  <Andrey.Grodzovsky@amd.com>
> Cc: Michel Dänzer <michel@daenzer.net>
> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
> Acked-by: Huang Rui <ray.huang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/Makefile          |   3 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h      |   3 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   4 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 217 +++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  76 +++++++
>  5 files changed, 302 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 3e0e2eb7e235..add7da53950c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -58,7 +58,8 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
>  	amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
>  	amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
>  	amdgpu_fw_attestation.o amdgpu_securedisplay.o \
> -	amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o
> +	amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
> +	amdgpu_ring_mux.o
>  
>  amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 53526ffb2ce1..9996dadb39f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -33,6 +33,7 @@
>  #include "amdgpu_imu.h"
>  #include "soc15.h"
>  #include "amdgpu_ras.h"
> +#include "amdgpu_ring_mux.h"
>  
>  /* GFX current status */
>  #define AMDGPU_GFX_NORMAL_MODE			0x00000000L
> @@ -346,6 +347,8 @@ struct amdgpu_gfx {
>  	struct amdgpu_gfx_ras		*ras;
>  
>  	bool				is_poweron;
> +
> +	struct amdgpu_ring_mux          muxer;
>  };
>  
>  #define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs->get_gpu_clock_counter((adev))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 7d89a52091c0..40b1277b4f0c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -278,6 +278,10 @@ struct amdgpu_ring {
>  	bool			is_mes_queue;
>  	uint32_t		hw_queue_id;
>  	struct amdgpu_mes_ctx_data *mes_ctx;
> +
> +	bool            is_sw_ring;
> +	unsigned int    entry_index;
> +
>  };
>  
>  #define amdgpu_ring_parse_cs(r, p, job, ib) ((r)->funcs->parse_cs((p), (job), (ib)))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> new file mode 100644
> index 000000000000..43cab8a37754
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> @@ -0,0 +1,217 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +#include <linux/slab.h>
> +#include <drm/drm_print.h>
> +
> +#include "amdgpu_ring_mux.h"
> +#include "amdgpu_ring.h"
> +#include "amdgpu.h"
> +
> +#define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
> +
> +int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
> +			 unsigned int entry_size)
> +{
> +	mux->real_ring = ring;
> +	mux->num_ring_entries = 0;
> +	mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
> +	if (!mux->ring_entry)
> +		return -ENOMEM;
> +
> +	mux->ring_entry_size = entry_size;
> +	spin_lock_init(&mux->lock);
> +
> +	return 0;
> +}
> +
> +void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
> +{
> +	kfree(mux->ring_entry);
> +	mux->ring_entry = NULL;
> +	mux->num_ring_entries = 0;
> +	mux->ring_entry_size = 0;
> +}
> +
> +int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_mux_entry *e;
> +
> +	if (mux->num_ring_entries >= mux->ring_entry_size) {
> +		DRM_ERROR("add sw ring exceeding max entry size\n");
> +		return -ENOENT;
> +	}
> +
> +	e = &mux->ring_entry[mux->num_ring_entries];
> +	ring->entry_index = mux->num_ring_entries;
> +	e->ring = ring;
> +
> +	mux->num_ring_entries += 1;

Why is this all the way down here right next to the "return 0;"?
Move it up, so that it is clearly visible and in context,
right after "ring->entry_index = mux->num_ring_entries;".

IOW, the "adding" of an entry should be clearly visible and in context.

So it should look something like this:
	...
	e = &mux->ring_entry[mux->num_ring_entries];
	ring->entry_index = mux->num_ring_entries;
	mux->num_ring_entries += 1;
	e->ring = ring;

	return 0;
}

With that changed, this patch is Acked-by: Luben Tuikov <luben.tuikov@amd.com>.

Regards.
Luben

> +	return 0;
> +}
> +
> +static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
> +								struct amdgpu_ring *ring)
> +{
> +	return ring->entry_index < mux->ring_entry_size ?
> +			&mux->ring_entry[ring->entry_index] : NULL;
> +}
> +
> +/* copy packages on sw ring range[begin, end) */
> +static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
> +						  struct amdgpu_ring *ring,
> +						  u64 s_start, u64 s_end)
> +{
> +	u64 start, end;
> +	struct amdgpu_ring *real_ring = mux->real_ring;
> +
> +	start = s_start & ring->buf_mask;
> +	end = s_end & ring->buf_mask;
> +
> +	if (start == end) {
> +		DRM_ERROR("no more data copied from sw ring\n");
> +		return;
> +	}
> +	if (start > end) {
> +		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
> +					   (ring->ring_size >> 2) - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
> +	} else {
> +		amdgpu_ring_alloc(real_ring, end - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
> +	}
> +}
> +
> +void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
> +{
> +	struct amdgpu_mux_entry *e;
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("cannot find entry for sw ring\n");
> +		return;
> +	}
> +
> +	spin_lock(&mux->lock);
> +	e->sw_cptr = e->sw_wptr;
> +	e->sw_wptr = wptr;
> +	e->start_ptr_in_hw_ring = mux->real_ring->wptr;
> +
> +	amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
> +	e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> +	amdgpu_ring_commit(mux->real_ring);
> +
> +	spin_unlock(&mux->lock);
> +}
> +
> +u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_mux_entry *e;
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("cannot find entry for sw ring\n");
> +		return 0;
> +	}
> +
> +	return e->sw_wptr;
> +}
> +
> +/*
> + * The return value of the readptr is not precise while the other rings could
> + * write data onto the real ring buffer.After overwriting on the real ring, we
> + * can not decide if our packages have been excuted or not read yet. However,
> + * this function is only called by the tools such as umr to collect the latest
> + * packages for the hang analysis. We assume the hang happens near our latest
> + * submit. Thus we could use the following logic to give the clue:
> + * If the readptr is between start and end, then we return the copy pointer
> + * plus the distance from start to readptr. If the readptr is before start, we
> + * return the copy pointer. Lastly, if the readptr is past end, we return the
> + * write pointer.
> + */
> +u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_mux_entry *e;
> +	u64 readp, offset, start, end;
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("no sw entry found!\n");
> +		return 0;
> +	}
> +
> +	readp = amdgpu_ring_get_rptr(mux->real_ring);
> +
> +	start = e->start_ptr_in_hw_ring & mux->real_ring->buf_mask;
> +	end = e->end_ptr_in_hw_ring & mux->real_ring->buf_mask;
> +	if (start > end) {
> +		if (readp <= end)
> +			readp += mux->real_ring->ring_size >> 2;
> +		end += mux->real_ring->ring_size >> 2;
> +	}
> +
> +	if (start <= readp && readp <= end) {
> +		offset = readp - start;
> +		e->sw_rptr = (e->sw_cptr + offset) & ring->buf_mask;
> +	} else if (readp < start) {
> +		e->sw_rptr = e->sw_cptr;
> +	} else {
> +		/* end < readptr */
> +		e->sw_rptr = e->sw_wptr;
> +	}
> +
> +	return e->sw_rptr;
> +}
> +
> +u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +	WARN_ON(!ring->is_sw_ring);
> +	return amdgpu_ring_mux_get_rptr(mux, ring);
> +}
> +
> +u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +	WARN_ON(!ring->is_sw_ring);
> +	return amdgpu_ring_mux_get_wptr(mux, ring);
> +}
> +
> +void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +	WARN_ON(!ring->is_sw_ring);
> +	amdgpu_ring_mux_set_wptr(mux, ring, ring->wptr);
> +}
> +
> +/* Override insert_nop to prevent emitting nops to the software rings */
> +void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
> +{
> +	WARN_ON(!ring->is_sw_ring);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> new file mode 100644
> index 000000000000..d91629589577
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> @@ -0,0 +1,76 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __AMDGPU_RING_MUX__
> +#define __AMDGPU_RING_MUX__
> +
> +#include <linux/timer.h>
> +#include <linux/spinlock.h>
> +#include "amdgpu_ring.h"
> +
> +struct amdgpu_ring;
> +/**
> + * struct amdgpu_mux_entry - the entry recording software rings copying information.
> + * @ring: the pointer to the software ring.
> + * @start_ptr_in_hw_ring: last start location copied to in the hardware ring.
> + * @end_ptr_in_hw_ring: last end location copied to in the hardware ring.
> + * @sw_cptr: the position of the copy pointer in the sw ring.
> + * @sw_rptr: the read pointer in software ring.
> + * @sw_wptr: the write pointer in software ring.
> + */
> +struct amdgpu_mux_entry {
> +	struct                  amdgpu_ring *ring;
> +	u64                     start_ptr_in_hw_ring;
> +	u64                     end_ptr_in_hw_ring;
> +	u64                     sw_cptr;
> +	u64                     sw_rptr;
> +	u64                     sw_wptr;
> +};
> +
> +struct amdgpu_ring_mux {
> +	struct amdgpu_ring      *real_ring;
> +
> +	struct amdgpu_mux_entry *ring_entry;
> +	unsigned int            num_ring_entries;
> +	unsigned int            ring_entry_size;
> +	/*the lock for copy data from different software rings*/
> +	spinlock_t              lock;
> +};
> +
> +int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
> +			 unsigned int entry_size);
> +void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux);
> +int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr);
> +u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +
> +u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring);
> +u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring);
> +void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring);
> +
> +void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
> +void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring);
> +void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring);
> +
> +#endif

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8)
  2022-10-18  9:08 [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8) jiadong.zhu
                   ` (4 preceding siblings ...)
  2022-10-19 15:14 ` [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8) Luben Tuikov
@ 2022-10-20 14:49 ` Michel Dänzer
  2022-10-20 14:59   ` Christian König
  2022-11-22  5:33 ` Luben Tuikov
  6 siblings, 1 reply; 29+ messages in thread
From: Michel Dänzer @ 2022-10-20 14:49 UTC (permalink / raw)
  To: jiadong.zhu, amd-gfx
  Cc: Andrey Grodzovsky, Huang Rui, Christian Koenig, Luben Tuikov

On 2022-10-18 11:08, jiadong.zhu@amd.com wrote:
> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
> 
> The software ring is created to support priority context while there is only
> one hardware queue for gfx.
> 
> Every software ring has its fence driver and could be used as an ordinary ring
> for the GPU scheduler.
> Multiple software rings are bound to a real ring with the ring muxer. The
> packages committed on the software ring are copied to the real ring.
> 
> v2: Use array to store software ring entry.
> v3: Remove unnecessary prints.
> v4: Remove amdgpu_ring_sw_init/fini functions,
> using gtt for sw ring buffer for later dma copy
> optimization.
> v5: Allocate ring entry dynamically in the muxer.
> v6: Update comments for the ring muxer.
> v7: Modify for function naming.
> v8: Combine software ring functions into amdgpu_ring_mux.c

I tested patches 1-4 of this series (since Christian clearly nacked patch 5). I hit the oops below.

amdgpu_sw_ring_ib_begin+0x70/0x80 is in amdgpu_mcbp_trigger_preempt according to scripts/faddr2line, specifically line 376:

	spin_unlock(&mux->lock);

though I'm not sure why that would crash.


Are you not able to reproduce this with a GNOME Wayland session?


BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor instruction fetch in kernel mode
#PF: error_code(0x0010) - not-present page
PGD 0 P4D 0
Oops: 0010 [#1] PREEMPT SMP NOPTI
CPU: 7 PID: 281 Comm: gfx_high Tainted: G            E      6.0.2+ #1
Hardware name: LENOVO 20NF0000GE/20NF0000GE, BIOS R11ET36W (1.16 ) 03/30/2020
RIP: 0010:0x0
Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
RSP: 0018:ffffbd594073bdc8 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff993c4a620000 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff993c4a62a350
RBP: ffff993c4a62d530 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000114
R13: ffff993c4a620000 R14: 0000000000000000 R15: ffff993c4a62d128
FS:  0000000000000000(0000) GS:ffff993ef0bc0000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffffffffd6 CR3: 00000001959fc000 CR4: 00000000003506e0
Call Trace:
 <TASK>
 amdgpu_sw_ring_ib_begin+0x70/0x80 [amdgpu]
 amdgpu_ib_schedule+0x15f/0x5d0 [amdgpu]
 amdgpu_job_run+0x102/0x1c0 [amdgpu]
 drm_sched_main+0x19a/0x440 [gpu_sched]
 ? dequeue_task_stop+0x70/0x70
 ? drm_sched_resubmit_jobs+0x10/0x10 [gpu_sched]
 kthread+0xe9/0x110
 ? kthread_complete_and_exit+0x20/0x20
 ret_from_fork+0x22/0x30
 </TASK>
[...]
note: gfx_high[281] exited with preempt_count 1
[...]
[drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring gfx_low timeout, signaled seq=14864, emitted seq=14865
[drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process firefox.dpkg-di pid 3540 thread firefox:cs0 pid 4666
amdgpu 0000:05:00.0: amdgpu: GPU reset begin!


-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8)
  2022-10-20 14:49 ` Michel Dänzer
@ 2022-10-20 14:59   ` Christian König
  2022-10-21  7:42     ` Michel Dänzer
  2022-10-31  8:10     ` Zhu, Jiadong
  0 siblings, 2 replies; 29+ messages in thread
From: Christian König @ 2022-10-20 14:59 UTC (permalink / raw)
  To: Michel Dänzer, jiadong.zhu, amd-gfx
  Cc: Andrey Grodzovsky, Huang Rui, Luben Tuikov

Am 20.10.22 um 16:49 schrieb Michel Dänzer:
> On 2022-10-18 11:08, jiadong.zhu@amd.com wrote:
>> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>>
>> The software ring is created to support priority context while there is only
>> one hardware queue for gfx.
>>
>> Every software ring has its fence driver and could be used as an ordinary ring
>> for the GPU scheduler.
>> Multiple software rings are bound to a real ring with the ring muxer. The
>> packages committed on the software ring are copied to the real ring.
>>
>> v2: Use array to store software ring entry.
>> v3: Remove unnecessary prints.
>> v4: Remove amdgpu_ring_sw_init/fini functions,
>> using gtt for sw ring buffer for later dma copy
>> optimization.
>> v5: Allocate ring entry dynamically in the muxer.
>> v6: Update comments for the ring muxer.
>> v7: Modify for function naming.
>> v8: Combine software ring functions into amdgpu_ring_mux.c
> I tested patches 1-4 of this series (since Christian clearly nacked patch 5). I hit the oops below.

As long as you don't try to reset the GPU you can also test patch 5. 
It's just that we can't upstream the stuff like this or that would break 
immediately.

Regards,
Christian.

>
> amdgpu_sw_ring_ib_begin+0x70/0x80 is in amdgpu_mcbp_trigger_preempt according to scripts/faddr2line, specifically line 376:
>
> 	spin_unlock(&mux->lock);
>
> though I'm not sure why that would crash.
>
>
> Are you not able to reproduce this with a GNOME Wayland session?
>
>
> BUG: kernel NULL pointer dereference, address: 0000000000000000
> #PF: supervisor instruction fetch in kernel mode
> #PF: error_code(0x0010) - not-present page
> PGD 0 P4D 0
> Oops: 0010 [#1] PREEMPT SMP NOPTI
> CPU: 7 PID: 281 Comm: gfx_high Tainted: G            E      6.0.2+ #1
> Hardware name: LENOVO 20NF0000GE/20NF0000GE, BIOS R11ET36W (1.16 ) 03/30/2020
> RIP: 0010:0x0
> Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
> RSP: 0018:ffffbd594073bdc8 EFLAGS: 00010246
> RAX: 0000000000000000 RBX: ffff993c4a620000 RCX: 0000000000000000
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff993c4a62a350
> RBP: ffff993c4a62d530 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000114
> R13: ffff993c4a620000 R14: 0000000000000000 R15: ffff993c4a62d128
> FS:  0000000000000000(0000) GS:ffff993ef0bc0000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: ffffffffffffffd6 CR3: 00000001959fc000 CR4: 00000000003506e0
> Call Trace:
>   <TASK>
>   amdgpu_sw_ring_ib_begin+0x70/0x80 [amdgpu]
>   amdgpu_ib_schedule+0x15f/0x5d0 [amdgpu]
>   amdgpu_job_run+0x102/0x1c0 [amdgpu]
>   drm_sched_main+0x19a/0x440 [gpu_sched]
>   ? dequeue_task_stop+0x70/0x70
>   ? drm_sched_resubmit_jobs+0x10/0x10 [gpu_sched]
>   kthread+0xe9/0x110
>   ? kthread_complete_and_exit+0x20/0x20
>   ret_from_fork+0x22/0x30
>   </TASK>
> [...]
> note: gfx_high[281] exited with preempt_count 1
> [...]
> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring gfx_low timeout, signaled seq=14864, emitted seq=14865
> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process firefox.dpkg-di pid 3540 thread firefox:cs0 pid 4666
> amdgpu 0000:05:00.0: amdgpu: GPU reset begin!
>
>


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8)
  2022-10-20 14:59   ` Christian König
@ 2022-10-21  7:42     ` Michel Dänzer
  2022-10-31  8:10     ` Zhu, Jiadong
  1 sibling, 0 replies; 29+ messages in thread
From: Michel Dänzer @ 2022-10-21  7:42 UTC (permalink / raw)
  To: Christian König, jiadong.zhu
  Cc: Andrey Grodzovsky, Huang Rui, amd-gfx, Luben Tuikov

On 2022-10-20 16:59, Christian König wrote:
> Am 20.10.22 um 16:49 schrieb Michel Dänzer:
>> On 2022-10-18 11:08, jiadong.zhu@amd.com wrote:
>>> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>>>
>>> The software ring is created to support priority context while there is only
>>> one hardware queue for gfx.
>>>
>>> Every software ring has its fence driver and could be used as an ordinary ring
>>> for the GPU scheduler.
>>> Multiple software rings are bound to a real ring with the ring muxer. The
>>> packages committed on the software ring are copied to the real ring.
>>>
>>> v2: Use array to store software ring entry.
>>> v3: Remove unnecessary prints.
>>> v4: Remove amdgpu_ring_sw_init/fini functions,
>>> using gtt for sw ring buffer for later dma copy
>>> optimization.
>>> v5: Allocate ring entry dynamically in the muxer.
>>> v6: Update comments for the ring muxer.
>>> v7: Modify for function naming.
>>> v8: Combine software ring functions into amdgpu_ring_mux.c
>> I tested patches 1-4 of this series (since Christian clearly nacked patch 5). I hit the oops below.
> 
> As long as you don't try to reset the GPU you can also test patch 5.

Sure, I can test it once there's a fix for the oops.


-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8)
  2022-10-20 14:59   ` Christian König
  2022-10-21  7:42     ` Michel Dänzer
@ 2022-10-31  8:10     ` Zhu, Jiadong
  2022-10-31 11:58       ` Michel Dänzer
  1 sibling, 1 reply; 29+ messages in thread
From: Zhu, Jiadong @ 2022-10-31  8:10 UTC (permalink / raw)
  To: Koenig, Christian, Michel Dänzer, amd-gfx
  Cc: Tuikov, Luben, Grodzovsky, Andrey, Huang, Ray

[AMD Official Use Only - General]

Hi Michel,

Sorry for the late response. It is more likely the null pointer is raised from function amdgpu_ring_preempt_ib as preempt_ib is not assigned.

Could you have a check preempt_ib  is set in gfx_v9_0.c from the patch "drm/amdgpu: Modify unmap_queue format for gfx9 (v4)".

"
...
.preempt_ib = gfx_v9_0_ring_preempt_ib,
"

Btw, Which branch of kmd are you cherry-pick? Maybe my code base is too old.
I tried using wayland on ubuntu 20.04 and not reproduced the crash.

Thanks,
Jiadong

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com>
Sent: Thursday, October 20, 2022 10:59 PM
To: Michel Dänzer <michel@daenzer.net>; Zhu, Jiadong <Jiadong.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
Subject: Re: [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8)

Am 20.10.22 um 16:49 schrieb Michel Dänzer:
> On 2022-10-18 11:08, jiadong.zhu@amd.com wrote:
>> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>>
>> The software ring is created to support priority context while there
>> is only one hardware queue for gfx.
>>
>> Every software ring has its fence driver and could be used as an
>> ordinary ring for the GPU scheduler.
>> Multiple software rings are bound to a real ring with the ring muxer.
>> The packages committed on the software ring are copied to the real ring.
>>
>> v2: Use array to store software ring entry.
>> v3: Remove unnecessary prints.
>> v4: Remove amdgpu_ring_sw_init/fini functions, using gtt for sw ring
>> buffer for later dma copy optimization.
>> v5: Allocate ring entry dynamically in the muxer.
>> v6: Update comments for the ring muxer.
>> v7: Modify for function naming.
>> v8: Combine software ring functions into amdgpu_ring_mux.c
> I tested patches 1-4 of this series (since Christian clearly nacked patch 5). I hit the oops below.

As long as you don't try to reset the GPU you can also test patch 5.
It's just that we can't upstream the stuff like this or that would break immediately.

Regards,
Christian.

>
> amdgpu_sw_ring_ib_begin+0x70/0x80 is in amdgpu_mcbp_trigger_preempt according to scripts/faddr2line, specifically line 376:
>
>       spin_unlock(&mux->lock);
>
> though I'm not sure why that would crash.
>
>
> Are you not able to reproduce this with a GNOME Wayland session?
>
>
> BUG: kernel NULL pointer dereference, address: 0000000000000000
> #PF: supervisor instruction fetch in kernel mode
> #PF: error_code(0x0010) - not-present page PGD 0 P4D 0
> Oops: 0010 [#1] PREEMPT SMP NOPTI
> CPU: 7 PID: 281 Comm: gfx_high Tainted: G            E      6.0.2+ #1
> Hardware name: LENOVO 20NF0000GE/20NF0000GE, BIOS R11ET36W (1.16 )
> 03/30/2020
> RIP: 0010:0x0
> Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
> RSP: 0018:ffffbd594073bdc8 EFLAGS: 00010246
> RAX: 0000000000000000 RBX: ffff993c4a620000 RCX: 0000000000000000
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff993c4a62a350
> RBP: ffff993c4a62d530 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000114
> R13: ffff993c4a620000 R14: 0000000000000000 R15: ffff993c4a62d128
> FS:  0000000000000000(0000) GS:ffff993ef0bc0000(0000)
> knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: ffffffffffffffd6 CR3: 00000001959fc000 CR4: 00000000003506e0 Call
> Trace:
>   <TASK>
>   amdgpu_sw_ring_ib_begin+0x70/0x80 [amdgpu]
>   amdgpu_ib_schedule+0x15f/0x5d0 [amdgpu]
>   amdgpu_job_run+0x102/0x1c0 [amdgpu]
>   drm_sched_main+0x19a/0x440 [gpu_sched]
>   ? dequeue_task_stop+0x70/0x70
>   ? drm_sched_resubmit_jobs+0x10/0x10 [gpu_sched]
>   kthread+0xe9/0x110
>   ? kthread_complete_and_exit+0x20/0x20
>   ret_from_fork+0x22/0x30
>   </TASK>
> [...]
> note: gfx_high[281] exited with preempt_count 1 [...]
> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring gfx_low timeout,
> signaled seq=14864, emitted seq=14865 [drm:amdgpu_job_timedout
> [amdgpu]] *ERROR* Process information: process firefox.dpkg-di pid 3540 thread firefox:cs0 pid 4666 amdgpu 0000:05:00.0: amdgpu: GPU reset begin!
>
>


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8)
  2022-10-31  8:10     ` Zhu, Jiadong
@ 2022-10-31 11:58       ` Michel Dänzer
  0 siblings, 0 replies; 29+ messages in thread
From: Michel Dänzer @ 2022-10-31 11:58 UTC (permalink / raw)
  To: Zhu, Jiadong, Koenig, Christian, amd-gfx
  Cc: Grodzovsky, Andrey, Tuikov, Luben, Huang, Ray

On 2022-10-31 09:10, Zhu, Jiadong wrote:
> [AMD Official Use Only - General]
> 
> Hi Michel,
> 
> Sorry for the late response. It is more likely the null pointer is raised from function amdgpu_ring_preempt_ib as preempt_ib is not assigned.

That makes sense, since amdgpu_mcbp_trigger_preempt passes mux->real_ring to amdgpu_ring_preempt_ib, and the real ring doesn't have the preempt_ib hook set, does it?


> Btw, Which branch of kmd are you cherry-pick? Maybe my code base is too old.
> I tried using wayland on ubuntu 20.04 and not reproduced the crash.

The Mesa radeonsi driver in Ubuntu 20.04 didn't support the EGL_IMG_context_priority extension yet. Does eglinfo list that extension as supported by the EGL Device platform on your system?


> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Thursday, October 20, 2022 10:59 PM
> To: Michel Dänzer <michel@daenzer.net>; Zhu, Jiadong <Jiadong.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
> Subject: Re: [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8)
> 
> Am 20.10.22 um 16:49 schrieb Michel Dänzer:
>> On 2022-10-18 11:08, jiadong.zhu@amd.com wrote:
>>> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>>>
>>> The software ring is created to support priority context while there
>>> is only one hardware queue for gfx.
>>>
>>> Every software ring has its fence driver and could be used as an
>>> ordinary ring for the GPU scheduler.
>>> Multiple software rings are bound to a real ring with the ring muxer.
>>> The packages committed on the software ring are copied to the real ring.
>>>
>>> v2: Use array to store software ring entry.
>>> v3: Remove unnecessary prints.
>>> v4: Remove amdgpu_ring_sw_init/fini functions, using gtt for sw ring
>>> buffer for later dma copy optimization.
>>> v5: Allocate ring entry dynamically in the muxer.
>>> v6: Update comments for the ring muxer.
>>> v7: Modify for function naming.
>>> v8: Combine software ring functions into amdgpu_ring_mux.c
>> I tested patches 1-4 of this series (since Christian clearly nacked patch 5). I hit the oops below.
> 
> As long as you don't try to reset the GPU you can also test patch 5.
> It's just that we can't upstream the stuff like this or that would break immediately.
> 
> Regards,
> Christian.
> 
>>
>> amdgpu_sw_ring_ib_begin+0x70/0x80 is in amdgpu_mcbp_trigger_preempt according to scripts/faddr2line, specifically line 376:
>>
>>       spin_unlock(&mux->lock);
>>
>> though I'm not sure why that would crash.
>>
>>
>> Are you not able to reproduce this with a GNOME Wayland session?
>>
>>
>> BUG: kernel NULL pointer dereference, address: 0000000000000000
>> #PF: supervisor instruction fetch in kernel mode
>> #PF: error_code(0x0010) - not-present page PGD 0 P4D 0
>> Oops: 0010 [#1] PREEMPT SMP NOPTI
>> CPU: 7 PID: 281 Comm: gfx_high Tainted: G            E      6.0.2+ #1
>> Hardware name: LENOVO 20NF0000GE/20NF0000GE, BIOS R11ET36W (1.16 )
>> 03/30/2020
>> RIP: 0010:0x0
>> Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
>> RSP: 0018:ffffbd594073bdc8 EFLAGS: 00010246
>> RAX: 0000000000000000 RBX: ffff993c4a620000 RCX: 0000000000000000
>> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff993c4a62a350
>> RBP: ffff993c4a62d530 R08: 0000000000000000 R09: 0000000000000000
>> R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000114
>> R13: ffff993c4a620000 R14: 0000000000000000 R15: ffff993c4a62d128
>> FS:  0000000000000000(0000) GS:ffff993ef0bc0000(0000)
>> knlGS:0000000000000000
>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> CR2: ffffffffffffffd6 CR3: 00000001959fc000 CR4: 00000000003506e0 Call
>> Trace:
>>   <TASK>
>>   amdgpu_sw_ring_ib_begin+0x70/0x80 [amdgpu]
>>   amdgpu_ib_schedule+0x15f/0x5d0 [amdgpu]
>>   amdgpu_job_run+0x102/0x1c0 [amdgpu]
>>   drm_sched_main+0x19a/0x440 [gpu_sched]
>>   ? dequeue_task_stop+0x70/0x70
>>   ? drm_sched_resubmit_jobs+0x10/0x10 [gpu_sched]
>>   kthread+0xe9/0x110
>>   ? kthread_complete_and_exit+0x20/0x20
>>   ret_from_fork+0x22/0x30
>>   </TASK>
>> [...]
>> note: gfx_high[281] exited with preempt_count 1 [...]
>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring gfx_low timeout,
>> signaled seq=14864, emitted seq=14865 [drm:amdgpu_job_timedout
>> [amdgpu]] *ERROR* Process information: process firefox.dpkg-di pid 3540 thread firefox:cs0 pid 4666 amdgpu 0000:05:00.0: amdgpu: GPU reset begin!
>>
>>
> 

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-10-18  9:08 ` [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8) jiadong.zhu
@ 2022-10-31 12:01   ` Michel Dänzer
  2022-11-01  1:04     ` Zhu, Jiadong
  2022-11-10 19:27   ` Luben Tuikov
  1 sibling, 1 reply; 29+ messages in thread
From: Michel Dänzer @ 2022-10-31 12:01 UTC (permalink / raw)
  To: jiadong.zhu
  Cc: amd-gfx, Andrey Grodzovsky, Huang Rui, Christian Koenig, Luben Tuikov

On 2022-10-18 11:08, jiadong.zhu@amd.com wrote:
> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
> 
> Trigger Mid-Command Buffer Preemption according to the priority of the software
> rings and the hw fence signalling condition.
> 
> The muxer saves the locations of the indirect buffer frames from the software
> ring together with the fence sequence number in its fifo queue, and pops out
> those records when the fences are signalled. The locations are used to resubmit
> packages in preemption scenarios by coping the chunks from the software ring.
> 
> v2: Update comment style.
> v3: Fix conflict caused by previous modifications.
> v4: Remove unnecessary prints.
> v5: Fix corner cases for resubmission cases.
> v6: Refactor functions for resubmission, calling fence_process in irq handler.
> v7: Solve conflict for removing amdgpu_sw_ring.c.
> v8: Add time threshold to judge if preemption request is needed.
> 
> Cc: Christian Koenig <Christian.Koenig@amd.com>
> Cc: Luben Tuikov <Luben.Tuikov@amd.com>
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
> Cc: Michel Dänzer <michel@daenzer.net>
> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
> Acked-by: Luben Tuikov <luben.tuikov@amd.com>
> Acked-by: Huang Rui <ray.huang@amd.com>

[...]

> +/* Trigger Mid-Command Buffer Preemption (MCBP) and find if we need to resubmit. */
> +int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux)
> +{
> +	int r;
> +
> +	spin_lock(&mux->lock);
> +	mux->pending_trailing_fence_signaled = true;
> +	r = amdgpu_ring_preempt_ib(mux->real_ring);
> +	spin_unlock(&mux->lock);
> +	return r;
> +}

AFAICT amdgpu_mcbp_trigger_preempt is used only in this file, so it should be declared static.

(I didn't audit the other new functions added by this series for this, just happened to notice this one)


-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-10-31 12:01   ` Michel Dänzer
@ 2022-11-01  1:04     ` Zhu, Jiadong
  2022-11-01  9:10       ` Michel Dänzer
  0 siblings, 1 reply; 29+ messages in thread
From: Zhu, Jiadong @ 2022-11-01  1:04 UTC (permalink / raw)
  To: Michel Dänzer
  Cc: amd-gfx, Andrey Grodzovsky, Huang, Ray, Koenig,  Christian,
	Tuikov, Luben

[AMD Official Use Only - General]

It is a macro defined in amdgpu_ring.h, calling function pointer preempt_ib directly:
#define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r)

The real ring's hook is assigned  in gfx_v9_0.c:
static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
...
.preempt_ib = gfx_v9_0_ring_preempt_ib, //please have a check this line existed.
...
}

Here is the eglinfo on my board, I have once built mesa component (22.3.0-devel), which may update the libEGL_mesa.so
EGL client extensions string:
    EGL_EXT_device_base EGL_EXT_device_enumeration EGL_EXT_device_query
    EGL_EXT_platform_base EGL_KHR_client_get_all_proc_addresses
    EGL_EXT_client_extensions EGL_KHR_debug EGL_EXT_platform_device
    EGL_EXT_platform_x11 EGL_KHR_platform_x11 EGL_EXT_platform_xcb
    EGL_MESA_platform_gbm EGL_KHR_platform_gbm
    EGL_MESA_platform_surfaceless

GBM platform:
EGL API version: 1.5
EGL vendor string: Mesa Project
EGL version string: 1.5
EGL client APIs: OpenGL OpenGL_ES
EGL extensions string:
    EGL_ANDROID_blob_cache EGL_ANDROID_native_fence_sync
    EGL_EXT_buffer_age EGL_EXT_create_context_robustness
    EGL_EXT_image_dma_buf_import EGL_EXT_image_dma_buf_import_modifiers
    EGL_EXT_protected_surface EGL_IMG_context_priority EGL_KHR_cl_event2
    EGL_KHR_config_attribs EGL_KHR_context_flush_control
    EGL_KHR_create_context EGL_KHR_create_context_no_error
    EGL_KHR_fence_sync EGL_KHR_get_all_proc_addresses
    EGL_KHR_gl_colorspace EGL_KHR_gl_renderbuffer_image
    EGL_KHR_gl_texture_2D_image EGL_KHR_gl_texture_3D_image
    EGL_KHR_gl_texture_cubemap_image EGL_KHR_image EGL_KHR_image_base
    EGL_KHR_image_pixmap EGL_KHR_no_config_context EGL_KHR_reusable_sync
    EGL_KHR_surfaceless_context EGL_EXT_pixel_format_float
    EGL_KHR_wait_sync EGL_MESA_configless_context EGL_MESA_drm_image
    EGL_MESA_image_dma_buf_export EGL_MESA_query_driver

Thanks,
Jiadong

-----Original Message-----
From: Michel Dänzer <michel@daenzer.net>
Sent: Monday, October 31, 2022 8:02 PM
To: Zhu, Jiadong <Jiadong.Zhu@amd.com>
Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)

On 2022-10-18 11:08, jiadong.zhu@amd.com wrote:
> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>
> Trigger Mid-Command Buffer Preemption according to the priority of the
> software rings and the hw fence signalling condition.
>
> The muxer saves the locations of the indirect buffer frames from the
> software ring together with the fence sequence number in its fifo
> queue, and pops out those records when the fences are signalled. The
> locations are used to resubmit packages in preemption scenarios by coping the chunks from the software ring.
>
> v2: Update comment style.
> v3: Fix conflict caused by previous modifications.
> v4: Remove unnecessary prints.
> v5: Fix corner cases for resubmission cases.
> v6: Refactor functions for resubmission, calling fence_process in irq handler.
> v7: Solve conflict for removing amdgpu_sw_ring.c.
> v8: Add time threshold to judge if preemption request is needed.
>
> Cc: Christian Koenig <Christian.Koenig@amd.com>
> Cc: Luben Tuikov <Luben.Tuikov@amd.com>
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
> Cc: Michel Dänzer <michel@daenzer.net>
> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
> Acked-by: Luben Tuikov <luben.tuikov@amd.com>
> Acked-by: Huang Rui <ray.huang@amd.com>

[...]

> +/* Trigger Mid-Command Buffer Preemption (MCBP) and find if we need
> +to resubmit. */ int amdgpu_mcbp_trigger_preempt(struct
> +amdgpu_ring_mux *mux) {
> +     int r;
> +
> +     spin_lock(&mux->lock);
> +     mux->pending_trailing_fence_signaled = true;
> +     r = amdgpu_ring_preempt_ib(mux->real_ring);
> +     spin_unlock(&mux->lock);
> +     return r;
> +}

AFAICT amdgpu_mcbp_trigger_preempt is used only in this file, so it should be declared static.

(I didn't audit the other new functions added by this series for this, just happened to notice this one)


--
Earthling Michel Dänzer            |                  https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fredhat.com%2F&amp;data=05%7C01%7Cjiadong.zhu%40amd.com%7Cfaca0a3e35964bcbb24708dabb37b284%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638028145150743814%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=BcnmJQ5jUdI%2BGnKvUpn3agyxTD4iTSweGxEQKUlQpxI%3D&amp;reserved=0
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-11-01  1:04     ` Zhu, Jiadong
@ 2022-11-01  9:10       ` Michel Dänzer
  2022-11-01  9:58         ` Zhu, Jiadong
  0 siblings, 1 reply; 29+ messages in thread
From: Michel Dänzer @ 2022-11-01  9:10 UTC (permalink / raw)
  To: Zhu, Jiadong
  Cc: Tuikov, Luben, Andrey Grodzovsky, Huang, Ray, Koenig, Christian, amd-gfx


[ Please don't top-post ]


On 2022-11-01 02:04, Zhu, Jiadong wrote:
> 
> It is a macro defined in amdgpu_ring.h, calling function pointer preempt_ib directly:
> #define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r)
> 
> The real ring's hook is assigned  in gfx_v9_0.c:
> static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> ...
> .preempt_ib = gfx_v9_0_ring_preempt_ib, //please have a check this line existed.
> ...
> }

Patch 3 assigns preempt_ib in gfx_v9_0_sw_ring_funcs_gfx, but not in gfx_v9_0_ring_funcs_gfx. mux->real_ring in amdgpu_mcbp_trigger_preempt presumably uses the latter, which would explain why amdgpu_ring_preempt_ib ends up dereferencing a NULL pointer.


> Here is the eglinfo on my board, I have once built mesa component (22.3.0-devel), which may update the libEGL_mesa.so

Looks like EGL_IMG_context_priority is supported (it would be interesting to see if it's listed for the Device Platform as well, but there's probably no reason to assume otherwise). Strange that you can't reproduce then.


-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-11-01  9:10       ` Michel Dänzer
@ 2022-11-01  9:58         ` Zhu, Jiadong
  2022-11-01 10:09           ` Michel Dänzer
  0 siblings, 1 reply; 29+ messages in thread
From: Zhu, Jiadong @ 2022-11-01  9:58 UTC (permalink / raw)
  To: Michel Dänzer
  Cc: Tuikov, Luben, Andrey Grodzovsky, Huang, Ray, Koenig, Christian, amd-gfx

[AMD Official Use Only - General]

>Patch 3 assigns preempt_ib in gfx_v9_0_sw_ring_funcs_gfx, but not in gfx_v9_0_ring_funcs_gfx. mux->real_ring in amdgpu_mcbp_trigger_preempt presumably uses the latter, which would explain why amdgpu_ring_preempt_ib ends up dereferencing a NULL pointer.

It's weird the assignment should be in gfx_v9_0_ring_funcs_gfx instead of gfx_v9_0_sw_ring_funcs_gfx.

[PATCH 3/5] drm/amdgpu: Modify unmap_queue format for gfx9 (v4):
@@ -6925,6 +7047,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
        .emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
        .init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
        .patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
+       .preempt_ib = gfx_v9_0_ring_preempt_ib,
        .emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
        .emit_wreg = gfx_v9_0_ring_emit_wreg,
        .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h

Thanks,
Jiadong

-----Original Message-----
From: Michel Dänzer <michel@daenzer.net>
Sent: Tuesday, November 1, 2022 5:11 PM
To: Zhu, Jiadong <Jiadong.Zhu@amd.com>
Cc: amd-gfx@lists.freedesktop.org; Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
Subject: Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)


[ Please don't top-post ]


On 2022-11-01 02:04, Zhu, Jiadong wrote:
>
> It is a macro defined in amdgpu_ring.h, calling function pointer preempt_ib directly:
> #define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r)
>
> The real ring's hook is assigned  in gfx_v9_0.c:
> static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = { ...
> .preempt_ib = gfx_v9_0_ring_preempt_ib, //please have a check this line existed.
> ...
> }

Patch 3 assigns preempt_ib in gfx_v9_0_sw_ring_funcs_gfx, but not in gfx_v9_0_ring_funcs_gfx. mux->real_ring in amdgpu_mcbp_trigger_preempt presumably uses the latter, which would explain why amdgpu_ring_preempt_ib ends up dereferencing a NULL pointer.


> Here is the eglinfo on my board, I have once built mesa component
> (22.3.0-devel), which may update the libEGL_mesa.so

Looks like EGL_IMG_context_priority is supported (it would be interesting to see if it's listed for the Device Platform as well, but there's probably no reason to assume otherwise). Strange that you can't reproduce then.


--
Earthling Michel Dänzer            |                  https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fredhat.com%2F&amp;data=05%7C01%7CJiadong.Zhu%40amd.com%7C89dc4843c4674e7c706a08dabbe91326%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638028906976834499%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=8Ow2Lx890iYmhsih0ZFqgOYg9ciGz%2FjJLF3p7hhAtc4%3D&amp;reserved=0
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-11-01  9:58         ` Zhu, Jiadong
@ 2022-11-01 10:09           ` Michel Dänzer
  2022-11-02 11:26             ` Michel Dänzer
  0 siblings, 1 reply; 29+ messages in thread
From: Michel Dänzer @ 2022-11-01 10:09 UTC (permalink / raw)
  To: Zhu, Jiadong
  Cc: Andrey Grodzovsky, Tuikov, Luben, Huang, Ray, Koenig, Christian, amd-gfx

On 2022-11-01 10:58, Zhu, Jiadong wrote:
> 
>> Patch 3 assigns preempt_ib in gfx_v9_0_sw_ring_funcs_gfx, but not in gfx_v9_0_ring_funcs_gfx. mux->real_ring in amdgpu_mcbp_trigger_preempt presumably uses the latter, which would explain why amdgpu_ring_preempt_ib ends up dereferencing a NULL pointer.
> 
> It's weird the assignment should be in gfx_v9_0_ring_funcs_gfx instead of gfx_v9_0_sw_ring_funcs_gfx.
> 
> [PATCH 3/5] drm/amdgpu: Modify unmap_queue format for gfx9 (v4):
> @@ -6925,6 +7047,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>         .emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
>         .init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>         .patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
> +       .preempt_ib = gfx_v9_0_ring_preempt_ib,
>         .emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h

Ah! Looks like stg applied patch 3 incorrectly for me. :(

I'll try and test with this fixed this week, and report back.


-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-11-01 10:09           ` Michel Dänzer
@ 2022-11-02 11:26             ` Michel Dänzer
  2022-11-03  2:58               ` Zhu, Jiadong
  0 siblings, 1 reply; 29+ messages in thread
From: Michel Dänzer @ 2022-11-02 11:26 UTC (permalink / raw)
  To: Zhu, Jiadong; +Cc: Tuikov, Luben, Huang, Ray, Koenig, Christian, amd-gfx


[ Dropping Andrey's no longer working address from Cc ]

On 2022-11-01 11:09, Michel Dänzer wrote:
> On 2022-11-01 10:58, Zhu, Jiadong wrote:
>>
>>> Patch 3 assigns preempt_ib in gfx_v9_0_sw_ring_funcs_gfx, but not in gfx_v9_0_ring_funcs_gfx. mux->real_ring in amdgpu_mcbp_trigger_preempt presumably uses the latter, which would explain why amdgpu_ring_preempt_ib ends up dereferencing a NULL pointer.
>>
>> It's weird the assignment should be in gfx_v9_0_ring_funcs_gfx instead of gfx_v9_0_sw_ring_funcs_gfx.
>>
>> [PATCH 3/5] drm/amdgpu: Modify unmap_queue format for gfx9 (v4):
>> @@ -6925,6 +7047,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>         .emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
>>         .init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>>         .patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>> +       .preempt_ib = gfx_v9_0_ring_preempt_ib,
>>         .emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
>>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> 
> Ah! Looks like stg applied patch 3 incorrectly for me. :(
> 
> I'll try and test with this fixed this week, and report back.

I'm now running with patch 3 applied correctly, and with patch 5 as well.


The good news is that I'm now seeing a positive effect with GpuTest benchmarks which are GPU-limited at low frame rates. In particular, with the pixmark piano benchmark, the GNOME Wayland session now actually stays more responsive on this machine than it does on my work laptop with an Intel iGPU. However, with the plot3d benchmark (with /plot3d_vertex_density=1750 on the command line to increase GPU load), it still doesn't quite manage to keep the desktop running at full frame rate, in contrast to the Intel iGPU.

The bad news is that this series still makes some things very slow. The most extreme examples so far are glxgears (runs at ~400 fps now, ~7000 fps before, i.e. almost 20x slowdown) and hexchat (scrolling one page now takes ~1 second, I can see it drawing line by line; before it was almost instantaneous). I suspect this series makes the overhead of running a single GPU job much bigger. On the bright side, I'm not noticing any significant intermittent freezes anymore.


In summary, while the benefits are promising, the downsides are unacceptable for enabling this by default.


-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-11-02 11:26             ` Michel Dänzer
@ 2022-11-03  2:58               ` Zhu, Jiadong
  2022-11-03  9:04                 ` Michel Dänzer
  0 siblings, 1 reply; 29+ messages in thread
From: Zhu, Jiadong @ 2022-11-03  2:58 UTC (permalink / raw)
  To: Michel Dänzer; +Cc: Tuikov, Luben, Huang, Ray, Koenig, Christian, amd-gfx

[AMD Official Use Only - General]

>The bad news is that this series still makes some things very slow. The most extreme examples so far are glxgears (runs at ~400 fps now, ~7000 fps before, i.e. almost 20x slowdown) and hexchat (scrolling one page now takes ~1 second, I can see it drawing line by line; before it was almost instantaneous). I suspect this series makes the overhead of running a single GPU job much bigger. On the bright side, I'm not noticing any significant intermittent freezes anymore.

Hi Michel,

Thanks for the trying.
Is there high priority jobs running while executing glxgears? I am running glxgears while submitting high priority ibs using amdgpu_test, the fps ranges from 6000~8000.

Continuous preemption and resubmission may cause the slow fps. Could you have a check about how fast the trailing fence seqNo expands. On my side, the increment of Last signaled trailing fence is < 10 in a second.


cat /sys/kernel/debug/dri/0/amdgpu_fence_info
--- ring 0 (gfx) ---
Last signaled fence          0x00000001
Last emitted                 0x00000001
Last signaled trailing fence 0x0000013c
Last emitted                 0x0000013c
Last preempted               0x00000000

Thanks,
Jiadong

-----Original Message-----
From: Michel Dänzer <michel@daenzer.net>
Sent: Wednesday, November 2, 2022 7:26 PM
To: Zhu, Jiadong <Jiadong.Zhu@amd.com>
Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)


[ Dropping Andrey's no longer working address from Cc ]

On 2022-11-01 11:09, Michel Dänzer wrote:
> On 2022-11-01 10:58, Zhu, Jiadong wrote:
>>
>>> Patch 3 assigns preempt_ib in gfx_v9_0_sw_ring_funcs_gfx, but not in gfx_v9_0_ring_funcs_gfx. mux->real_ring in amdgpu_mcbp_trigger_preempt presumably uses the latter, which would explain why amdgpu_ring_preempt_ib ends up dereferencing a NULL pointer.
>>
>> It's weird the assignment should be in gfx_v9_0_ring_funcs_gfx instead of gfx_v9_0_sw_ring_funcs_gfx.
>>
>> [PATCH 3/5] drm/amdgpu: Modify unmap_queue format for gfx9 (v4):
>> @@ -6925,6 +7047,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>         .emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
>>         .init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>>         .patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>> +       .preempt_ib = gfx_v9_0_ring_preempt_ib,
>>         .emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
>>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait, diff --git
>> a/drivers/gpu/drm/amd/amdgpu/soc15d.h
>> b/drivers/gpu/drm/amd/amdgpu/soc15d.h
>
> Ah! Looks like stg applied patch 3 incorrectly for me. :(
>
> I'll try and test with this fixed this week, and report back.

I'm now running with patch 3 applied correctly, and with patch 5 as well.


The good news is that I'm now seeing a positive effect with GpuTest benchmarks which are GPU-limited at low frame rates. In particular, with the pixmark piano benchmark, the GNOME Wayland session now actually stays more responsive on this machine than it does on my work laptop with an Intel iGPU. However, with the plot3d benchmark (with /plot3d_vertex_density=1750 on the command line to increase GPU load), it still doesn't quite manage to keep the desktop running at full frame rate, in contrast to the Intel iGPU.

The bad news is that this series still makes some things very slow. The most extreme examples so far are glxgears (runs at ~400 fps now, ~7000 fps before, i.e. almost 20x slowdown) and hexchat (scrolling one page now takes ~1 second, I can see it drawing line by line; before it was almost instantaneous). I suspect this series makes the overhead of running a single GPU job much bigger. On the bright side, I'm not noticing any significant intermittent freezes anymore.


In summary, while the benefits are promising, the downsides are unacceptable for enabling this by default.


--
Earthling Michel Dänzer            |                  https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fredhat.com%2F&amp;data=05%7C01%7CJiadong.Zhu%40amd.com%7Cb15fb94893a247d734ff08dabcc5265c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638029852189066953%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=awC3VH4zMdZGK9ayi8V3goI%2B%2FEkj0%2B2LL2VokYlLXSk%3D&amp;reserved=0
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-11-03  2:58               ` Zhu, Jiadong
@ 2022-11-03  9:04                 ` Michel Dänzer
  2022-11-08  8:01                   ` Zhu, Jiadong
  0 siblings, 1 reply; 29+ messages in thread
From: Michel Dänzer @ 2022-11-03  9:04 UTC (permalink / raw)
  To: Zhu, Jiadong; +Cc: Tuikov, Luben, Huang, Ray, Koenig, Christian, amd-gfx

On 2022-11-03 03:58, Zhu, Jiadong wrote:
> [AMD Official Use Only - General]
> 
>> The bad news is that this series still makes some things very slow. The most extreme examples so far are glxgears (runs at ~400 fps now, ~7000 fps before, i.e. almost 20x slowdown) and hexchat (scrolling one page now takes ~1 second, I can see it drawing line by line; before it was almost instantaneous). I suspect this series makes the overhead of running a single GPU job much bigger. On the bright side, I'm not noticing any significant intermittent freezes anymore.
> 
> Hi Michel,
> 
> Thanks for the trying.
> Is there high priority jobs running while executing glxgears?

Yes, mutter is submitting high priority jobs. However, I don't think that can explain the problem by itself:

mutter only draws once per display refresh cycle. Let's assume mutter's GPU work takes ~6-7ms (conservative example, should be less than that usually). That leaves ~10ms per display refresh cycle (at 60 Hz refresh rate) where GPU work from glxgears & Xwayland can run without getting preempted. Since glxgears runs at ~7000 fps without this series, it should be able to draw at least ~70 frames in 10ms[0], which corresponds to over 4000 fps. Yet it manages only 1/10 of that.

[0] Worst case consideration, ignoring the fact that without this series, glxgears runs at ~7000 fps while mutter sustains 60 fps.


> I am running glxgears while submitting high priority ibs using amdgpu_test, the fps ranges from 6000~8000.

It's getting clear that artificial tests such as amdgpu_test don't suffice for evaluating the real-world impact of this kind of change.


> Continuous preemption and resubmission may cause the slow fps. Could you have a check about how fast the trailing fence seqNo expands. On my side, the increment of Last signaled trailing fence is < 10 in a second.

I had to go back to a kernel without this series, as it was just unusable. As this is my main machine, I don't know when I'll get a chance to check this.


-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-11-03  9:04                 ` Michel Dänzer
@ 2022-11-08  8:01                   ` Zhu, Jiadong
  2022-11-10 17:00                     ` Michel Dänzer
  0 siblings, 1 reply; 29+ messages in thread
From: Zhu, Jiadong @ 2022-11-08  8:01 UTC (permalink / raw)
  To: Michel Dänzer; +Cc: Tuikov, Luben, Huang, Ray, Koenig, Christian, amd-gfx

[AMD Official Use Only - General]

Hi Michel,

I reproduced the glxgears 400fps scenario locally. The issue is caused by the patch5 "drm/amdgpu: Improve the software rings priority scheduler" which slows down the low priority scheduler thread if high priority ib is under executing. I'll drop this patch as we cannot identify gpu bound according to the unsignaled fence, etc.

Thanks,
Jiadong

-----Original Message-----
From: Michel Dänzer <michel@daenzer.net>
Sent: Thursday, November 3, 2022 5:05 PM
To: Zhu, Jiadong <Jiadong.Zhu@amd.com>
Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)

On 2022-11-03 03:58, Zhu, Jiadong wrote:
> [AMD Official Use Only - General]
>
>> The bad news is that this series still makes some things very slow. The most extreme examples so far are glxgears (runs at ~400 fps now, ~7000 fps before, i.e. almost 20x slowdown) and hexchat (scrolling one page now takes ~1 second, I can see it drawing line by line; before it was almost instantaneous). I suspect this series makes the overhead of running a single GPU job much bigger. On the bright side, I'm not noticing any significant intermittent freezes anymore.
>
> Hi Michel,
>
> Thanks for the trying.
> Is there high priority jobs running while executing glxgears?

Yes, mutter is submitting high priority jobs. However, I don't think that can explain the problem by itself:

mutter only draws once per display refresh cycle. Let's assume mutter's GPU work takes ~6-7ms (conservative example, should be less than that usually). That leaves ~10ms per display refresh cycle (at 60 Hz refresh rate) where GPU work from glxgears & Xwayland can run without getting preempted. Since glxgears runs at ~7000 fps without this series, it should be able to draw at least ~70 frames in 10ms[0], which corresponds to over 4000 fps. Yet it manages only 1/10 of that.

[0] Worst case consideration, ignoring the fact that without this series, glxgears runs at ~7000 fps while mutter sustains 60 fps.


> I am running glxgears while submitting high priority ibs using amdgpu_test, the fps ranges from 6000~8000.

It's getting clear that artificial tests such as amdgpu_test don't suffice for evaluating the real-world impact of this kind of change.


> Continuous preemption and resubmission may cause the slow fps. Could you have a check about how fast the trailing fence seqNo expands. On my side, the increment of Last signaled trailing fence is < 10 in a second.

I had to go back to a kernel without this series, as it was just unusable. As this is my main machine, I don't know when I'll get a chance to check this.


--
Earthling Michel Dänzer            |                  https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fredhat.com%2F&amp;data=05%7C01%7CJiadong.Zhu%40amd.com%7C5cb642e1abf34ab7377308dabd7adfb7%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638030632689527329%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=jJuSMxqY4nMltWdrSOe4iJF5kmwPG2gBFXudDmheNBc%3D&amp;reserved=0
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-11-08  8:01                   ` Zhu, Jiadong
@ 2022-11-10 17:00                     ` Michel Dänzer
  2022-11-10 17:54                       ` Alex Deucher
  2022-11-14 17:15                       ` Michel Dänzer
  0 siblings, 2 replies; 29+ messages in thread
From: Michel Dänzer @ 2022-11-10 17:00 UTC (permalink / raw)
  To: Zhu, Jiadong; +Cc: Tuikov, Luben, Huang, Ray, Koenig, Christian, amd-gfx

On 2022-11-08 09:01, Zhu, Jiadong wrote:> From: Michel Dänzer <michel@daenzer.net>
>
>>>> The bad news is that this series still makes some things very slow. The most extreme examples so far are glxgears (runs at ~400 fps now, ~7000 fps before, i.e. almost 20x slowdown) and hexchat (scrolling one page now takes ~1 second, I can see it drawing line by line; before it was almost instantaneous). I suspect this series makes the overhead of running a single GPU job much bigger. On the bright side, I'm not noticing any significant intermittent freezes anymore.
>>>
>>> Hi Michel,
>>>
>>> Thanks for the trying.
>>> Is there high priority jobs running while executing glxgears?
>> 
>> Yes, mutter is submitting high priority jobs. However, I don't think that can explain the problem by itself:
>> 
>> mutter only draws once per display refresh cycle. Let's assume mutter's GPU work takes ~6-7ms (conservative example, should be less than that usually). That leaves ~10ms per display refresh cycle (at 60 Hz refresh rate) where GPU work from glxgears & Xwayland can run without getting preempted. Since glxgears runs at ~7000 fps without this series, it should be able to draw at least ~70 frames in 10ms[0], which corresponds to over 4000 fps. Yet it manages only 1/10 of that.
>> 
>> [0] Worst case consideration, ignoring the fact that without this series, glxgears runs at ~7000 fps while mutter sustains 60 fps.
>
> I reproduced the glxgears 400fps scenario locally. The issue is caused by the patch5 "drm/amdgpu: Improve the software rings priority scheduler" which slows down the low priority scheduler thread if high priority ib is under executing. I'll drop this patch as we cannot identify gpu bound according to the unsignaled fence, etc.

Okay, I'm testing with patches 1-4 only now.

So far I haven't noticed any negative effects, no slowdowns or intermittent freezes.

The only issue is that there's hardly any positive effect either. While constantly moving the window of a GPU-limited GpuTest benchmark in circles, most of the time it looks exactly the same as without these patches. Only occasionally, at most every few seconds, I notice that the window movement becomes smoother for an instant.


-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and Xwayland developer



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-11-10 17:00                     ` Michel Dänzer
@ 2022-11-10 17:54                       ` Alex Deucher
  2022-11-11  6:15                         ` Zhu, Jiadong
  2022-11-14 17:15                       ` Michel Dänzer
  1 sibling, 1 reply; 29+ messages in thread
From: Alex Deucher @ 2022-11-10 17:54 UTC (permalink / raw)
  To: Michel Dänzer
  Cc: Tuikov, Luben, Zhu, Jiadong, Huang, Ray, Koenig, Christian, amd-gfx

On Thu, Nov 10, 2022 at 12:00 PM Michel Dänzer <michel@daenzer.net> wrote:
>
> On 2022-11-08 09:01, Zhu, Jiadong wrote:> From: Michel Dänzer <michel@daenzer.net>
> >
> >>>> The bad news is that this series still makes some things very slow. The most extreme examples so far are glxgears (runs at ~400 fps now, ~7000 fps before, i.e. almost 20x slowdown) and hexchat (scrolling one page now takes ~1 second, I can see it drawing line by line; before it was almost instantaneous). I suspect this series makes the overhead of running a single GPU job much bigger. On the bright side, I'm not noticing any significant intermittent freezes anymore.
> >>>
> >>> Hi Michel,
> >>>
> >>> Thanks for the trying.
> >>> Is there high priority jobs running while executing glxgears?
> >>
> >> Yes, mutter is submitting high priority jobs. However, I don't think that can explain the problem by itself:
> >>
> >> mutter only draws once per display refresh cycle. Let's assume mutter's GPU work takes ~6-7ms (conservative example, should be less than that usually). That leaves ~10ms per display refresh cycle (at 60 Hz refresh rate) where GPU work from glxgears & Xwayland can run without getting preempted. Since glxgears runs at ~7000 fps without this series, it should be able to draw at least ~70 frames in 10ms[0], which corresponds to over 4000 fps. Yet it manages only 1/10 of that.
> >>
> >> [0] Worst case consideration, ignoring the fact that without this series, glxgears runs at ~7000 fps while mutter sustains 60 fps.
> >
> > I reproduced the glxgears 400fps scenario locally. The issue is caused by the patch5 "drm/amdgpu: Improve the software rings priority scheduler" which slows down the low priority scheduler thread if high priority ib is under executing. I'll drop this patch as we cannot identify gpu bound according to the unsignaled fence, etc.
>
> Okay, I'm testing with patches 1-4 only now.
>
> So far I haven't noticed any negative effects, no slowdowns or intermittent freezes.
>
> The only issue is that there's hardly any positive effect either. While constantly moving the window of a GPU-limited GpuTest benchmark in circles, most of the time it looks exactly the same as without these patches. Only occasionally, at most every few seconds, I notice that the window movement becomes smoother for an instant.
>

I think it will largely depend on the workload.  The gfx pipe can only
be preempted on draw boundaries so if most operations are a single
draw, you probably won't see much difference.

Alex

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-10-18  9:08 ` [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8) jiadong.zhu
  2022-10-31 12:01   ` Michel Dänzer
@ 2022-11-10 19:27   ` Luben Tuikov
  1 sibling, 0 replies; 29+ messages in thread
From: Luben Tuikov @ 2022-11-10 19:27 UTC (permalink / raw)
  To: jiadong.zhu, amd-gfx; +Cc: Michel Dänzer, Huang Rui, Christian Koenig

On 2022-10-18 05:08, jiadong.zhu@amd.com wrote:
> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
> 
> Trigger Mid-Command Buffer Preemption according to the priority of the software
> rings and the hw fence signalling condition.
> 
> The muxer saves the locations of the indirect buffer frames from the software
> ring together with the fence sequence number in its fifo queue, and pops out
> those records when the fences are signalled. The locations are used to resubmit
> packages in preemption scenarios by coping the chunks from the software ring.
> 
> v2: Update comment style.
> v3: Fix conflict caused by previous modifications.
> v4: Remove unnecessary prints.
> v5: Fix corner cases for resubmission cases.
> v6: Refactor functions for resubmission, calling fence_process in irq handler.
> v7: Solve conflict for removing amdgpu_sw_ring.c.
> v8: Add time threshold to judge if preemption request is needed.
> 
> Cc: Christian Koenig <Christian.Koenig@amd.com>
> Cc: Luben Tuikov <Luben.Tuikov@amd.com>
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
> Cc: Michel Dänzer <michel@daenzer.net>
> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
> Acked-by: Luben Tuikov <luben.tuikov@amd.com>
> Acked-by: Huang Rui <ray.huang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c    |  53 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c       |   2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c     |  12 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   6 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 353 +++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  29 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c       |   2 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c        |   7 +-
>  8 files changed, 420 insertions(+), 44 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 790f7bfdc654..470448bc1ebb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -54,6 +54,7 @@ struct amdgpu_fence {
>  
>  	/* RB, DMA, etc. */
>  	struct amdgpu_ring		*ring;
> +	ktime_t				start_timestamp;
>  };
>  
>  static struct kmem_cache *amdgpu_fence_slab;
> @@ -199,6 +200,7 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
>  	rcu_assign_pointer(*ptr, dma_fence_get(fence));
>  
>  	*f = fence;
> +	to_amdgpu_fence(fence)->start_timestamp = ktime_get();
>  
>  	return 0;
>  }
> @@ -400,6 +402,57 @@ unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring)
>  	return lower_32_bits(emitted);
>  }
>  
> +/**
> + * amdgpu_fence_last_unsignaled_time_us - the time fence emited till now

Spelling: emitted until

> + * @ring: ring the fence is associated with
> + *
> + * Find the earlist fence unsignaled till now, calculate the time delta

Spelling: earliest fence unsignalled until

> + * between the time fence emitted and now.
> + */
> +u64 amdgpu_fence_last_unsignaled_time_us(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_fence_driver *drv = &ring->fence_drv;
> +	struct dma_fence *fence;
> +	uint32_t last_seq, sync_seq;
> +
> +	last_seq = atomic_read(&ring->fence_drv.last_seq);
> +	sync_seq = READ_ONCE(ring->fence_drv.sync_seq);
> +	if (last_seq == sync_seq)
> +		return 0;
> +
> +	++last_seq;
> +	last_seq &= drv->num_fences_mask;
> +	fence = drv->fences[last_seq];
> +	if (!fence)
> +		return 0;
> +
> +	return ktime_us_delta(ktime_get(),
> +		to_amdgpu_fence(fence)->start_timestamp);
> +}
> +
> +/**
> + * amdgpu_fence_update_start_timestamp - update the timestamp of the fence
> + * @ring: ring the fence is associated with
> + * @seq: the fence seq number to update.
> + * @timestamp: the start timestamp to update.
> + *
> + * The function called at the time the fence and related ib is about to
> + * resubmit to gpu in MCBP scenario. Thus we do not consider race condition
> + * with amdgpu_fence_process to modify the same fence.
> + */
> +void amdgpu_fence_update_start_timestamp(struct amdgpu_ring *ring, uint32_t seq, ktime_t timestamp)
> +{
> +	struct amdgpu_fence_driver *drv = &ring->fence_drv;
> +	struct dma_fence *fence;
> +
> +	seq &= drv->num_fences_mask;
> +	fence = drv->fences[seq];
> +	if (!fence)
> +		return;
> +
> +	to_amdgpu_fence(fence)->start_timestamp = timestamp;
> +}
> +
>  /**
>   * amdgpu_fence_driver_start_ring - make the fence driver
>   * ready for use on the requested ring.
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 258cffe3c06a..af86d87e2f3b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -211,6 +211,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>  		}
>  	}
>  
> +	amdgpu_ring_ib_begin(ring);
>  	if (job && ring->funcs->init_cond_exec)
>  		patch_offset = amdgpu_ring_init_cond_exec(ring);
>  
> @@ -285,6 +286,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>  	    ring->hw_prio == AMDGPU_GFX_PIPE_PRIO_HIGH)
>  		ring->funcs->emit_wave_limit(ring, false);
>  
> +	amdgpu_ring_ib_end(ring);
>  	amdgpu_ring_commit(ring);
>  	return 0;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 13db99d653bd..1f15f9242e99 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -569,3 +569,15 @@ int amdgpu_ring_init_mqd(struct amdgpu_ring *ring)
>  
>  	return mqd_mgr->init_mqd(adev, ring->mqd_ptr, &prop);
>  }
> +
> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring)
> +{
> +	if (ring->is_sw_ring)
> +		amdgpu_sw_ring_ib_begin(ring);
> +}
> +
> +void amdgpu_ring_ib_end(struct amdgpu_ring *ring)
> +{
> +	if (ring->is_sw_ring)
> +		amdgpu_sw_ring_ib_end(ring);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index e90d327a589e..8f82fd0d47c6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -145,6 +145,9 @@ signed long amdgpu_fence_wait_polling(struct amdgpu_ring *ring,
>  				      uint32_t wait_seq,
>  				      signed long timeout);
>  unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring);
> +u64 amdgpu_fence_last_unsignaled_time_us(struct amdgpu_ring *ring);
> +void amdgpu_fence_update_start_timestamp(struct amdgpu_ring *ring, uint32_t seq,
> +					 ktime_t timestamp);

FYI: A conflict exists here with latest amd-staging-drm-next. Easy to fix.

Regards,
Luben

>  
>  /*
>   * Rings.
> @@ -312,6 +315,9 @@ struct amdgpu_ring {
>  #define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r)
>  
>  int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring);
> +void amdgpu_ring_ib_end(struct amdgpu_ring *ring);
> +
>  void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
>  void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
>  void amdgpu_ring_commit(struct amdgpu_ring *ring);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> index 2e64ffccc030..41b057b9358e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> @@ -28,6 +28,7 @@
>  #include "amdgpu.h"
>  
>  #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
> +#define AMDGPU_MAX_LAST_UNSIGNALED_THRESHOLD_US 10000
>  
>  static const struct ring_info {
>  	unsigned int hw_pio;
> @@ -37,23 +38,145 @@ static const struct ring_info {
>  	{ AMDGPU_RING_PRIO_2, "gfx_high"},
>  };
>  
> +static struct kmem_cache *amdgpu_mux_chunk_slab;
> +
> +static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
> +								struct amdgpu_ring *ring)
> +{
> +	return ring->entry_index < mux->ring_entry_size ?
> +			&mux->ring_entry[ring->entry_index] : NULL;
> +}
> +
> +/* copy packages on sw ring range[begin, end) */
> +static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
> +						  struct amdgpu_ring *ring,
> +						  u64 s_start, u64 s_end)
> +{
> +	u64 start, end;
> +	struct amdgpu_ring *real_ring = mux->real_ring;
> +
> +	start = s_start & ring->buf_mask;
> +	end = s_end & ring->buf_mask;
> +
> +	if (start == end) {
> +		DRM_ERROR("no more data copied from sw ring\n");
> +		return;
> +	}
> +	if (start > end) {
> +		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
> +					   (ring->ring_size >> 2) - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
> +	} else {
> +		amdgpu_ring_alloc(real_ring, end - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
> +	}
> +}
> +
> +static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux)
> +{
> +	struct amdgpu_mux_entry *e = NULL;
> +	struct amdgpu_mux_chunk *chunk;
> +	uint32_t seq, last_seq;
> +	int i;
> +
> +	/*find low priority entries:*/
> +	if (!mux->s_resubmit)
> +		return;
> +
> +	for (i = 0; i < mux->num_ring_entries; i++) {
> +		if (mux->ring_entry[i].ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
> +			e = &mux->ring_entry[i];
> +			break;
> +		}
> +	}
> +
> +	if (!e) {
> +		DRM_ERROR("%s no low priority ring found\n", __func__);
> +		return;
> +	}
> +
> +	last_seq = atomic_read(&e->ring->fence_drv.last_seq);
> +	seq = mux->seqno_to_resubmit;
> +	if (last_seq < seq) {
> +		/*resubmit all the fences between (last_seq, seq]*/
> +		list_for_each_entry(chunk, &e->list, entry) {
> +			if (chunk->sync_seq > last_seq && chunk->sync_seq <= seq) {
> +				amdgpu_fence_update_start_timestamp(e->ring,
> +								    chunk->sync_seq,
> +								    ktime_get());
> +				amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, e->ring,
> +								      chunk->start,
> +								      chunk->end);
> +				mux->wptr_resubmit = chunk->end;
> +				amdgpu_ring_commit(mux->real_ring);
> +			}
> +		}
> +	}
> +
> +	del_timer(&mux->resubmit_timer);
> +	mux->s_resubmit = false;
> +}
> +
> +static void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux)
> +{
> +	mod_timer(&mux->resubmit_timer, jiffies + AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT);
> +}
> +
> +static void amdgpu_mux_resubmit_fallback(struct timer_list *t)
> +{
> +	struct amdgpu_ring_mux *mux = from_timer(mux, t, resubmit_timer);
> +
> +	if (!spin_trylock(&mux->lock)) {
> +		amdgpu_ring_mux_schedule_resubmit(mux);
> +		DRM_ERROR("reschedule resubmit\n");
> +		return;
> +	}
> +	amdgpu_mux_resubmit_chunks(mux);
> +	spin_unlock(&mux->lock);
> +}
> +
>  int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
>  			 unsigned int entry_size)
>  {
>  	mux->real_ring = ring;
>  	mux->num_ring_entries = 0;
> +
>  	mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
>  	if (!mux->ring_entry)
>  		return -ENOMEM;
>  
>  	mux->ring_entry_size = entry_size;
> +	mux->s_resubmit = false;
> +
> +	amdgpu_mux_chunk_slab = kmem_cache_create("amdgpu_mux_chunk",
> +						  sizeof(struct amdgpu_mux_chunk), 0,
> +						  SLAB_HWCACHE_ALIGN, NULL);
> +	if (!amdgpu_mux_chunk_slab) {
> +		DRM_ERROR("create amdgpu_mux_chunk cache failed\n");
> +		return -ENOMEM;
> +	}
> +
>  	spin_lock_init(&mux->lock);
> +	timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback, 0);
>  
>  	return 0;
>  }
>  
>  void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
>  {
> +	struct amdgpu_mux_entry *e;
> +	struct amdgpu_mux_chunk *chunk, *chunk2;
> +	int i;
> +
> +	for (i = 0; i < mux->num_ring_entries; i++) {
> +		e = &mux->ring_entry[i];
> +		list_for_each_entry_safe(chunk, chunk2, &e->list, entry) {
> +			list_del(&chunk->entry);
> +			kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
> +		}
> +	}
> +	kmem_cache_destroy(amdgpu_mux_chunk_slab);
>  	kfree(mux->ring_entry);
>  	mux->ring_entry = NULL;
>  	mux->num_ring_entries = 0;
> @@ -73,62 +196,48 @@ int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring
>  	ring->entry_index = mux->num_ring_entries;
>  	e->ring = ring;
>  
> +	INIT_LIST_HEAD(&e->list);
>  	mux->num_ring_entries += 1;
>  	return 0;
>  }
>  
> -static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
> -								struct amdgpu_ring *ring)
> -{
> -	return ring->entry_index < mux->ring_entry_size ?
> -			&mux->ring_entry[ring->entry_index] : NULL;
> -}
> -
> -/* copy packages on sw ring range[begin, end) */
> -static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
> -						  struct amdgpu_ring *ring,
> -						  u64 s_start, u64 s_end)
> -{
> -	u64 start, end;
> -	struct amdgpu_ring *real_ring = mux->real_ring;
> -
> -	start = s_start & ring->buf_mask;
> -	end = s_end & ring->buf_mask;
> -
> -	if (start == end) {
> -		DRM_ERROR("no more data copied from sw ring\n");
> -		return;
> -	}
> -	if (start > end) {
> -		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
> -		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
> -					   (ring->ring_size >> 2) - start);
> -		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
> -	} else {
> -		amdgpu_ring_alloc(real_ring, end - start);
> -		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
> -	}
> -}
> -
>  void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
>  {
>  	struct amdgpu_mux_entry *e;
>  
> +	spin_lock(&mux->lock);
> +
> +	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
> +		amdgpu_mux_resubmit_chunks(mux);
> +
>  	e = amdgpu_ring_mux_sw_entry(mux, ring);
>  	if (!e) {
>  		DRM_ERROR("cannot find entry for sw ring\n");
> +		spin_unlock(&mux->lock);
> +		return;
> +	}
> +
> +	/* We could skip this set wptr as preemption in process. */
> +	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && mux->pending_trailing_fence_signaled) {
> +		spin_unlock(&mux->lock);
>  		return;
>  	}
>  
> -	spin_lock(&mux->lock);
>  	e->sw_cptr = e->sw_wptr;
> +	/* Update cptr if the package already copied in resubmit functions */
> +	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && e->sw_cptr < mux->wptr_resubmit)
> +		e->sw_cptr = mux->wptr_resubmit;
>  	e->sw_wptr = wptr;
>  	e->start_ptr_in_hw_ring = mux->real_ring->wptr;
>  
> -	amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
> -	e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> -	amdgpu_ring_commit(mux->real_ring);
> -
> +	/* Skip copying for the packages already resubmitted.*/
> +	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT || mux->wptr_resubmit < wptr) {
> +		amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
> +		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> +		amdgpu_ring_commit(mux->real_ring);
> +	} else {
> +		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> +	}
>  	spin_unlock(&mux->lock);
>  }
>  
> @@ -145,7 +254,7 @@ u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ri
>  	return e->sw_wptr;
>  }
>  
> -/*
> +/**
>   * The return value of the readptr is not precise while the other rings could
>   * write data onto the real ring buffer.After overwriting on the real ring, we
>   * can not decide if our packages have been excuted or not read yet. However,
> @@ -235,3 +344,169 @@ unsigned int amdgpu_sw_ring_priority(int idx)
>  	return idx < ARRAY_SIZE(sw_ring_info) ?
>  		sw_ring_info[idx].hw_pio : AMDGPU_RING_PRIO_DEFAULT;
>  }
> +
> +/*Scan on low prio rings to have unsignaled fence and high ring has no fence.*/
> +int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux)
> +{
> +	struct amdgpu_ring *ring;
> +	int i, need_preempt;
> +
> +	need_preempt = 0;
> +	for (i = 0; i < mux->num_ring_entries; i++) {
> +		ring = mux->ring_entry[i].ring;
> +		if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT &&
> +		    amdgpu_fence_count_emitted(ring) > 0)
> +			return 0;
> +		if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT &&
> +		    amdgpu_fence_last_unsignaled_time_us(ring) >
> +		    AMDGPU_MAX_LAST_UNSIGNALED_THRESHOLD_US)
> +			need_preempt = 1;
> +	}
> +	return need_preempt && !mux->s_resubmit;
> +}
> +
> +/* Trigger Mid-Command Buffer Preemption (MCBP) and find if we need to resubmit. */
> +int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux)
> +{
> +	int r;
> +
> +	spin_lock(&mux->lock);
> +	mux->pending_trailing_fence_signaled = true;
> +	r = amdgpu_ring_preempt_ib(mux->real_ring);
> +	spin_unlock(&mux->lock);
> +	return r;
> +}
> +
> +void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +	WARN_ON(!ring->is_sw_ring);
> +	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
> +		if (amdgpu_mcbp_scan(mux) > 0)
> +			amdgpu_mcbp_trigger_preempt(mux);
> +		return;
> +	}
> +
> +	amdgpu_ring_mux_start_ib(mux, ring);
> +}
> +
> +void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +	WARN_ON(!ring->is_sw_ring);
> +	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
> +		return;
> +	amdgpu_ring_mux_end_ib(mux, ring);
> +}
> +
> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_mux_entry *e;
> +	struct amdgpu_mux_chunk *chunk;
> +
> +	spin_lock(&mux->lock);
> +	amdgpu_mux_resubmit_chunks(mux);
> +	spin_unlock(&mux->lock);
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("cannot find entry!\n");
> +		return;
> +	}
> +
> +	chunk = kmem_cache_alloc(amdgpu_mux_chunk_slab, GFP_KERNEL);
> +	if (!chunk) {
> +		DRM_ERROR("alloc amdgpu_mux_chunk_slab failed\n");
> +		return;
> +	}
> +
> +	chunk->start = ring->wptr;
> +	list_add_tail(&chunk->entry, &e->list);
> +}
> +
> +static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	uint32_t last_seq, size = 0;
> +	struct amdgpu_mux_entry *e;
> +	struct amdgpu_mux_chunk *chunk, *tmp;
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("cannot find entry!\n");
> +		return;
> +	}
> +
> +	last_seq = atomic_read(&ring->fence_drv.last_seq);
> +
> +	list_for_each_entry_safe(chunk, tmp, &e->list, entry) {
> +		if (chunk->sync_seq <= last_seq) {
> +			list_del(&chunk->entry);
> +			kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
> +		} else {
> +			size++;
> +		}
> +	}
> +}
> +
> +void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_mux_entry *e;
> +	struct amdgpu_mux_chunk *chunk;
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("cannot find entry!\n");
> +		return;
> +	}
> +
> +	chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);
> +	if (!chunk) {
> +		DRM_ERROR("cannot find chunk!\n");
> +		return;
> +	}
> +
> +	chunk->end = ring->wptr;
> +	chunk->sync_seq = READ_ONCE(ring->fence_drv.sync_seq);
> +
> +	scan_and_remove_signaled_chunk(mux, ring);
> +}
> +
> +bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux)
> +{
> +	struct amdgpu_mux_entry *e;
> +	struct amdgpu_ring *ring = NULL;
> +	int i;
> +
> +	if (!mux->pending_trailing_fence_signaled)
> +		return false;
> +
> +	if (mux->real_ring->trail_seq != le32_to_cpu(*mux->real_ring->trail_fence_cpu_addr))
> +		return false;
> +
> +	for (i = 0; i < mux->num_ring_entries; i++) {
> +		e = &mux->ring_entry[i];
> +		if (e->ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
> +			ring = e->ring;
> +			break;
> +		}
> +	}
> +
> +	if (!ring) {
> +		DRM_ERROR("cannot find low priority ring\n");
> +		return false;
> +	}
> +
> +	amdgpu_fence_process(ring);
> +	if (amdgpu_fence_count_emitted(ring) > 0) {
> +		mux->s_resubmit = true;
> +		mux->seqno_to_resubmit = ring->fence_drv.sync_seq;
> +		amdgpu_ring_mux_schedule_resubmit(mux);
> +	}
> +
> +	mux->pending_trailing_fence_signaled = false;
> +	return true;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> index 28399f4b0e5c..aa758ebc86ae 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> @@ -29,6 +29,7 @@
>  #include "amdgpu_ring.h"
>  
>  struct amdgpu_ring;
> +
>  /**
>   * struct amdgpu_mux_entry - the entry recording software rings copying information.
>   * @ring: the pointer to the software ring.
> @@ -37,6 +38,7 @@ struct amdgpu_ring;
>   * @sw_cptr: the position of the copy pointer in the sw ring.
>   * @sw_rptr: the read pointer in software ring.
>   * @sw_wptr: the write pointer in software ring.
> + * @list: list head for amdgpu_mux_chunk
>   */
>  struct amdgpu_mux_entry {
>  	struct                  amdgpu_ring *ring;
> @@ -45,6 +47,7 @@ struct amdgpu_mux_entry {
>  	u64                     sw_cptr;
>  	u64                     sw_rptr;
>  	u64                     sw_wptr;
> +	struct list_head        list;
>  };
>  
>  struct amdgpu_ring_mux {
> @@ -55,6 +58,26 @@ struct amdgpu_ring_mux {
>  	unsigned int            ring_entry_size;
>  	/*the lock for copy data from different software rings*/
>  	spinlock_t              lock;
> +	bool                    s_resubmit;
> +	uint32_t                seqno_to_resubmit;
> +	u64                     wptr_resubmit;
> +	struct timer_list       resubmit_timer;
> +
> +	bool                    pending_trailing_fence_signaled;
> +};
> +
> +/**
> + * struct amdgpu_mux_chunk - save the location of indirect buffer's package on softare rings.
> + * @entry: the list entry.
> + * @sync_seq: the fence seqno related with the saved IB.
> + * @start:- start location on the software ring.
> + * @end:- end location on the software ring.
> + */
> +struct amdgpu_mux_chunk {
> +	struct list_head        entry;
> +	uint32_t                sync_seq;
> +	u64                     start;
> +	u64                     end;
>  };
>  
>  int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
> @@ -64,15 +87,17 @@ int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring
>  void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr);
>  u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
>  u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux);
>  
>  u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring);
>  u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring);
>  void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring);
> -
>  void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
>  void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring);
>  void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring);
> -
>  const char *amdgpu_sw_ring_name(int idx);
>  unsigned int amdgpu_sw_ring_priority(int idx);
> +
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 9596c22fded6..b7e94553f4fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -601,6 +601,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>  	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>  		return 0;
>  
> +	amdgpu_ring_ib_begin(ring);
>  	if (ring->funcs->init_cond_exec)
>  		patch_offset = amdgpu_ring_init_cond_exec(ring);
>  
> @@ -661,6 +662,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>  		amdgpu_ring_emit_switch_buffer(ring);
>  		amdgpu_ring_emit_switch_buffer(ring);
>  	}
> +	amdgpu_ring_ib_end(ring);
>  	return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 01ec0551d26a..42bda7766e11 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -5620,7 +5620,7 @@ static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
>  	ring->trail_seq += 1;
>  	amdgpu_ring_alloc(ring, 13);
>  	gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
> -				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
> +				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC | AMDGPU_FENCE_FLAG_INT);
>  	/*reset the CP_VMID_PREEMPT after trailing fence*/
>  	amdgpu_ring_emit_wreg(ring,
>  			      SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT),
> @@ -6046,8 +6046,9 @@ static int gfx_v9_0_eop_irq(struct amdgpu_device *adev,
>  
>  	switch (me_id) {
>  	case 0:
> -		/* Fence signals are handled on the software rings*/
> -		if (adev->gfx.num_gfx_rings) {
> +		if (adev->gfx.num_gfx_rings &&
> +		    !amdgpu_mcbp_handle_trailing_fence_irq(&adev->gfx.muxer)) {
> +			/* Fence signals are handled on the software rings*/
>  			for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
>  				amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
>  		}


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-11-10 17:54                       ` Alex Deucher
@ 2022-11-11  6:15                         ` Zhu, Jiadong
  0 siblings, 0 replies; 29+ messages in thread
From: Zhu, Jiadong @ 2022-11-11  6:15 UTC (permalink / raw)
  To: Alex Deucher, Michel Dänzer
  Cc: Tuikov, Luben, Huang, Ray, Koenig, Christian, amd-gfx

[AMD Official Use Only - General]

Hi Michel,

It is true that we don’t get obvious improvement on performance with these patches.
The original requirement of using mcbp is that when there is a very long ib package with many draw cmds on low priority which uses up gpu utilization, we give a chance to high priority ibs executed by gpu.
The total performance could be dropped as mcbp drains the pipe and the low priority ibs would be resubmitted again after that.

This set of patches is mainly to implement priority queues by software rings. We may use other method instead of mcbp to improve it later.

Thanks,
Jiadong

-----Original Message-----
From: Alex Deucher <alexdeucher@gmail.com>
Sent: Friday, November 11, 2022 1:54 AM
To: Michel Dänzer <michel@daenzer.net>
Cc: Zhu, Jiadong <Jiadong.Zhu@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)

On Thu, Nov 10, 2022 at 12:00 PM Michel Dänzer <michel@daenzer.net> wrote:
>
> On 2022-11-08 09:01, Zhu, Jiadong wrote:> From: Michel Dänzer
> <michel@daenzer.net>
> >
> >>>> The bad news is that this series still makes some things very slow. The most extreme examples so far are glxgears (runs at ~400 fps now, ~7000 fps before, i.e. almost 20x slowdown) and hexchat (scrolling one page now takes ~1 second, I can see it drawing line by line; before it was almost instantaneous). I suspect this series makes the overhead of running a single GPU job much bigger. On the bright side, I'm not noticing any significant intermittent freezes anymore.
> >>>
> >>> Hi Michel,
> >>>
> >>> Thanks for the trying.
> >>> Is there high priority jobs running while executing glxgears?
> >>
> >> Yes, mutter is submitting high priority jobs. However, I don't think that can explain the problem by itself:
> >>
> >> mutter only draws once per display refresh cycle. Let's assume mutter's GPU work takes ~6-7ms (conservative example, should be less than that usually). That leaves ~10ms per display refresh cycle (at 60 Hz refresh rate) where GPU work from glxgears & Xwayland can run without getting preempted. Since glxgears runs at ~7000 fps without this series, it should be able to draw at least ~70 frames in 10ms[0], which corresponds to over 4000 fps. Yet it manages only 1/10 of that.
> >>
> >> [0] Worst case consideration, ignoring the fact that without this series, glxgears runs at ~7000 fps while mutter sustains 60 fps.
> >
> > I reproduced the glxgears 400fps scenario locally. The issue is caused by the patch5 "drm/amdgpu: Improve the software rings priority scheduler" which slows down the low priority scheduler thread if high priority ib is under executing. I'll drop this patch as we cannot identify gpu bound according to the unsignaled fence, etc.
>
> Okay, I'm testing with patches 1-4 only now.
>
> So far I haven't noticed any negative effects, no slowdowns or intermittent freezes.
>
> The only issue is that there's hardly any positive effect either. While constantly moving the window of a GPU-limited GpuTest benchmark in circles, most of the time it looks exactly the same as without these patches. Only occasionally, at most every few seconds, I notice that the window movement becomes smoother for an instant.
>

I think it will largely depend on the workload.  The gfx pipe can only be preempted on draw boundaries so if most operations are a single draw, you probably won't see much difference.

Alex

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-11-10 17:00                     ` Michel Dänzer
  2022-11-10 17:54                       ` Alex Deucher
@ 2022-11-14 17:15                       ` Michel Dänzer
  2022-11-17  3:34                         ` Zhu, Jiadong
  1 sibling, 1 reply; 29+ messages in thread
From: Michel Dänzer @ 2022-11-14 17:15 UTC (permalink / raw)
  To: Zhu, Jiadong; +Cc: Tuikov, Luben, Huang, Ray, Koenig, Christian, amd-gfx

On 2022-11-10 18:00, Michel Dänzer wrote:
> On 2022-11-08 09:01, Zhu, Jiadong wrote:
>>
>> I reproduced the glxgears 400fps scenario locally. The issue is caused by the patch5 "drm/amdgpu: Improve the software rings priority scheduler" which slows down the low priority scheduler thread if high priority ib is under executing. I'll drop this patch as we cannot identify gpu bound according to the unsignaled fence, etc.
> 
> Okay, I'm testing with patches 1-4 only now.
> 
> So far I haven't noticed any negative effects, no slowdowns or intermittent freezes.

I'm afraid I may have run into another issue. I just hit a GPU hang, see the
journalctl excerpt below.

(I tried rebooting the machine via SSH after this, but it never seemed to
complete, so I had to hard-power-off the machine by holding the power
button for a few seconds)

I can't be sure that the GPU hang is directly related to this series,
but it seems plausible, and I hadn't hit a GPU hang in months if not
over a year before. If this series results in potentially hitting a
GPU hang every few days, it definitely doesn't provide enough benefit
to justify that.


Nov 14 17:21:22 thor kernel: [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring gfx_high timeout, signaled seq=1166051, emitted seq=1166052
Nov 14 17:21:22 thor kernel: [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process gnome-shell pid 2828 thread gnome-shel:cs0 pid 2860
Nov 14 17:21:22 thor kernel: amdgpu 0000:05:00.0: amdgpu: GPU reset begin!
Nov 14 17:21:22 thor kernel: amdgpu 0000:05:00.0: amdgpu: free PSP TMR buffer
Nov 14 17:21:22 thor kernel: amdgpu 0000:05:00.0: amdgpu: MODE2 reset
Nov 14 17:21:22 thor kernel: amdgpu 0000:05:00.0: amdgpu: GPU reset succeeded, trying to resume
Nov 14 17:21:22 thor kernel: [drm] PCIE GART of 1024M enabled.
Nov 14 17:21:22 thor kernel: [drm] PTB located at 0x000000F400A00000
Nov 14 17:21:22 thor kernel: [drm] VRAM is lost due to GPU reset!
Nov 14 17:21:22 thor kernel: [drm] PSP is resuming...
Nov 14 17:21:22 thor kernel: [drm] reserve 0x400000 from 0xf431c00000 for PSP TMR
Nov 14 17:21:23 thor kernel: amdgpu 0000:05:00.0: amdgpu: RAS: optional ras ta ucode is not available
Nov 14 17:21:23 thor kernel: amdgpu 0000:05:00.0: amdgpu: RAP: optional rap ta ucode is not available
Nov 14 17:21:23 thor gnome-shell[3639]: amdgpu: The CS has been rejected (-125), but the context isn't robust.
Nov 14 17:21:23 thor gnome-shell[3639]: amdgpu: The process will be terminated.
Nov 14 17:21:23 thor kernel: [drm] kiq ring mec 2 pipe 1 q 0
Nov 14 17:21:23 thor kernel: amdgpu 0000:05:00.0: [drm:amdgpu_ring_test_helper [amdgpu]] *ERROR* ring kiq_2.1.0 test failed (-110)
Nov 14 17:21:23 thor kernel: [drm:amdgpu_gfx_enable_kcq.cold [amdgpu]] *ERROR* KCQ enable failed
Nov 14 17:21:23 thor kernel: [drm:amdgpu_device_ip_resume_phase2 [amdgpu]] *ERROR* resume of IP block <gfx_v9_0> failed -110
Nov 14 17:21:23 thor kernel: amdgpu 0000:05:00.0: amdgpu: GPU reset(2) failed
Nov 14 17:21:23 thor kernel: amdgpu 0000:05:00.0: amdgpu: GPU reset end with ret = -110
Nov 14 17:21:23 thor kernel: [drm:amdgpu_job_timedout [amdgpu]] *ERROR* GPU Recovery Failed: -110
[...]
Nov 14 17:21:33 thor kernel: [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring gfx_high timeout, signaled seq=1166052, emitted seq=1166052
Nov 14 17:21:33 thor kernel: [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process gnome-shell pid 2828 thread gnome-shel:cs0 pid 2860
Nov 14 17:21:33 thor kernel: amdgpu 0000:05:00.0: amdgpu: GPU reset begin!


-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)
  2022-11-14 17:15                       ` Michel Dänzer
@ 2022-11-17  3:34                         ` Zhu, Jiadong
  0 siblings, 0 replies; 29+ messages in thread
From: Zhu, Jiadong @ 2022-11-17  3:34 UTC (permalink / raw)
  To: Michel Dänzer; +Cc: Tuikov, Luben, Huang, Ray, Koenig, Christian, amd-gfx

[AMD Official Use Only - General]

Hi Michel,

I didn't reproduce the hang yet but find a race condition related with fence signaling time. I updated the patch series based on kernel 5.18.

Thanks,
Jiadong

-----Original Message-----
From: Michel Dänzer <michel@daenzer.net>
Sent: Tuesday, November 15, 2022 1:15 AM
To: Zhu, Jiadong <Jiadong.Zhu@amd.com>
Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8)

On 2022-11-10 18:00, Michel Dänzer wrote:
> On 2022-11-08 09:01, Zhu, Jiadong wrote:
>>
>> I reproduced the glxgears 400fps scenario locally. The issue is caused by the patch5 "drm/amdgpu: Improve the software rings priority scheduler" which slows down the low priority scheduler thread if high priority ib is under executing. I'll drop this patch as we cannot identify gpu bound according to the unsignaled fence, etc.
>
> Okay, I'm testing with patches 1-4 only now.
>
> So far I haven't noticed any negative effects, no slowdowns or intermittent freezes.

I'm afraid I may have run into another issue. I just hit a GPU hang, see the journalctl excerpt below.

(I tried rebooting the machine via SSH after this, but it never seemed to complete, so I had to hard-power-off the machine by holding the power button for a few seconds)

I can't be sure that the GPU hang is directly related to this series, but it seems plausible, and I hadn't hit a GPU hang in months if not over a year before. If this series results in potentially hitting a GPU hang every few days, it definitely doesn't provide enough benefit to justify that.


Nov 14 17:21:22 thor kernel: [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring gfx_high timeout, signaled seq=1166051, emitted seq=1166052 Nov 14 17:21:22 thor kernel: [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process gnome-shell pid 2828 thread gnome-shel:cs0 pid 2860 Nov 14 17:21:22 thor kernel: amdgpu 0000:05:00.0: amdgpu: GPU reset begin!
Nov 14 17:21:22 thor kernel: amdgpu 0000:05:00.0: amdgpu: free PSP TMR buffer Nov 14 17:21:22 thor kernel: amdgpu 0000:05:00.0: amdgpu: MODE2 reset Nov 14 17:21:22 thor kernel: amdgpu 0000:05:00.0: amdgpu: GPU reset succeeded, trying to resume Nov 14 17:21:22 thor kernel: [drm] PCIE GART of 1024M enabled.
Nov 14 17:21:22 thor kernel: [drm] PTB located at 0x000000F400A00000 Nov 14 17:21:22 thor kernel: [drm] VRAM is lost due to GPU reset!
Nov 14 17:21:22 thor kernel: [drm] PSP is resuming...
Nov 14 17:21:22 thor kernel: [drm] reserve 0x400000 from 0xf431c00000 for PSP TMR Nov 14 17:21:23 thor kernel: amdgpu 0000:05:00.0: amdgpu: RAS: optional ras ta ucode is not available Nov 14 17:21:23 thor kernel: amdgpu 0000:05:00.0: amdgpu: RAP: optional rap ta ucode is not available Nov 14 17:21:23 thor gnome-shell[3639]: amdgpu: The CS has been rejected (-125), but the context isn't robust.
Nov 14 17:21:23 thor gnome-shell[3639]: amdgpu: The process will be terminated.
Nov 14 17:21:23 thor kernel: [drm] kiq ring mec 2 pipe 1 q 0 Nov 14 17:21:23 thor kernel: amdgpu 0000:05:00.0: [drm:amdgpu_ring_test_helper [amdgpu]] *ERROR* ring kiq_2.1.0 test failed (-110) Nov 14 17:21:23 thor kernel: [drm:amdgpu_gfx_enable_kcq.cold [amdgpu]] *ERROR* KCQ enable failed Nov 14 17:21:23 thor kernel: [drm:amdgpu_device_ip_resume_phase2 [amdgpu]] *ERROR* resume of IP block <gfx_v9_0> failed -110 Nov 14 17:21:23 thor kernel: amdgpu 0000:05:00.0: amdgpu: GPU reset(2) failed Nov 14 17:21:23 thor kernel: amdgpu 0000:05:00.0: amdgpu: GPU reset end with ret = -110 Nov 14 17:21:23 thor kernel: [drm:amdgpu_job_timedout [amdgpu]] *ERROR* GPU Recovery Failed: -110 [...] Nov 14 17:21:33 thor kernel: [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring gfx_high timeout, signaled seq=1166052, emitted seq=1166052 Nov 14 17:21:33 thor kernel: [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process gnome-shell pid 2828 thread gnome-shel:cs0 pid 2860 Nov 14 17:21:33 thor kernel: amdgpu 0000:05:00.0: amdgpu: GPU reset begin!


--
Earthling Michel Dänzer            |                  https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fredhat.com%2F&amp;data=05%7C01%7CJiadong.Zhu%40amd.com%7C33282c71226444b6c24508dac663cf4a%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638040429257070484%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=Cb8OyRawA9T9%2FfGSehB9r9JY%2FKcC4%2B%2FCdY8UaRh79t4%3D&amp;reserved=0
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8)
  2022-10-18  9:08 [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8) jiadong.zhu
                   ` (5 preceding siblings ...)
  2022-10-20 14:49 ` Michel Dänzer
@ 2022-11-22  5:33 ` Luben Tuikov
  6 siblings, 0 replies; 29+ messages in thread
From: Luben Tuikov @ 2022-11-22  5:33 UTC (permalink / raw)
  To: jiadong.zhu, amd-gfx
  Cc: Andrey Grodzovsky, Michel Dänzer, Huang Rui, Christian Koenig

On 2022-10-18 05:08, jiadong.zhu@amd.com wrote:
> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
> 
> The software ring is created to support priority context while there is only
> one hardware queue for gfx.
> 
> Every software ring has its fence driver and could be used as an ordinary ring
> for the GPU scheduler.
> Multiple software rings are bound to a real ring with the ring muxer. The
> packages committed on the software ring are copied to the real ring.
> 
> v2: Use array to store software ring entry.
> v3: Remove unnecessary prints.
> v4: Remove amdgpu_ring_sw_init/fini functions,
> using gtt for sw ring buffer for later dma copy
> optimization.
> v5: Allocate ring entry dynamically in the muxer.
> v6: Update comments for the ring muxer.
> v7: Modify for function naming.
> v8: Combine software ring functions into amdgpu_ring_mux.c
> 
> Cc: Christian Koenig <Christian.Koenig@amd.com>
> Cc: Luben Tuikov <Luben.Tuikov@amd.com>
> Cc: Andrey Grodzovsky  <Andrey.Grodzovsky@amd.com>
> Cc: Michel Dänzer <michel@daenzer.net>
> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
> Acked-by: Huang Rui <ray.huang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/Makefile          |   3 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h      |   3 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   4 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 217 +++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  76 +++++++
>  5 files changed, 302 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 3e0e2eb7e235..add7da53950c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -58,7 +58,8 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
>  	amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
>  	amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
>  	amdgpu_fw_attestation.o amdgpu_securedisplay.o \
> -	amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o
> +	amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
> +	amdgpu_ring_mux.o
>  
>  amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 53526ffb2ce1..9996dadb39f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -33,6 +33,7 @@
>  #include "amdgpu_imu.h"
>  #include "soc15.h"
>  #include "amdgpu_ras.h"
> +#include "amdgpu_ring_mux.h"
>  
>  /* GFX current status */
>  #define AMDGPU_GFX_NORMAL_MODE			0x00000000L
> @@ -346,6 +347,8 @@ struct amdgpu_gfx {
>  	struct amdgpu_gfx_ras		*ras;
>  
>  	bool				is_poweron;
> +
> +	struct amdgpu_ring_mux          muxer;
>  };
>  
>  #define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs->get_gpu_clock_counter((adev))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 7d89a52091c0..40b1277b4f0c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -278,6 +278,10 @@ struct amdgpu_ring {
>  	bool			is_mes_queue;
>  	uint32_t		hw_queue_id;
>  	struct amdgpu_mes_ctx_data *mes_ctx;
> +
> +	bool            is_sw_ring;
> +	unsigned int    entry_index;
> +
>  };
>  
>  #define amdgpu_ring_parse_cs(r, p, job, ib) ((r)->funcs->parse_cs((p), (job), (ib)))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> new file mode 100644
> index 000000000000..43cab8a37754
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> @@ -0,0 +1,217 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +#include <linux/slab.h>
> +#include <drm/drm_print.h>
> +
> +#include "amdgpu_ring_mux.h"
> +#include "amdgpu_ring.h"
> +#include "amdgpu.h"
> +
> +#define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
> +
> +int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
> +			 unsigned int entry_size)
> +{
> +	mux->real_ring = ring;
> +	mux->num_ring_entries = 0;
> +	mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
> +	if (!mux->ring_entry)
> +		return -ENOMEM;
> +
> +	mux->ring_entry_size = entry_size;
> +	spin_lock_init(&mux->lock);
> +
> +	return 0;
> +}
> +
> +void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
> +{
> +	kfree(mux->ring_entry);
> +	mux->ring_entry = NULL;
> +	mux->num_ring_entries = 0;
> +	mux->ring_entry_size = 0;
> +}
> +
> +int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_mux_entry *e;
> +
> +	if (mux->num_ring_entries >= mux->ring_entry_size) {
> +		DRM_ERROR("add sw ring exceeding max entry size\n");
> +		return -ENOENT;
> +	}
> +
> +	e = &mux->ring_entry[mux->num_ring_entries];
> +	ring->entry_index = mux->num_ring_entries;
> +	e->ring = ring;
> +
> +	mux->num_ring_entries += 1;
> +	return 0;
> +}
> +
> +static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
> +								struct amdgpu_ring *ring)
> +{
> +	return ring->entry_index < mux->ring_entry_size ?
> +			&mux->ring_entry[ring->entry_index] : NULL;
> +}
> +
> +/* copy packages on sw ring range[begin, end) */
> +static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
> +						  struct amdgpu_ring *ring,
> +						  u64 s_start, u64 s_end)
> +{
> +	u64 start, end;
> +	struct amdgpu_ring *real_ring = mux->real_ring;
> +
> +	start = s_start & ring->buf_mask;
> +	end = s_end & ring->buf_mask;
> +
> +	if (start == end) {
> +		DRM_ERROR("no more data copied from sw ring\n");
> +		return;
> +	}
> +	if (start > end) {
> +		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
> +					   (ring->ring_size >> 2) - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
> +	} else {
> +		amdgpu_ring_alloc(real_ring, end - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
> +	}
> +}
> +
> +void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
> +{
> +	struct amdgpu_mux_entry *e;
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("cannot find entry for sw ring\n");
> +		return;
> +	}
> +
> +	spin_lock(&mux->lock);
> +	e->sw_cptr = e->sw_wptr;
> +	e->sw_wptr = wptr;
> +	e->start_ptr_in_hw_ring = mux->real_ring->wptr;
> +
> +	amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
> +	e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> +	amdgpu_ring_commit(mux->real_ring);
> +
> +	spin_unlock(&mux->lock);
> +}
> +
> +u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_mux_entry *e;
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("cannot find entry for sw ring\n");
> +		return 0;
> +	}
> +
> +	return e->sw_wptr;
> +}
> +
> +/*
> + * The return value of the readptr is not precise while the other rings could
> + * write data onto the real ring buffer.After overwriting on the real ring, we
> + * can not decide if our packages have been excuted or not read yet. However,
> + * this function is only called by the tools such as umr to collect the latest
> + * packages for the hang analysis. We assume the hang happens near our latest
> + * submit. Thus we could use the following logic to give the clue:
> + * If the readptr is between start and end, then we return the copy pointer
> + * plus the distance from start to readptr. If the readptr is before start, we
> + * return the copy pointer. Lastly, if the readptr is past end, we return the
> + * write pointer.
> + */
> +u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)

Perhaps this comment would be better being a kernel-doc comment. Please make it so.

> +{
> +	struct amdgpu_mux_entry *e;
> +	u64 readp, offset, start, end;
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("no sw entry found!\n");
> +		return 0;
> +	}
> +
> +	readp = amdgpu_ring_get_rptr(mux->real_ring);
> +
> +	start = e->start_ptr_in_hw_ring & mux->real_ring->buf_mask;
> +	end = e->end_ptr_in_hw_ring & mux->real_ring->buf_mask;
> +	if (start > end) {
> +		if (readp <= end)
> +			readp += mux->real_ring->ring_size >> 2;
> +		end += mux->real_ring->ring_size >> 2;
> +	}
> +
> +	if (start <= readp && readp <= end) {
> +		offset = readp - start;
> +		e->sw_rptr = (e->sw_cptr + offset) & ring->buf_mask;
> +	} else if (readp < start) {
> +		e->sw_rptr = e->sw_cptr;
> +	} else {
> +		/* end < readptr */
> +		e->sw_rptr = e->sw_wptr;
> +	}
> +
> +	return e->sw_rptr;
> +}
> +
> +u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +	WARN_ON(!ring->is_sw_ring);
> +	return amdgpu_ring_mux_get_rptr(mux, ring);
> +}
> +
> +u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +	WARN_ON(!ring->is_sw_ring);
> +	return amdgpu_ring_mux_get_wptr(mux, ring);
> +}
> +
> +void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +	WARN_ON(!ring->is_sw_ring);
> +	amdgpu_ring_mux_set_wptr(mux, ring, ring->wptr);
> +}
> +
> +/* Override insert_nop to prevent emitting nops to the software rings */
> +void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
> +{
> +	WARN_ON(!ring->is_sw_ring);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> new file mode 100644
> index 000000000000..d91629589577
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> @@ -0,0 +1,76 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __AMDGPU_RING_MUX__
> +#define __AMDGPU_RING_MUX__
> +
> +#include <linux/timer.h>
> +#include <linux/spinlock.h>
> +#include "amdgpu_ring.h"
> +
> +struct amdgpu_ring;
> +/**
> + * struct amdgpu_mux_entry - the entry recording software rings copying information.
> + * @ring: the pointer to the software ring.
> + * @start_ptr_in_hw_ring: last start location copied to in the hardware ring.
> + * @end_ptr_in_hw_ring: last end location copied to in the hardware ring.
> + * @sw_cptr: the position of the copy pointer in the sw ring.
> + * @sw_rptr: the read pointer in software ring.
> + * @sw_wptr: the write pointer in software ring.
> + */
> +struct amdgpu_mux_entry {
> +	struct                  amdgpu_ring *ring;

"amdgpu_ring" should be next to "struct", like this:

	struct amdgpu_ring      *ring;

With these changes, this patch is:
Acked-by: Luben Tuikov <luben.tuikov@amd.com>

Given also that it has passed rigorous testing.

Regards,
Luben

> +	u64                     start_ptr_in_hw_ring;
> +	u64                     end_ptr_in_hw_ring;
> +	u64                     sw_cptr;
> +	u64                     sw_rptr;
> +	u64                     sw_wptr;
> +};
> +
> +struct amdgpu_ring_mux {
> +	struct amdgpu_ring      *real_ring;
> +
> +	struct amdgpu_mux_entry *ring_entry;
> +	unsigned int            num_ring_entries;
> +	unsigned int            ring_entry_size;
> +	/*the lock for copy data from different software rings*/
> +	spinlock_t              lock;
> +};
> +
> +int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
> +			 unsigned int entry_size);
> +void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux);
> +int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr);
> +u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +
> +u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring);
> +u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring);
> +void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring);
> +
> +void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
> +void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring);
> +void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring);
> +
> +#endif

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] drm/amdgpu: Modify unmap_queue format for gfx9 (v4)
  2022-10-18  9:08 ` [PATCH 3/5] drm/amdgpu: Modify unmap_queue format for gfx9 (v4) jiadong.zhu
@ 2022-11-22  5:46   ` Luben Tuikov
  0 siblings, 0 replies; 29+ messages in thread
From: Luben Tuikov @ 2022-11-22  5:46 UTC (permalink / raw)
  To: jiadong.zhu, amd-gfx; +Cc: Michel Dänzer, Huang Rui, Christian König

On 2022-10-18 05:08, jiadong.zhu@amd.com wrote:
> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
> 
> 1. Modify the unmap_queue package on gfx9. Add trailing fence to track the
>    preemption done.
> 2. Modify emit_ce_meta emit_de_meta functions for the resumed ibs.
> 
> v2: Restyle code not to use ternary operator.
> v3: Modify code format.
> v4: Enable Mid-Command Buffer Preemption for gfx9 by default.
> 
> Cc: Christian Koenig <Christian.Koenig@amd.com>
> Cc: Michel Dänzer <michel@daenzer.net>
> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
> Acked-by: Christian König <christian.koenig@amd.com>
> Acked-by: Huang Rui <ray.huang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 181 +++++++++++++++++++----
>  drivers/gpu/drm/amd/amdgpu/soc15d.h      |   2 +
>  3 files changed, 155 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index f08ee1ac281c..e90d327a589e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -60,6 +60,7 @@ enum amdgpu_ring_priority_level {
>  #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
>  #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
>  #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
> +#define AMDGPU_FENCE_FLAG_EXEC          (1 << 3)
>  
>  #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 8d4fbc9e3fc0..01ec0551d26a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -753,7 +753,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev);
>  static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
>  				struct amdgpu_cu_info *cu_info);
>  static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);
> -static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
> +static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume);
>  static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
>  static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
>  					  void *ras_error_status);
> @@ -826,9 +826,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring *kiq_ring,
>  			PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
>  
>  	if (action == PREEMPT_QUEUES_NO_UNMAP) {
> -		amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
> -		amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
> -		amdgpu_ring_write(kiq_ring, seq);
> +		amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & ring->buf_mask));
> +		amdgpu_ring_write(kiq_ring, 0);
> +		amdgpu_ring_write(kiq_ring, 0);
> +
>  	} else {
>  		amdgpu_ring_write(kiq_ring, 0);
>  		amdgpu_ring_write(kiq_ring, 0);
> @@ -5369,11 +5370,17 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>  
>  	control |= ib->length_dw | (vmid << 24);
>  
> -	if (amdgpu_sriov_vf(ring->adev) && (ib->flags & AMDGPU_IB_FLAG_PREEMPT)) {
> +	if (ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
>  		control |= INDIRECT_BUFFER_PRE_ENB(1);
>  
> +		if (flags & AMDGPU_IB_PREEMPTED)
> +			control |= INDIRECT_BUFFER_PRE_RESUME(1);
> +
>  		if (!(ib->flags & AMDGPU_IB_FLAG_CE) && vmid)
> -			gfx_v9_0_ring_emit_de_meta(ring);
> +			gfx_v9_0_ring_emit_de_meta(ring,
> +						   (!amdgpu_sriov_vf(ring->adev) &&
> +						   flags & AMDGPU_IB_PREEMPTED) ?
> +						   true : false);
>  	}
>  
>  	amdgpu_ring_write(ring, header);
> @@ -5428,17 +5435,23 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
>  	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>  	bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
>  	bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
> +	bool exec = flags & AMDGPU_FENCE_FLAG_EXEC;
> +	uint32_t dw2 = 0;
>  
>  	/* RELEASE_MEM - flush caches, send int */
>  	amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
> -	amdgpu_ring_write(ring, ((writeback ? (EOP_TC_WB_ACTION_EN |
> -					       EOP_TC_NC_ACTION_EN) :
> -					      (EOP_TCL1_ACTION_EN |
> -					       EOP_TC_ACTION_EN |
> -					       EOP_TC_WB_ACTION_EN |
> -					       EOP_TC_MD_ACTION_EN)) |
> -				 EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
> -				 EVENT_INDEX(5)));
> +
> +	if (writeback) {
> +		dw2 = EOP_TC_WB_ACTION_EN | EOP_TC_NC_ACTION_EN;
> +	} else {
> +		dw2 = EOP_TCL1_ACTION_EN | EOP_TC_ACTION_EN |
> +				EOP_TC_WB_ACTION_EN | EOP_TC_MD_ACTION_EN;
> +	}
> +	dw2 |= EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);

You can pull out EOP_TC_WB_ACTION_EN, as it appears in either conditional, to look
something like this:

	if (writeback) {
		dw2 = EOP_TC_NC_ACTION_EN;
	} else {
		dw2 = EOP_TCL1_ACTION_EN |
		      EOP_TC_ACTION_EN |
		      EOP_TC_MD_ACTION_EN;
	}
	dw2 |= EOP_TC_WB_ACTION_EN | EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);

Regards,
Luben

> +	if (exec)
> +		dw2 |= EOP_EXEC;
> +
> +	amdgpu_ring_write(ring, dw2);
>  	amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
>  
>  	/*
> @@ -5543,33 +5556,135 @@ static void gfx_v9_ring_emit_sb(struct amdgpu_ring *ring)
>  	amdgpu_ring_write(ring, 0);
>  }
>  
> -static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring)
> +static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool resume)
>  {
> +	struct amdgpu_device *adev = ring->adev;
>  	struct v9_ce_ib_state ce_payload = {0};
> -	uint64_t csa_addr;
> +	uint64_t offset, ce_payload_gpu_addr;
> +	void *ce_payload_cpu_addr;
>  	int cnt;
>  
>  	cnt = (sizeof(ce_payload) >> 2) + 4 - 2;
> -	csa_addr = amdgpu_csa_vaddr(ring->adev);
> +
> +	if (ring->is_mes_queue) {
> +		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
> +				  gfx[0].gfx_meta_data) +
> +			offsetof(struct v9_gfx_meta_data, ce_payload);
> +		ce_payload_gpu_addr =
> +			amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
> +		ce_payload_cpu_addr =
> +			amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
> +	} else {
> +		offset = offsetof(struct v9_gfx_meta_data, ce_payload);
> +		ce_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
> +		ce_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
> +	}
>  
>  	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, cnt));
>  	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(2) |
>  				 WRITE_DATA_DST_SEL(8) |
>  				 WR_CONFIRM) |
>  				 WRITE_DATA_CACHE_POLICY(0));
> -	amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
> -	amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
> -	amdgpu_ring_write_multiple(ring, (void *)&ce_payload, sizeof(ce_payload) >> 2);
> +	amdgpu_ring_write(ring, lower_32_bits(ce_payload_gpu_addr));
> +	amdgpu_ring_write(ring, upper_32_bits(ce_payload_gpu_addr));
> +
> +	if (resume)
> +		amdgpu_ring_write_multiple(ring, ce_payload_cpu_addr,
> +					   sizeof(ce_payload) >> 2);
> +	else
> +		amdgpu_ring_write_multiple(ring, (void *)&ce_payload,
> +					   sizeof(ce_payload) >> 2);
> +}
> +
> +static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
> +{
> +	int i, r = 0;
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> +	struct amdgpu_ring *kiq_ring = &kiq->ring;
> +	unsigned long flags;
> +
> +	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
> +		return -EINVAL;
> +
> +	spin_lock_irqsave(&kiq->ring_lock, flags);
> +
> +	if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
> +		spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +		return -ENOMEM;
> +	}
> +
> +	/* assert preemption condition */
> +	amdgpu_ring_set_preempt_cond_exec(ring, false);
> +
> +	ring->trail_seq += 1;
> +	amdgpu_ring_alloc(ring, 13);
> +	gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
> +				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
> +	/*reset the CP_VMID_PREEMPT after trailing fence*/
> +	amdgpu_ring_emit_wreg(ring,
> +			      SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT),
> +			      0x0);
> +
> +	/* assert IB preemption, emit the trailing fence */
> +	kiq->pmf->kiq_unmap_queues(kiq_ring, ring, PREEMPT_QUEUES_NO_UNMAP,
> +				   ring->trail_fence_gpu_addr,
> +				   ring->trail_seq);
> +
> +	amdgpu_ring_commit(kiq_ring);
> +	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +
> +	/* poll the trailing fence */
> +	for (i = 0; i < adev->usec_timeout; i++) {
> +		if (ring->trail_seq ==
> +			le32_to_cpu(*ring->trail_fence_cpu_addr))
> +			break;
> +		udelay(1);
> +	}
> +
> +	if (i >= adev->usec_timeout) {
> +		r = -EINVAL;
> +		DRM_ERROR("ring %d failed to preempt ib\n", ring->idx);
> +	}
> +
> +	amdgpu_ring_commit(ring);
> +
> +	/* deassert preemption condition */
> +	amdgpu_ring_set_preempt_cond_exec(ring, true);
> +	return r;
>  }
>  
> -static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
> +static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume)
>  {
> +	struct amdgpu_device *adev = ring->adev;
>  	struct v9_de_ib_state de_payload = {0};
> -	uint64_t csa_addr, gds_addr;
> +	uint64_t offset, gds_addr, de_payload_gpu_addr;
> +	void *de_payload_cpu_addr;
>  	int cnt;
>  
> -	csa_addr = amdgpu_csa_vaddr(ring->adev);
> -	gds_addr = csa_addr + 4096;
> +	if (ring->is_mes_queue) {
> +		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
> +				  gfx[0].gfx_meta_data) +
> +			offsetof(struct v9_gfx_meta_data, de_payload);
> +		de_payload_gpu_addr =
> +			amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
> +		de_payload_cpu_addr =
> +			amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
> +
> +		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
> +				  gfx[0].gds_backup) +
> +			offsetof(struct v9_gfx_meta_data, de_payload);
> +		gds_addr = amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
> +	} else {
> +		offset = offsetof(struct v9_gfx_meta_data, de_payload);
> +		de_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
> +		de_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
> +
> +		gds_addr = ALIGN(amdgpu_csa_vaddr(ring->adev) +
> +				 AMDGPU_CSA_SIZE - adev->gds.gds_size,
> +				 PAGE_SIZE);
> +	}
> +
>  	de_payload.gds_backup_addrlo = lower_32_bits(gds_addr);
>  	de_payload.gds_backup_addrhi = upper_32_bits(gds_addr);
>  
> @@ -5579,9 +5694,15 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
>  				 WRITE_DATA_DST_SEL(8) |
>  				 WR_CONFIRM) |
>  				 WRITE_DATA_CACHE_POLICY(0));
> -	amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
> -	amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
> -	amdgpu_ring_write_multiple(ring, (void *)&de_payload, sizeof(de_payload) >> 2);
> +	amdgpu_ring_write(ring, lower_32_bits(de_payload_gpu_addr));
> +	amdgpu_ring_write(ring, upper_32_bits(de_payload_gpu_addr));
> +
> +	if (resume)
> +		amdgpu_ring_write_multiple(ring, de_payload_cpu_addr,
> +					   sizeof(de_payload) >> 2);
> +	else
> +		amdgpu_ring_write_multiple(ring, (void *)&de_payload,
> +					   sizeof(de_payload) >> 2);
>  }
>  
>  static void gfx_v9_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
> @@ -5597,8 +5718,9 @@ static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
>  {
>  	uint32_t dw2 = 0;
>  
> -	if (amdgpu_sriov_vf(ring->adev))
> -		gfx_v9_0_ring_emit_ce_meta(ring);
> +	gfx_v9_0_ring_emit_ce_meta(ring,
> +				   (!amdgpu_sriov_vf(ring->adev) &&
> +				   flags & AMDGPU_IB_PREEMPTED) ? true : false);
>  
>  	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>  	if (flags & AMDGPU_HAVE_CTX_SWITCH) {
> @@ -6925,6 +7047,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>  	.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
>  	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>  	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
> +	.preempt_ib = gfx_v9_0_ring_preempt_ib,
>  	.emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
>  	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>  	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> index 799925d22fc8..2357ff39323f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> @@ -162,6 +162,7 @@
>  		 * 2 - Bypass
>  		 */
>  #define     INDIRECT_BUFFER_PRE_ENB(x)		 ((x) << 21)
> +#define     INDIRECT_BUFFER_PRE_RESUME(x)               ((x) << 30)
>  #define	PACKET3_COPY_DATA				0x40
>  #define	PACKET3_PFP_SYNC_ME				0x42
>  #define	PACKET3_COND_WRITE				0x45
> @@ -184,6 +185,7 @@
>  #define		EOP_TC_ACTION_EN                        (1 << 17) /* L2 */
>  #define		EOP_TC_NC_ACTION_EN			(1 << 19)
>  #define		EOP_TC_MD_ACTION_EN			(1 << 21) /* L2 metadata */
> +#define		EOP_EXEC				(1 << 28) /* For Trailing Fence */
>  
>  #define		DATA_SEL(x)                             ((x) << 29)
>  		/* 0 - discard

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2022-11-22  5:46 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-18  9:08 [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8) jiadong.zhu
2022-10-18  9:08 ` [PATCH 2/5] drm/amdgpu: Add software ring callbacks for gfx9 (v8) jiadong.zhu
2022-10-18  9:08 ` [PATCH 3/5] drm/amdgpu: Modify unmap_queue format for gfx9 (v4) jiadong.zhu
2022-11-22  5:46   ` Luben Tuikov
2022-10-18  9:08 ` [PATCH 4/5] drm/amdgpu: MCBP based on DRM scheduler (v8) jiadong.zhu
2022-10-31 12:01   ` Michel Dänzer
2022-11-01  1:04     ` Zhu, Jiadong
2022-11-01  9:10       ` Michel Dänzer
2022-11-01  9:58         ` Zhu, Jiadong
2022-11-01 10:09           ` Michel Dänzer
2022-11-02 11:26             ` Michel Dänzer
2022-11-03  2:58               ` Zhu, Jiadong
2022-11-03  9:04                 ` Michel Dänzer
2022-11-08  8:01                   ` Zhu, Jiadong
2022-11-10 17:00                     ` Michel Dänzer
2022-11-10 17:54                       ` Alex Deucher
2022-11-11  6:15                         ` Zhu, Jiadong
2022-11-14 17:15                       ` Michel Dänzer
2022-11-17  3:34                         ` Zhu, Jiadong
2022-11-10 19:27   ` Luben Tuikov
2022-10-18  9:08 ` [PATCH 5/5] drm/amdgpu: Improve the software ring priority scheduler jiadong.zhu
2022-10-18 11:24   ` Christian König
2022-10-19 15:14 ` [PATCH 1/5] drm/amdgpu: Introduce gfx software ring (v8) Luben Tuikov
2022-10-20 14:49 ` Michel Dänzer
2022-10-20 14:59   ` Christian König
2022-10-21  7:42     ` Michel Dänzer
2022-10-31  8:10     ` Zhu, Jiadong
2022-10-31 11:58       ` Michel Dänzer
2022-11-22  5:33 ` Luben Tuikov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.