All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/amdgpu: Introduce gfx software ring (v7)
@ 2022-09-23 13:16 jiadong.zhu
  2022-09-23 13:16 ` [PATCH 2/4] drm/amdgpu: Add software ring callbacks for gfx9 (v6) jiadong.zhu
                   ` (3 more replies)
  0 siblings, 4 replies; 15+ messages in thread
From: jiadong.zhu @ 2022-09-23 13:16 UTC (permalink / raw)
  To: amd-gfx; +Cc: Luben Tuikov, Jiadong.Zhu, Christian Koenig, Andrey Grodzovsky

From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>

The software ring is created to support priority contexts while there is only
one hardware queue for gfx.

Every software ring has its own fence driver and could be used as an ordinary
ring for the GPU scheduler.
Multiple software rings are bound to a real ring with the ring muxer. The
packages committed on the software ring are copied to the real ring.

v2: Use array to store software ring entry.
v3: Remove unnecessary prints.
v4: Remove amdgpu_ring_sw_init/fini functions,
using gtt for sw ring buffer for later dma copy
optimization.
v5: Allocate ring entry dynamically in the muxer.
v6: Update comments for the ring muxer.
v7: Modify for function naming.

Cc: Christian Koenig <Christian.Koenig@amd.com>
Cc: Luben Tuikov <Luben.Tuikov@amd.com>
Cc: Andrey Grodzovsky  <Andrey.Grodzovsky@amd.com>
Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/Makefile          |   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h      |   3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   4 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 183 +++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  68 +++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c  |  61 +++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.h  |  38 ++++
 7 files changed, 359 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index 3e0e2eb7e235..85224bc81ce5 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -58,7 +58,8 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
 	amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
 	amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
 	amdgpu_fw_attestation.o amdgpu_securedisplay.o \
-	amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o
+	amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
+	amdgpu_sw_ring.o amdgpu_ring_mux.o
 
 amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 53526ffb2ce1..9996dadb39f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -33,6 +33,7 @@
 #include "amdgpu_imu.h"
 #include "soc15.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_ring_mux.h"
 
 /* GFX current status */
 #define AMDGPU_GFX_NORMAL_MODE			0x00000000L
@@ -346,6 +347,8 @@ struct amdgpu_gfx {
 	struct amdgpu_gfx_ras		*ras;
 
 	bool				is_poweron;
+
+	struct amdgpu_ring_mux          muxer;
 };
 
 #define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs->get_gpu_clock_counter((adev))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 7d89a52091c0..40b1277b4f0c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -278,6 +278,10 @@ struct amdgpu_ring {
 	bool			is_mes_queue;
 	uint32_t		hw_queue_id;
 	struct amdgpu_mes_ctx_data *mes_ctx;
+
+	bool            is_sw_ring;
+	unsigned int    entry_index;
+
 };
 
 #define amdgpu_ring_parse_cs(r, p, job, ib) ((r)->funcs->parse_cs((p), (job), (ib)))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
new file mode 100644
index 000000000000..662aadebf111
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include <linux/slab.h>
+#include <drm/drm_print.h>
+
+#include "amdgpu_ring_mux.h"
+#include "amdgpu_ring.h"
+
+#define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
+
+int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
+			 unsigned int entry_size)
+{
+	mux->real_ring = ring;
+	mux->num_ring_entries = 0;
+	mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
+	if (!mux->ring_entry)
+		return -ENOMEM;
+
+	mux->ring_entry_size = entry_size;
+	spin_lock_init(&mux->lock);
+
+	return 0;
+}
+
+void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
+{
+	kfree(mux->ring_entry);
+	mux->ring_entry = NULL;
+	mux->num_ring_entries = 0;
+	mux->ring_entry_size = 0;
+}
+
+int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	struct amdgpu_mux_entry *e;
+
+	if (mux->num_ring_entries >= mux->ring_entry_size) {
+		DRM_ERROR("add sw ring exceeding max entry size\n");
+		return -ENOENT;
+	}
+
+	e = &mux->ring_entry[mux->num_ring_entries];
+	ring->entry_index = mux->num_ring_entries;
+	e->ring = ring;
+
+	mux->num_ring_entries += 1;
+	return 0;
+}
+
+static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
+								struct amdgpu_ring *ring)
+{
+	return ring->entry_index < mux->ring_entry_size ?
+			&mux->ring_entry[ring->entry_index] : NULL;
+}
+
+/* copy packages on sw ring range[begin, end) */
+static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
+						  struct amdgpu_ring *ring,
+						  u64 s_start, u64 s_end)
+{
+	u64 start, end;
+	struct amdgpu_ring *real_ring = mux->real_ring;
+
+	start = s_start & ring->buf_mask;
+	end = s_end & ring->buf_mask;
+
+	if (start == end) {
+		DRM_ERROR("no more data copied from sw ring\n");
+		return;
+	}
+	if (start > end) {
+		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
+		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
+					   (ring->ring_size >> 2) - start);
+		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
+	} else {
+		amdgpu_ring_alloc(real_ring, end - start);
+		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
+	}
+}
+
+void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
+{
+	struct amdgpu_mux_entry *e;
+
+	e = amdgpu_ring_mux_sw_entry(mux, ring);
+	if (!e) {
+		DRM_ERROR("cannot find entry for sw ring\n");
+		return;
+	}
+
+	spin_lock(&mux->lock);
+	e->sw_cptr = e->sw_wptr;
+	e->sw_wptr = wptr;
+	e->start_ptr_in_hw_ring = mux->real_ring->wptr;
+
+	amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
+	e->end_ptr_in_hw_ring = mux->real_ring->wptr;
+	amdgpu_ring_commit(mux->real_ring);
+
+	spin_unlock(&mux->lock);
+}
+
+u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	struct amdgpu_mux_entry *e;
+
+	e = amdgpu_ring_mux_sw_entry(mux, ring);
+	if (!e) {
+		DRM_ERROR("cannot find entry for sw ring\n");
+		return 0;
+	}
+
+	return e->sw_wptr;
+}
+
+/*
+ * The return value of the readptr is not precise while the other rings could
+ * write data onto the real ring buffer. After overwriting on the real ring, we
+ * cannot decide if our packages have been executed or not read yet. However,
+ * this function is only called by the tools such as umr to collect the latest
+ * packages for the hang analysis. We assume the hang happens near our latest
+ * submit. Thus we could use the following logic to give the clue:
+ * If the readptr is between start and end, then we return the copy pointer
+ * plus the distance from start to readptr. If the readptr is before start, we
+ * return the copy pointer. Lastly, if the readptr is past end, we return the
+ * write pointer.
+ */
+u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	struct amdgpu_mux_entry *e;
+	u64 readp, offset, start, end;
+
+	e = amdgpu_ring_mux_sw_entry(mux, ring);
+	if (!e) {
+		DRM_ERROR("no sw entry found!\n");
+		return 0;
+	}
+
+	readp = amdgpu_ring_get_rptr(mux->real_ring);
+
+	start = e->start_ptr_in_hw_ring & mux->real_ring->buf_mask;
+	end = e->end_ptr_in_hw_ring & mux->real_ring->buf_mask;
+	if (start > end) {
+		if (readp <= end)
+			readp += mux->real_ring->ring_size >> 2;
+		end += mux->real_ring->ring_size >> 2;
+	}
+
+	if (start <= readp && readp <= end) {
+		offset = readp - start;
+		e->sw_rptr = (e->sw_cptr + offset) & ring->buf_mask;
+	} else if (readp < start) {
+		e->sw_rptr = e->sw_cptr;
+	} else {
+		/* end < readptr */
+		e->sw_rptr = e->sw_wptr;
+	}
+
+	return e->sw_rptr;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
new file mode 100644
index 000000000000..8c1691e11b1c
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __AMDGPU_RING_MUX__
+#define __AMDGPU_RING_MUX__
+
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include "amdgpu_ring.h"
+
+struct amdgpu_ring;
+/**
+ * struct amdgpu_mux_entry - the entry recording the copy information of a software ring.
+ * @ring: the pointer to the software ring.
+ * @start_ptr_in_hw_ring: last start location copied to in the hardware ring.
+ * @end_ptr_in_hw_ring: last end location copied to in the hardware ring.
+ * @sw_cptr: the position of the copy pointer in the sw ring.
+ * @sw_rptr: the read pointer in software ring.
+ * @sw_wptr: the write pointer in software ring.
+ */
+struct amdgpu_mux_entry {
+	struct                  amdgpu_ring *ring;
+	u64                     start_ptr_in_hw_ring;
+	u64                     end_ptr_in_hw_ring;
+	u64                     sw_cptr;
+	u64                     sw_rptr;
+	u64                     sw_wptr;
+};
+
+struct amdgpu_ring_mux {
+	struct amdgpu_ring      *real_ring;
+
+	struct amdgpu_mux_entry *ring_entry;
+	unsigned int            num_ring_entries;
+	unsigned int            ring_entry_size;
+	/* the lock for copying data from different software rings */
+	spinlock_t              lock;
+};
+
+int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
+			 unsigned int entry_size);
+void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux);
+int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
+void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr);
+u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
+u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
+
+#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
new file mode 100644
index 000000000000..5ae12d6641ca
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ */
+
+#include "amdgpu_sw_ring.h"
+#include "amdgpu_ring_mux.h"
+
+u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
+
+	WARN_ON(!ring->is_sw_ring);
+	return amdgpu_ring_mux_get_rptr(mux, ring);
+}
+
+u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
+
+	WARN_ON(!ring->is_sw_ring);
+	return amdgpu_ring_mux_get_wptr(mux, ring);
+}
+
+void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
+
+	WARN_ON(!ring->is_sw_ring);
+	amdgpu_ring_mux_set_wptr(mux, ring, ring->wptr);
+}
+
+/* Override insert_nop to prevent emitting nops to the software rings */
+void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
+{
+	WARN_ON(!ring->is_sw_ring);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.h
new file mode 100644
index 000000000000..a66524b8b36e
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2012 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "amdgpu_ring.h"
+#include "amdgpu.h"
+
+#ifndef __AMDGPU_SWRING_H__
+#define __AMDGPU_SWRING_H__
+
+u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring);
+u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring);
+void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring);
+
+void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
+void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring);
+void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring);
+
+#endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/4] drm/amdgpu: Add software ring callbacks for gfx9 (v6)
  2022-09-23 13:16 [PATCH 1/4] drm/amdgpu: Introduce gfx software ring (v7) jiadong.zhu
@ 2022-09-23 13:16 ` jiadong.zhu
  2022-09-26  6:43   ` Christian König
  2022-09-23 13:16 ` [PATCH 3/4] drm/amdgpu: Modify unmap_queue format for gfx9 (v4) jiadong.zhu
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 15+ messages in thread
From: jiadong.zhu @ 2022-09-23 13:16 UTC (permalink / raw)
  To: amd-gfx; +Cc: Luben Tuikov, Jiadong.Zhu, Christian Koenig, Andrey Grodzovsky

From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>

Set ring functions with software ring callbacks on gfx9.

The software ring could be tested by debugfs_test_ib case.

v2: Set sw_ring 2 to enable software ring by default.
v3: Remove the parameter for software ring enablement.
v4: Use amdgpu_ring_init/fini for software rings.
v5: Update for code format. Fix conflict.
v6: Remove unnecessary checks and enable software ring on gfx9 by default.

Acked-by: Luben Tuikov <luben.tuikov@amd.com>
Cc: Christian Koenig <Christian.Koenig@amd.com>
Cc: Luben Tuikov <Luben.Tuikov@amd.com>
Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 108 ++++++++++++++++++++++-
 3 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 9996dadb39f7..4fdfc3ec134a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -348,6 +348,7 @@ struct amdgpu_gfx {
 
 	bool				is_poweron;
 
+	struct amdgpu_ring		sw_gfx_ring[AMDGPU_MAX_SW_GFX_RINGS];
 	struct amdgpu_ring_mux          muxer;
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 40b1277b4f0c..f08ee1ac281c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -39,6 +39,7 @@ struct amdgpu_vm;
 #define AMDGPU_MAX_RINGS		28
 #define AMDGPU_MAX_HWIP_RINGS		8
 #define AMDGPU_MAX_GFX_RINGS		2
+#define AMDGPU_MAX_SW_GFX_RINGS         2
 #define AMDGPU_MAX_COMPUTE_RINGS	8
 #define AMDGPU_MAX_VCE_RINGS		3
 #define AMDGPU_MAX_UVD_ENC_RINGS	2
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 5349ca4d19e3..e688665cd1e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -47,6 +47,7 @@
 
 #include "amdgpu_ras.h"
 
+#include "amdgpu_sw_ring.h"
 #include "gfx_v9_4.h"
 #include "gfx_v9_0.h"
 #include "gfx_v9_4_2.h"
@@ -56,6 +57,7 @@
 #include "asic_reg/gc/gc_9_0_default.h"
 
 #define GFX9_NUM_GFX_RINGS     1
+#define GFX9_NUM_SW_GFX_RINGS  2
 #define GFX9_MEC_HPD_SIZE 4096
 #define RLCG_UCODE_LOADING_START_ADDRESS 0x00002000L
 #define RLC_SAVE_RESTORE_ADDR_STARTING_OFFSET 0x00000000L
@@ -2273,6 +2275,7 @@ static int gfx_v9_0_sw_init(void *handle)
 	struct amdgpu_ring *ring;
 	struct amdgpu_kiq *kiq;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+	unsigned int hw_prio;
 
 	switch (adev->ip_versions[GC_HWIP][0]) {
 	case IP_VERSION(9, 0, 1):
@@ -2356,6 +2359,9 @@ static int gfx_v9_0_sw_init(void *handle)
 			sprintf(ring->name, "gfx_%d", i);
 		ring->use_doorbell = true;
 		ring->doorbell_index = adev->doorbell_index.gfx_ring0 << 1;
+
+		/* disable scheduler on the real ring */
+		ring->no_scheduler = true;
 		r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq,
 				     AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP,
 				     AMDGPU_RING_PRIO_DEFAULT, NULL);
@@ -2363,6 +2369,42 @@ static int gfx_v9_0_sw_init(void *handle)
 			return r;
 	}
 
+	/* set up the software rings */
+	for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++) {
+		ring = &adev->gfx.sw_gfx_ring[i];
+		ring->ring_obj = NULL;
+		if (!i)
+			sprintf(ring->name, "gfx_sw");
+		else
+			sprintf(ring->name, "gfx_sw_%d", i);
+		ring->use_doorbell = true;
+		ring->doorbell_index = adev->doorbell_index.gfx_ring0 << 1;
+		ring->is_sw_ring = true;
+		hw_prio = (i == 1) ? AMDGPU_RING_PRIO_2 :
+			AMDGPU_RING_PRIO_DEFAULT;
+		r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq,
+				     AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP, hw_prio,
+				     NULL);
+		if (r)
+			return r;
+		ring->wptr = 0;
+	}
+
+	/* init the muxer and add software rings */
+	r = amdgpu_ring_mux_init(&adev->gfx.muxer, &adev->gfx.gfx_ring[0],
+				 GFX9_NUM_SW_GFX_RINGS);
+	if (r) {
+		DRM_ERROR("amdgpu_ring_mux_init failed(%d)\n", r);
+		return r;
+	}
+	for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++) {
+		r = amdgpu_ring_mux_add_sw_ring(&adev->gfx.muxer, &adev->gfx.sw_gfx_ring[i]);
+		if (r) {
+			DRM_ERROR("amdgpu_ring_mux_add_sw_ring failed(%d)\n", r);
+			return r;
+		}
+	}
+
 	/* set up the compute queues - allocate horizontally across pipes */
 	ring_id = 0;
 	for (i = 0; i < adev->gfx.mec.num_mec; ++i) {
@@ -2413,6 +2455,10 @@ static int gfx_v9_0_sw_fini(void *handle)
 	int i;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
+		amdgpu_ring_fini(&adev->gfx.sw_gfx_ring[i]);
+	amdgpu_ring_mux_fini(&adev->gfx.muxer);
+
 	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
 		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
 	for (i = 0; i < adev->gfx.num_compute_rings; i++)
@@ -5877,7 +5923,9 @@ static int gfx_v9_0_eop_irq(struct amdgpu_device *adev,
 
 	switch (me_id) {
 	case 0:
-		amdgpu_fence_process(&adev->gfx.gfx_ring[0]);
+		/* Fence signals are handled on the software rings */
+		for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
+			amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
 		break;
 	case 1:
 	case 2:
@@ -6882,6 +6930,61 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
 };
 
+static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
+	.type = AMDGPU_RING_TYPE_GFX,
+	.align_mask = 0xff,
+	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
+	.support_64bit_ptrs = true,
+	.secure_submission_supported = true,
+	.vmhub = AMDGPU_GFXHUB_0,
+	.get_rptr = amdgpu_sw_ring_get_rptr_gfx,
+	.get_wptr = amdgpu_sw_ring_get_wptr_gfx,
+	.set_wptr = amdgpu_sw_ring_set_wptr_gfx,
+	.emit_frame_size = /* totally 242 maximum if 16 IBs */
+		5 +  /* COND_EXEC */
+		7 +  /* PIPELINE_SYNC */
+		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		2 + /* VM_FLUSH */
+		8 +  /* FENCE for VM_FLUSH */
+		20 + /* GDS switch */
+		4 + /* double SWITCH_BUFFER,
+		     * the first COND_EXEC jump to the place just
+		     * prior to this double SWITCH_BUFFER
+		     */
+		5 + /* COND_EXEC */
+		7 +	 /*	HDP_flush */
+		4 +	 /*	VGT_flush */
+		14 + /*	CE_META */
+		31 + /*	DE_META */
+		3 + /* CNTX_CTRL */
+		5 + /* HDP_INVL */
+		8 + 8 + /* FENCE x2 */
+		2 + /* SWITCH_BUFFER */
+		7, /* gfx_v9_0_emit_mem_sync */
+	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_gfx */
+	.emit_ib = gfx_v9_0_ring_emit_ib_gfx,
+	.emit_fence = gfx_v9_0_ring_emit_fence,
+	.emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
+	.emit_vm_flush = gfx_v9_0_ring_emit_vm_flush,
+	.emit_gds_switch = gfx_v9_0_ring_emit_gds_switch,
+	.emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
+	.test_ring = gfx_v9_0_ring_test_ring,
+	.test_ib = gfx_v9_0_ring_test_ib,
+	.insert_nop = amdgpu_sw_ring_insert_nop,
+	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_switch_buffer = gfx_v9_ring_emit_sb,
+	.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
+	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
+	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
+	.emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
+	.emit_wreg = gfx_v9_0_ring_emit_wreg,
+	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
+	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
+	.soft_recovery = gfx_v9_0_ring_soft_recovery,
+	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
+};
+
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.type = AMDGPU_RING_TYPE_COMPUTE,
 	.align_mask = 0xff,
@@ -6959,6 +7062,9 @@ static void gfx_v9_0_set_ring_funcs(struct amdgpu_device *adev)
 	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
 		adev->gfx.gfx_ring[i].funcs = &gfx_v9_0_ring_funcs_gfx;
 
+	for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
+		adev->gfx.sw_gfx_ring[i].funcs = &gfx_v9_0_sw_ring_funcs_gfx;
+
 	for (i = 0; i < adev->gfx.num_compute_rings; i++)
 		adev->gfx.compute_ring[i].funcs = &gfx_v9_0_ring_funcs_compute;
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 3/4] drm/amdgpu: Modify unmap_queue format for gfx9 (v4)
  2022-09-23 13:16 [PATCH 1/4] drm/amdgpu: Introduce gfx software ring (v7) jiadong.zhu
  2022-09-23 13:16 ` [PATCH 2/4] drm/amdgpu: Add software ring callbacks for gfx9 (v6) jiadong.zhu
@ 2022-09-23 13:16 ` jiadong.zhu
  2022-09-23 13:16 ` [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6) jiadong.zhu
  2022-09-26  6:38 ` [PATCH 1/4] drm/amdgpu: Introduce gfx software ring (v7) Christian König
  3 siblings, 0 replies; 15+ messages in thread
From: jiadong.zhu @ 2022-09-23 13:16 UTC (permalink / raw)
  To: amd-gfx; +Cc: Jiadong.Zhu

From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>

1. Modify the unmap_queue package on gfx9. Add a trailing fence to track when
   the preemption is done.
2. Modify emit_ce_meta emit_de_meta functions for the resumed ibs.

v2: Restyle code not to use ternary operator.
v3: Modify code format.
v4: Enable Mid-Command Buffer Preemption for gfx9 by default.

Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 181 +++++++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/soc15d.h      |   2 +
 3 files changed, 155 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index f08ee1ac281c..e90d327a589e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -60,6 +60,7 @@ enum amdgpu_ring_priority_level {
 #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
 #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
 #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
+#define AMDGPU_FENCE_FLAG_EXEC          (1 << 3)
 
 #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index e688665cd1e0..669532f658da 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -753,7 +753,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev);
 static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
 				struct amdgpu_cu_info *cu_info);
 static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);
-static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
+static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume);
 static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
 static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
 					  void *ras_error_status);
@@ -826,9 +826,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring *kiq_ring,
 			PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
 
 	if (action == PREEMPT_QUEUES_NO_UNMAP) {
-		amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
-		amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
-		amdgpu_ring_write(kiq_ring, seq);
+		amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & ring->buf_mask));
+		amdgpu_ring_write(kiq_ring, 0);
+		amdgpu_ring_write(kiq_ring, 0);
+
 	} else {
 		amdgpu_ring_write(kiq_ring, 0);
 		amdgpu_ring_write(kiq_ring, 0);
@@ -5368,11 +5369,17 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 
 	control |= ib->length_dw | (vmid << 24);
 
-	if (amdgpu_sriov_vf(ring->adev) && (ib->flags & AMDGPU_IB_FLAG_PREEMPT)) {
+	if (ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
 		control |= INDIRECT_BUFFER_PRE_ENB(1);
 
+		if (flags & AMDGPU_IB_PREEMPTED)
+			control |= INDIRECT_BUFFER_PRE_RESUME(1);
+
 		if (!(ib->flags & AMDGPU_IB_FLAG_CE) && vmid)
-			gfx_v9_0_ring_emit_de_meta(ring);
+			gfx_v9_0_ring_emit_de_meta(ring,
+						   (!amdgpu_sriov_vf(ring->adev) &&
+						   flags & AMDGPU_IB_PREEMPTED) ?
+						   true : false);
 	}
 
 	amdgpu_ring_write(ring, header);
@@ -5427,17 +5434,23 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
 	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
 	bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
 	bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
+	bool exec = flags & AMDGPU_FENCE_FLAG_EXEC;
+	uint32_t dw2 = 0;
 
 	/* RELEASE_MEM - flush caches, send int */
 	amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
-	amdgpu_ring_write(ring, ((writeback ? (EOP_TC_WB_ACTION_EN |
-					       EOP_TC_NC_ACTION_EN) :
-					      (EOP_TCL1_ACTION_EN |
-					       EOP_TC_ACTION_EN |
-					       EOP_TC_WB_ACTION_EN |
-					       EOP_TC_MD_ACTION_EN)) |
-				 EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
-				 EVENT_INDEX(5)));
+
+	if (writeback) {
+		dw2 = EOP_TC_WB_ACTION_EN | EOP_TC_NC_ACTION_EN;
+	} else {
+		dw2 = EOP_TCL1_ACTION_EN | EOP_TC_ACTION_EN |
+				EOP_TC_WB_ACTION_EN | EOP_TC_MD_ACTION_EN;
+	}
+	dw2 |= EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
+	if (exec)
+		dw2 |= EOP_EXEC;
+
+	amdgpu_ring_write(ring, dw2);
 	amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
 
 	/*
@@ -5542,33 +5555,135 @@ static void gfx_v9_ring_emit_sb(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, 0);
 }
 
-static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring)
+static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool resume)
 {
+	struct amdgpu_device *adev = ring->adev;
 	struct v9_ce_ib_state ce_payload = {0};
-	uint64_t csa_addr;
+	uint64_t offset, ce_payload_gpu_addr;
+	void *ce_payload_cpu_addr;
 	int cnt;
 
 	cnt = (sizeof(ce_payload) >> 2) + 4 - 2;
-	csa_addr = amdgpu_csa_vaddr(ring->adev);
+
+	if (ring->is_mes_queue) {
+		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
+				  gfx[0].gfx_meta_data) +
+			offsetof(struct v9_gfx_meta_data, ce_payload);
+		ce_payload_gpu_addr =
+			amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
+		ce_payload_cpu_addr =
+			amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
+	} else {
+		offset = offsetof(struct v9_gfx_meta_data, ce_payload);
+		ce_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
+		ce_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
+	}
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, cnt));
 	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(2) |
 				 WRITE_DATA_DST_SEL(8) |
 				 WR_CONFIRM) |
 				 WRITE_DATA_CACHE_POLICY(0));
-	amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
-	amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
-	amdgpu_ring_write_multiple(ring, (void *)&ce_payload, sizeof(ce_payload) >> 2);
+	amdgpu_ring_write(ring, lower_32_bits(ce_payload_gpu_addr));
+	amdgpu_ring_write(ring, upper_32_bits(ce_payload_gpu_addr));
+
+	if (resume)
+		amdgpu_ring_write_multiple(ring, ce_payload_cpu_addr,
+					   sizeof(ce_payload) >> 2);
+	else
+		amdgpu_ring_write_multiple(ring, (void *)&ce_payload,
+					   sizeof(ce_payload) >> 2);
+}
+
+static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
+{
+	int i, r = 0;
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
+	struct amdgpu_ring *kiq_ring = &kiq->ring;
+	unsigned long flags;
+
+	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
+		return -EINVAL;
+
+	spin_lock_irqsave(&kiq->ring_lock, flags);
+
+	if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
+		spin_unlock_irqrestore(&kiq->ring_lock, flags);
+		return -ENOMEM;
+	}
+
+	/* assert preemption condition */
+	amdgpu_ring_set_preempt_cond_exec(ring, false);
+
+	ring->trail_seq += 1;
+	amdgpu_ring_alloc(ring, 13);
+	gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
+				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
+	/*reset the CP_VMID_PREEMPT after trailing fence*/
+	amdgpu_ring_emit_wreg(ring,
+			      SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT),
+			      0x0);
+
+	/* assert IB preemption, emit the trailing fence */
+	kiq->pmf->kiq_unmap_queues(kiq_ring, ring, PREEMPT_QUEUES_NO_UNMAP,
+				   ring->trail_fence_gpu_addr,
+				   ring->trail_seq);
+
+	amdgpu_ring_commit(kiq_ring);
+	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+
+	/* poll the trailing fence */
+	for (i = 0; i < adev->usec_timeout; i++) {
+		if (ring->trail_seq ==
+			le32_to_cpu(*ring->trail_fence_cpu_addr))
+			break;
+		udelay(1);
+	}
+
+	if (i >= adev->usec_timeout) {
+		r = -EINVAL;
+		DRM_ERROR("ring %d failed to preempt ib\n", ring->idx);
+	}
+
+	amdgpu_ring_commit(ring);
+
+	/* deassert preemption condition */
+	amdgpu_ring_set_preempt_cond_exec(ring, true);
+	return r;
 }
 
-static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
+static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume)
 {
+	struct amdgpu_device *adev = ring->adev;
 	struct v9_de_ib_state de_payload = {0};
-	uint64_t csa_addr, gds_addr;
+	uint64_t offset, gds_addr, de_payload_gpu_addr;
+	void *de_payload_cpu_addr;
 	int cnt;
 
-	csa_addr = amdgpu_csa_vaddr(ring->adev);
-	gds_addr = csa_addr + 4096;
+	if (ring->is_mes_queue) {
+		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
+				  gfx[0].gfx_meta_data) +
+			offsetof(struct v9_gfx_meta_data, de_payload);
+		de_payload_gpu_addr =
+			amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
+		de_payload_cpu_addr =
+			amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
+
+		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
+				  gfx[0].gds_backup) +
+			offsetof(struct v9_gfx_meta_data, de_payload);
+		gds_addr = amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
+	} else {
+		offset = offsetof(struct v9_gfx_meta_data, de_payload);
+		de_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
+		de_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
+
+		gds_addr = ALIGN(amdgpu_csa_vaddr(ring->adev) +
+				 AMDGPU_CSA_SIZE - adev->gds.gds_size,
+				 PAGE_SIZE);
+	}
+
 	de_payload.gds_backup_addrlo = lower_32_bits(gds_addr);
 	de_payload.gds_backup_addrhi = upper_32_bits(gds_addr);
 
@@ -5578,9 +5693,15 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
 				 WRITE_DATA_DST_SEL(8) |
 				 WR_CONFIRM) |
 				 WRITE_DATA_CACHE_POLICY(0));
-	amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
-	amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
-	amdgpu_ring_write_multiple(ring, (void *)&de_payload, sizeof(de_payload) >> 2);
+	amdgpu_ring_write(ring, lower_32_bits(de_payload_gpu_addr));
+	amdgpu_ring_write(ring, upper_32_bits(de_payload_gpu_addr));
+
+	if (resume)
+		amdgpu_ring_write_multiple(ring, de_payload_cpu_addr,
+					   sizeof(de_payload) >> 2);
+	else
+		amdgpu_ring_write_multiple(ring, (void *)&de_payload,
+					   sizeof(de_payload) >> 2);
 }
 
 static void gfx_v9_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
@@ -5596,8 +5717,9 @@ static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
 {
 	uint32_t dw2 = 0;
 
-	if (amdgpu_sriov_vf(ring->adev))
-		gfx_v9_0_ring_emit_ce_meta(ring);
+	gfx_v9_0_ring_emit_ce_meta(ring,
+				   (!amdgpu_sriov_vf(ring->adev) &&
+				   flags & AMDGPU_IB_PREEMPTED) ? true : false);
 
 	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
 	if (flags & AMDGPU_HAVE_CTX_SWITCH) {
@@ -6922,6 +7044,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
 	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
 	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
+	.preempt_ib = gfx_v9_0_ring_preempt_ib,
 	.emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
index 799925d22fc8..2357ff39323f 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
@@ -162,6 +162,7 @@
 		 * 2 - Bypass
 		 */
 #define     INDIRECT_BUFFER_PRE_ENB(x)		 ((x) << 21)
+#define     INDIRECT_BUFFER_PRE_RESUME(x)               ((x) << 30)
 #define	PACKET3_COPY_DATA				0x40
 #define	PACKET3_PFP_SYNC_ME				0x42
 #define	PACKET3_COND_WRITE				0x45
@@ -184,6 +185,7 @@
 #define		EOP_TC_ACTION_EN                        (1 << 17) /* L2 */
 #define		EOP_TC_NC_ACTION_EN			(1 << 19)
 #define		EOP_TC_MD_ACTION_EN			(1 << 21) /* L2 metadata */
+#define		EOP_EXEC				(1 << 28) /* For Trailing Fence */
 
 #define		DATA_SEL(x)                             ((x) << 29)
 		/* 0 - discard
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)
  2022-09-23 13:16 [PATCH 1/4] drm/amdgpu: Introduce gfx software ring (v7) jiadong.zhu
  2022-09-23 13:16 ` [PATCH 2/4] drm/amdgpu: Add software ring callbacks for gfx9 (v6) jiadong.zhu
  2022-09-23 13:16 ` [PATCH 3/4] drm/amdgpu: Modify unmap_queue format for gfx9 (v4) jiadong.zhu
@ 2022-09-23 13:16 ` jiadong.zhu
  2022-09-26  6:49   ` Christian König
  2022-09-26  6:38 ` [PATCH 1/4] drm/amdgpu: Introduce gfx software ring (v7) Christian König
  3 siblings, 1 reply; 15+ messages in thread
From: jiadong.zhu @ 2022-09-23 13:16 UTC (permalink / raw)
  To: amd-gfx; +Cc: Luben Tuikov, Jiadong.Zhu, Christian Koenig, Andrey Grodzovsky

From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>

Trigger Mid-Command Buffer Preemption according to the priority of the software
rings and the hw fence signalling condition.

The muxer saves the locations of the indirect buffer frames from the software
ring together with the fence sequence number in its fifo queue, and pops out
those records when the fences are signalled. The locations are used to resubmit
packages in preemption scenarios by copying the chunks from the software ring.

v2: Update comment style.
v3: Fix conflict caused by previous modifications.
v4: Remove unnecessary prints.
v5: Fix corner cases for resubmission cases.
v6: Refactor functions for resubmission, calling fence_process in irq handler.

Cc: Christian Koenig <Christian.Koenig@amd.com>
Cc: Luben Tuikov <Luben.Tuikov@amd.com>
Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
Acked-by: Luben Tuikov <luben.tuikov@amd.com>
Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c       |   2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c     |  13 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 323 ++++++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  30 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c  |  26 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c       |   2 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c        |  10 +-
 8 files changed, 368 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 258cffe3c06a..af86d87e2f3b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -211,6 +211,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		}
 	}
 
+	amdgpu_ring_ib_begin(ring);
 	if (job && ring->funcs->init_cond_exec)
 		patch_offset = amdgpu_ring_init_cond_exec(ring);
 
@@ -285,6 +286,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 	    ring->hw_prio == AMDGPU_GFX_PIPE_PRIO_HIGH)
 		ring->funcs->emit_wave_limit(ring, false);
 
+	amdgpu_ring_ib_end(ring);
 	amdgpu_ring_commit(ring);
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 13db99d653bd..84b0b3c7d40f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -33,6 +33,7 @@
 
 #include <drm/amdgpu_drm.h>
 #include "amdgpu.h"
+#include "amdgpu_sw_ring.h"
 #include "atom.h"
 
 /*
@@ -569,3 +570,15 @@ int amdgpu_ring_init_mqd(struct amdgpu_ring *ring)
 
 	return mqd_mgr->init_mqd(adev, ring->mqd_ptr, &prop);
 }
+
+void amdgpu_ring_ib_begin(struct amdgpu_ring *ring)
+{
+	if (ring->is_sw_ring)
+		amdgpu_sw_ring_ib_begin(ring);
+}
+
+void amdgpu_ring_ib_end(struct amdgpu_ring *ring)
+{
+	if (ring->is_sw_ring)
+		amdgpu_sw_ring_ib_end(ring);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index e90d327a589e..6fbc1627dab7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -312,6 +312,9 @@ struct amdgpu_ring {
 #define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r)
 
 int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
+void amdgpu_ring_ib_begin(struct amdgpu_ring *ring);
+void amdgpu_ring_ib_end(struct amdgpu_ring *ring);
+
 void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
 void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
 void amdgpu_ring_commit(struct amdgpu_ring *ring);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
index 662aadebf111..788567e3b743 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
@@ -28,23 +28,146 @@
 
 #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
 
+static struct kmem_cache *amdgpu_mux_chunk_slab;
+
+static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
+								struct amdgpu_ring *ring)
+{
+	return ring->entry_index < mux->ring_entry_size ?
+			&mux->ring_entry[ring->entry_index] : NULL;
+}
+
+/* copy packages on sw ring range[begin, end) */
+static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
+						  struct amdgpu_ring *ring,
+						  u64 s_start, u64 s_end)
+{
+	u64 start, end;
+	struct amdgpu_ring *real_ring = mux->real_ring;
+
+	start = s_start & ring->buf_mask;
+	end = s_end & ring->buf_mask;
+
+	if (start == end) {
+		DRM_ERROR("no more data copied from sw ring\n");
+		return;
+	}
+	if (start > end) {
+		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
+		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
+					   (ring->ring_size >> 2) - start);
+		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
+	} else {
+		amdgpu_ring_alloc(real_ring, end - start);
+		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
+	}
+}
+
+static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux, bool is_fallback)
+{
+	struct amdgpu_mux_entry *e = NULL;
+	struct amdgpu_mux_chunk *chunk;
+	uint32_t seq, last_seq;
+	int i;
+
+	if (is_fallback) {
+		if (!spin_trylock(&mux->lock)) {
+			amdgpu_ring_mux_schedule_resubmit(mux);
+			DRM_ERROR("reschedule resubmit\n");
+			return;
+		}
+	} else {
+		spin_lock(&mux->lock);
+	}
+
+	/*find low priority entries:*/
+	if (!mux->s_resubmit) {
+		spin_unlock(&mux->lock);
+		return;
+	}
+
+	for (i = 0; i < mux->num_ring_entries; i++) {
+		if (mux->ring_entry[i].ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
+			e = &mux->ring_entry[i];
+			break;
+		}
+	}
+
+	if (!e) {
+		DRM_ERROR("%s no low priority ring found\n", __func__);
+		spin_unlock(&mux->lock);
+		return;
+	}
+
+	last_seq = atomic_read(&e->ring->fence_drv.last_seq);
+	seq = mux->seqno_to_resubmit;
+	if (last_seq < seq) {
+		/*resubmit all the fences between (last_seq, seq]*/
+		list_for_each_entry(chunk, &e->list, entry) {
+			if (chunk->sync_seq > last_seq && chunk->sync_seq <= seq) {
+				amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, e->ring,
+								      chunk->start,
+								      chunk->end);
+				mux->wptr_resubmit = chunk->end;
+				amdgpu_ring_commit(mux->real_ring);
+			}
+		}
+	}
+
+	del_timer(&mux->resubmit_timer);
+	mux->s_resubmit = false;
+	spin_unlock(&mux->lock);
+}
+
+static void amdgpu_mux_resubmit_fallback(struct timer_list *t)
+{
+	struct amdgpu_ring_mux *mux = from_timer(mux, t, resubmit_timer);
+
+	DRM_INFO("calling %s\n", __func__);
+	amdgpu_mux_resubmit_chunks(mux, true);
+}
+
 int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
 			 unsigned int entry_size)
 {
 	mux->real_ring = ring;
 	mux->num_ring_entries = 0;
+
 	mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
 	if (!mux->ring_entry)
 		return -ENOMEM;
 
 	mux->ring_entry_size = entry_size;
+	mux->s_resubmit = false;
+
+	amdgpu_mux_chunk_slab = kmem_cache_create("amdgpu_mux_chunk",
+						  sizeof(struct amdgpu_mux_chunk), 0,
+						  SLAB_HWCACHE_ALIGN, NULL);
+	if (!amdgpu_mux_chunk_slab) {
+		DRM_ERROR("create amdgpu_mux_chunk cache failed\n");
+		return -ENOMEM;
+	}
+
 	spin_lock_init(&mux->lock);
+	timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback, 0);
 
 	return 0;
 }
 
 void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
 {
+	struct amdgpu_mux_entry *e;
+	struct amdgpu_mux_chunk *chunk, *chunk2;
+	int i;
+
+	for (i = 0; i < mux->num_ring_entries; i++) {
+		e = &mux->ring_entry[i];
+		list_for_each_entry_safe(chunk, chunk2, &e->list, entry) {
+			list_del(&chunk->entry);
+			kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
+		}
+	}
+	kmem_cache_destroy(amdgpu_mux_chunk_slab);
 	kfree(mux->ring_entry);
 	mux->ring_entry = NULL;
 	mux->num_ring_entries = 0;
@@ -64,62 +187,46 @@ int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring
 	ring->entry_index = mux->num_ring_entries;
 	e->ring = ring;
 
+	INIT_LIST_HEAD(&e->list);
 	mux->num_ring_entries += 1;
 	return 0;
 }
 
-static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
-								struct amdgpu_ring *ring)
-{
-	return ring->entry_index < mux->ring_entry_size ?
-			&mux->ring_entry[ring->entry_index] : NULL;
-}
-
-/* copy packages on sw ring range[begin, end) */
-static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
-						  struct amdgpu_ring *ring,
-						  u64 s_start, u64 s_end)
-{
-	u64 start, end;
-	struct amdgpu_ring *real_ring = mux->real_ring;
-
-	start = s_start & ring->buf_mask;
-	end = s_end & ring->buf_mask;
-
-	if (start == end) {
-		DRM_ERROR("no more data copied from sw ring\n");
-		return;
-	}
-	if (start > end) {
-		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
-		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
-					   (ring->ring_size >> 2) - start);
-		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
-	} else {
-		amdgpu_ring_alloc(real_ring, end - start);
-		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
-	}
-}
-
 void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
 {
 	struct amdgpu_mux_entry *e;
 
+	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
+		amdgpu_mux_resubmit_chunks(mux, false);
+
 	e = amdgpu_ring_mux_sw_entry(mux, ring);
 	if (!e) {
 		DRM_ERROR("cannot find entry for sw ring\n");
 		return;
 	}
 
+	/* We could skip this set wptr as preemption in process. */
+	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && mux->pending_trailing_fence_signaled) {
+		DRM_ERROR("amdgpu_ring_mux_set_wptr skipped\n");
+		return;
+	}
+
 	spin_lock(&mux->lock);
 	e->sw_cptr = e->sw_wptr;
+	/* Update cptr if the package already copied in resubmit functions */
+	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && e->sw_cptr < mux->wptr_resubmit)
+		e->sw_cptr = mux->wptr_resubmit;
 	e->sw_wptr = wptr;
 	e->start_ptr_in_hw_ring = mux->real_ring->wptr;
 
-	amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
-	e->end_ptr_in_hw_ring = mux->real_ring->wptr;
-	amdgpu_ring_commit(mux->real_ring);
-
+	/* Skip copying for the packages already resubmitted.*/
+	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT || mux->wptr_resubmit < wptr) {
+		amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
+		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
+		amdgpu_ring_commit(mux->real_ring);
+	} else {
+		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
+	}
 	spin_unlock(&mux->lock);
 }
 
@@ -181,3 +288,145 @@ u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ri
 
 	return e->sw_rptr;
 }
+
+void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux)
+{
+	mod_timer(&mux->resubmit_timer, jiffies + AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT);
+}
+
+void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	struct amdgpu_mux_entry *e;
+	struct amdgpu_mux_chunk *chunk;
+
+	amdgpu_mux_resubmit_chunks(mux, false);
+
+	e = amdgpu_ring_mux_sw_entry(mux, ring);
+	if (!e) {
+		DRM_ERROR("cannot find entry!\n");
+		return;
+	}
+
+	chunk = kmem_cache_alloc(amdgpu_mux_chunk_slab, GFP_KERNEL);
+	if (!chunk) {
+		DRM_ERROR("alloc amdgpu_mux_chunk_slab failed\n");
+		return;
+	}
+
+	chunk->start = ring->wptr;
+	list_add_tail(&chunk->entry, &e->list);
+}
+
+static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	uint32_t last_seq, size = 0;
+	struct amdgpu_mux_entry *e;
+	struct amdgpu_mux_chunk *chunk, *tmp;
+
+	e = amdgpu_ring_mux_sw_entry(mux, ring);
+	if (!e) {
+		DRM_ERROR("cannot find entry!\n");
+		return;
+	}
+
+	last_seq = atomic_read(&ring->fence_drv.last_seq);
+
+	list_for_each_entry_safe(chunk, tmp, &e->list, entry) {
+		if (chunk->sync_seq <= last_seq) {
+			list_del(&chunk->entry);
+			kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
+		} else {
+			size++;
+		}
+	}
+}
+
+void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	struct amdgpu_mux_entry *e;
+	struct amdgpu_mux_chunk *chunk;
+
+	e = amdgpu_ring_mux_sw_entry(mux, ring);
+	if (!e) {
+		DRM_ERROR("cannot find entry!\n");
+		return;
+	}
+
+	chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);
+	if (!chunk) {
+		DRM_ERROR("cannot find chunk!\n");
+		return;
+	}
+
+	chunk->end = ring->wptr;
+	chunk->sync_seq = READ_ONCE(ring->fence_drv.sync_seq);
+
+	scan_and_remove_signaled_chunk(mux, ring);
+}
+
+/* Trigger Mid-Command Buffer Preemption (MCBP) and find if we need to resubmit. */
+int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux)
+{
+	int r;
+
+	spin_lock(&mux->lock);
+	mux->pending_trailing_fence_signaled = true;
+	r = amdgpu_ring_preempt_ib(mux->real_ring);
+	spin_unlock(&mux->lock);
+	return r;
+}
+
+bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux)
+{
+	struct amdgpu_mux_entry *e;
+	struct amdgpu_ring *ring = NULL;
+	int i;
+
+	if (!mux->pending_trailing_fence_signaled)
+		return false;
+
+	if (mux->real_ring->trail_seq != le32_to_cpu(*mux->real_ring->trail_fence_cpu_addr))
+		return false;
+
+	for (i = 0; i < mux->num_ring_entries; i++) {
+		e = &mux->ring_entry[i];
+		if (e->ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
+			ring = e->ring;
+			break;
+		}
+	}
+
+	if (!ring) {
+		DRM_ERROR("cannot find low priority ring\n");
+		return false;
+	}
+
+	amdgpu_fence_process(ring);
+	if (amdgpu_fence_count_emitted(ring) > 0) {
+		mux->s_resubmit = true;
+		mux->seqno_to_resubmit = ring->fence_drv.sync_seq;
+		amdgpu_ring_mux_schedule_resubmit(mux);
+	}
+
+	mux->pending_trailing_fence_signaled = false;
+	return true;
+}
+
+/*scan on low prio rings to have unsignaled fence and high ring has no fence.*/
+int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux)
+{
+	struct amdgpu_ring *ring;
+	int i, need_preempt;
+
+	need_preempt = 0;
+	for (i = 0; i < mux->num_ring_entries; i++) {
+		ring = mux->ring_entry[i].ring;
+		if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT &&
+			amdgpu_fence_count_emitted(ring) > 0)
+			return 0;
+		if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT &&
+			amdgpu_fence_count_emitted(ring) > 0)
+			need_preempt = 1;
+	}
+	return need_preempt && !mux->s_resubmit;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
index 8c1691e11b1c..bf8f5ca61605 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
@@ -29,6 +29,7 @@
 #include "amdgpu_ring.h"
 
 struct amdgpu_ring;
+
 /**
  * struct amdgpu_mux_entry - the entry recording software rings copying information.
  * @ring: the pointer to the software ring.
@@ -37,6 +38,7 @@ struct amdgpu_ring;
  * @sw_cptr: the position of the copy pointer in the sw ring.
  * @sw_rptr: the read pointer in software ring.
  * @sw_wptr: the write pointer in software ring.
+ * @list: list head for amdgpu_mux_chunk
  */
 struct amdgpu_mux_entry {
 	struct                  amdgpu_ring *ring;
@@ -45,6 +47,7 @@ struct amdgpu_mux_entry {
 	u64                     sw_cptr;
 	u64                     sw_rptr;
 	u64                     sw_wptr;
+	struct list_head        list;
 };
 
 struct amdgpu_ring_mux {
@@ -55,6 +58,26 @@ struct amdgpu_ring_mux {
 	unsigned int            ring_entry_size;
 	/*the lock for copy data from different software rings*/
 	spinlock_t              lock;
+	bool                    s_resubmit;
+	uint32_t                seqno_to_resubmit;
+	u64                     wptr_resubmit;
+	struct timer_list       resubmit_timer;
+
+	bool                    pending_trailing_fence_signaled;
+};
+
+/**
+ * struct amdgpu_mux_chunk - save the location of indirect buffer's package on softare rings.
+ * @entry: the list entry.
+ * @sync_seq: the fence seqno related with the saved IB.
+ * @start:- start location on the software ring.
+ * @end:- end location on the software ring.
+ */
+struct amdgpu_mux_chunk {
+	struct list_head        entry;
+	uint32_t                sync_seq;
+	u64                     start;
+	u64                     end;
 };
 
 int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
@@ -65,4 +88,11 @@ void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
 u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
 u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
 
+void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
+void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
+void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux);
+
+int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux);
+int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux);
+bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
index 5ae12d6641ca..a3ec7bdf72a6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
@@ -59,3 +59,29 @@ void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
 	WARN_ON(!ring->is_sw_ring);
 }
+
+void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
+
+	WARN_ON(!ring->is_sw_ring);
+	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
+		if (amdgpu_mcbp_scan(mux) > 0)
+			amdgpu_mcbp_trigger_preempt(mux);
+		return;
+	}
+
+	amdgpu_ring_mux_start_ib(mux, ring);
+}
+
+void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
+
+	WARN_ON(!ring->is_sw_ring);
+	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
+		return;
+	amdgpu_ring_mux_end_ib(mux, ring);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 9596c22fded6..b7e94553f4fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -601,6 +601,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
 		return 0;
 
+	amdgpu_ring_ib_begin(ring);
 	if (ring->funcs->init_cond_exec)
 		patch_offset = amdgpu_ring_init_cond_exec(ring);
 
@@ -661,6 +662,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 		amdgpu_ring_emit_switch_buffer(ring);
 		amdgpu_ring_emit_switch_buffer(ring);
 	}
+	amdgpu_ring_ib_end(ring);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 669532f658da..1620300f0dde 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -5619,7 +5619,7 @@ static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
 	ring->trail_seq += 1;
 	amdgpu_ring_alloc(ring, 13);
 	gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
-				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
+				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC | AMDGPU_FENCE_FLAG_INT);
 	/*reset the CP_VMID_PREEMPT after trailing fence*/
 	amdgpu_ring_emit_wreg(ring,
 			      SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT),
@@ -6045,9 +6045,11 @@ static int gfx_v9_0_eop_irq(struct amdgpu_device *adev,
 
 	switch (me_id) {
 	case 0:
-		/* Fence signals are handled on the software rings*/
-		for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
-			amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
+		if (!amdgpu_mcbp_handle_trailing_fence_irq(&adev->gfx.muxer)) {
+			/* Fence signals are handled on the software rings*/
+			for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
+				amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
+		}
 		break;
 	case 1:
 	case 2:
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] drm/amdgpu: Introduce gfx software ring (v7)
  2022-09-23 13:16 [PATCH 1/4] drm/amdgpu: Introduce gfx software ring (v7) jiadong.zhu
                   ` (2 preceding siblings ...)
  2022-09-23 13:16 ` [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6) jiadong.zhu
@ 2022-09-26  6:38 ` Christian König
  3 siblings, 0 replies; 15+ messages in thread
From: Christian König @ 2022-09-26  6:38 UTC (permalink / raw)
  To: jiadong.zhu, amd-gfx; +Cc: Andrey Grodzovsky, Luben Tuikov

Am 23.09.22 um 15:16 schrieb jiadong.zhu@amd.com:
> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>
> The software ring is created to support priority context while there is only
> one hardware queue for gfx.
>
> Every software ring has its fence driver and could be used as an ordinary ring
> for the GPU scheduler.
> Multiple software rings are bound to a real ring with the ring muxer. The
> packages committed on the software ring are copied to the real ring.
>
> v2: Use array to store software ring entry.
> v3: Remove unnecessary prints.
> v4: Remove amdgpu_ring_sw_init/fini functions,
> using gtt for sw ring buffer for later dma copy
> optimization.
> v5: Allocate ring entry dynamically in the muxer.
> v6: Update comments for the ring muxer.
> v7: Modify for function naming.
>
> Cc: Christian Koenig <Christian.Koenig@amd.com>
> Cc: Luben Tuikov <Luben.Tuikov@amd.com>
> Cc: Andrey Grodzovsky  <Andrey.Grodzovsky@amd.com>
> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/Makefile          |   3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h      |   3 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   4 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 183 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  68 +++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c  |  61 +++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.h  |  38 ++++
>   7 files changed, 359 insertions(+), 1 deletion(-)
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.h
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 3e0e2eb7e235..85224bc81ce5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -58,7 +58,8 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
>   	amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
>   	amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
>   	amdgpu_fw_attestation.o amdgpu_securedisplay.o \
> -	amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o
> +	amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
> +	amdgpu_sw_ring.o amdgpu_ring_mux.o
>   
>   amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 53526ffb2ce1..9996dadb39f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -33,6 +33,7 @@
>   #include "amdgpu_imu.h"
>   #include "soc15.h"
>   #include "amdgpu_ras.h"
> +#include "amdgpu_ring_mux.h"
>   
>   /* GFX current status */
>   #define AMDGPU_GFX_NORMAL_MODE			0x00000000L
> @@ -346,6 +347,8 @@ struct amdgpu_gfx {
>   	struct amdgpu_gfx_ras		*ras;
>   
>   	bool				is_poweron;
> +
> +	struct amdgpu_ring_mux          muxer;
>   };
>   
>   #define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs->get_gpu_clock_counter((adev))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 7d89a52091c0..40b1277b4f0c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -278,6 +278,10 @@ struct amdgpu_ring {
>   	bool			is_mes_queue;
>   	uint32_t		hw_queue_id;
>   	struct amdgpu_mes_ctx_data *mes_ctx;
> +
> +	bool            is_sw_ring;
> +	unsigned int    entry_index;
> +
>   };
>   
>   #define amdgpu_ring_parse_cs(r, p, job, ib) ((r)->funcs->parse_cs((p), (job), (ib)))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> new file mode 100644
> index 000000000000..662aadebf111
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> @@ -0,0 +1,183 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +#include <linux/slab.h>
> +#include <drm/drm_print.h>
> +
> +#include "amdgpu_ring_mux.h"
> +#include "amdgpu_ring.h"
> +
> +#define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
> +
> +int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
> +			 unsigned int entry_size)
> +{
> +	mux->real_ring = ring;
> +	mux->num_ring_entries = 0;
> +	mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
> +	if (!mux->ring_entry)
> +		return -ENOMEM;
> +
> +	mux->ring_entry_size = entry_size;
> +	spin_lock_init(&mux->lock);
> +
> +	return 0;
> +}
> +
> +void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
> +{
> +	kfree(mux->ring_entry);
> +	mux->ring_entry = NULL;
> +	mux->num_ring_entries = 0;
> +	mux->ring_entry_size = 0;
> +}
> +
> +int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_mux_entry *e;
> +
> +	if (mux->num_ring_entries >= mux->ring_entry_size) {
> +		DRM_ERROR("add sw ring exceeding max entry size\n");
> +		return -ENOENT;
> +	}
> +
> +	e = &mux->ring_entry[mux->num_ring_entries];
> +	ring->entry_index = mux->num_ring_entries;
> +	e->ring = ring;
> +
> +	mux->num_ring_entries += 1;
> +	return 0;
> +}
> +
> +static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
> +								struct amdgpu_ring *ring)
> +{
> +	return ring->entry_index < mux->ring_entry_size ?
> +			&mux->ring_entry[ring->entry_index] : NULL;
> +}
> +
> +/* copy packages on sw ring range[begin, end) */
> +static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
> +						  struct amdgpu_ring *ring,
> +						  u64 s_start, u64 s_end)
> +{
> +	u64 start, end;
> +	struct amdgpu_ring *real_ring = mux->real_ring;
> +
> +	start = s_start & ring->buf_mask;
> +	end = s_end & ring->buf_mask;
> +
> +	if (start == end) {
> +		DRM_ERROR("no more data copied from sw ring\n");
> +		return;
> +	}
> +	if (start > end) {
> +		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
> +					   (ring->ring_size >> 2) - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
> +	} else {
> +		amdgpu_ring_alloc(real_ring, end - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
> +	}
> +}
> +
> +void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
> +{
> +	struct amdgpu_mux_entry *e;
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("cannot find entry for sw ring\n");
> +		return;
> +	}
> +
> +	spin_lock(&mux->lock);
> +	e->sw_cptr = e->sw_wptr;
> +	e->sw_wptr = wptr;
> +	e->start_ptr_in_hw_ring = mux->real_ring->wptr;
> +
> +	amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
> +	e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> +	amdgpu_ring_commit(mux->real_ring);
> +
> +	spin_unlock(&mux->lock);
> +}
> +
> +u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_mux_entry *e;
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("cannot find entry for sw ring\n");
> +		return 0;
> +	}
> +
> +	return e->sw_wptr;
> +}
> +
> +/*
> + * The return value of the readptr is not precise while the other rings could
> + * write data onto the real ring buffer. After overwriting on the real ring, we
> + * can not decide if our packages have been executed or not read yet. However,
> + * this function is only called by the tools such as umr to collect the latest
> + * packages for the hang analysis. We assume the hang happens near our latest
> + * submit. Thus we could use the following logic to give the clue:
> + * If the readptr is between start and end, then we return the copy pointer
> + * plus the distance from start to readptr. If the readptr is before start, we
> + * return the copy pointer. Lastly, if the readptr is past end, we return the
> + * write pointer.
> + */
> +u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_mux_entry *e;
> +	u64 readp, offset, start, end;
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("no sw entry found!\n");
> +		return 0;
> +	}
> +
> +	readp = amdgpu_ring_get_rptr(mux->real_ring);
> +
> +	start = e->start_ptr_in_hw_ring & mux->real_ring->buf_mask;
> +	end = e->end_ptr_in_hw_ring & mux->real_ring->buf_mask;
> +	if (start > end) {
> +		if (readp <= end)
> +			readp += mux->real_ring->ring_size >> 2;
> +		end += mux->real_ring->ring_size >> 2;
> +	}
> +
> +	if (start <= readp && readp <= end) {
> +		offset = readp - start;
> +		e->sw_rptr = (e->sw_cptr + offset) & ring->buf_mask;
> +	} else if (readp < start) {
> +		e->sw_rptr = e->sw_cptr;
> +	} else {
> +		/* end < readptr */
> +		e->sw_rptr = e->sw_wptr;
> +	}
> +
> +	return e->sw_rptr;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> new file mode 100644
> index 000000000000..8c1691e11b1c
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> @@ -0,0 +1,68 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __AMDGPU_RING_MUX__
> +#define __AMDGPU_RING_MUX__
> +
> +#include <linux/timer.h>
> +#include <linux/spinlock.h>
> +#include "amdgpu_ring.h"
> +
> +struct amdgpu_ring;
> +/**
> + * struct amdgpu_mux_entry - the entry recording software rings copying information.
> + * @ring: the pointer to the software ring.
> + * @start_ptr_in_hw_ring: last start location copied to in the hardware ring.
> + * @end_ptr_in_hw_ring: last end location copied to in the hardware ring.
> + * @sw_cptr: the position of the copy pointer in the sw ring.
> + * @sw_rptr: the read pointer in software ring.
> + * @sw_wptr: the write pointer in software ring.
> + */
> +struct amdgpu_mux_entry {
> +	struct                  amdgpu_ring *ring;
> +	u64                     start_ptr_in_hw_ring;
> +	u64                     end_ptr_in_hw_ring;
> +	u64                     sw_cptr;
> +	u64                     sw_rptr;
> +	u64                     sw_wptr;
> +};
> +
> +struct amdgpu_ring_mux {
> +	struct amdgpu_ring      *real_ring;
> +
> +	struct amdgpu_mux_entry *ring_entry;
> +	unsigned int            num_ring_entries;
> +	unsigned int            ring_entry_size;
> +	/*the lock for copy data from different software rings*/
> +	spinlock_t              lock;
> +};
> +
> +int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
> +			 unsigned int entry_size);
> +void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux);
> +int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr);
> +u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +
> +#endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
> new file mode 100644
> index 000000000000..5ae12d6641ca
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
> @@ -0,0 +1,61 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + * All Rights Reserved.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the
> + * "Software"), to deal in the Software without restriction, including
> + * without limitation the rights to use, copy, modify, merge, publish,
> + * distribute, sub license, and/or sell copies of the Software, and to
> + * permit persons to whom the Software is furnished to do so, subject to
> + * the following conditions:
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
> + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
> + * USE OR OTHER DEALINGS IN THE SOFTWARE.
> + *
> + * The above copyright notice and this permission notice (including the
> + * next paragraph) shall be included in all copies or substantial portions
> + * of the Software.
> + *
> + */
> +
> +#include "amdgpu_sw_ring.h"
> +#include "amdgpu_ring_mux.h"
> +
> +u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +	WARN_ON(!ring->is_sw_ring);
> +	return amdgpu_ring_mux_get_rptr(mux, ring);
> +}
> +
> +u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +	WARN_ON(!ring->is_sw_ring);
> +	return amdgpu_ring_mux_get_wptr(mux, ring);
> +}
> +
> +void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +	WARN_ON(!ring->is_sw_ring);
> +	amdgpu_ring_mux_set_wptr(mux, ring, ring->wptr);
> +}
> +
> +/* Override insert_nop to prevent emitting nops to the software rings */
> +void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
> +{
> +	WARN_ON(!ring->is_sw_ring);
> +}

Putting that into a separate file seems to be overkill.

Please either move this into amdgpu_gfx.c or the ring mux code directly.

Apart from that the patch looks good to me.

Regards,
Christian.

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.h
> new file mode 100644
> index 000000000000..a66524b8b36e
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.h
> @@ -0,0 +1,38 @@
> +/*
> + * Copyright 2012 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#include "amdgpu_ring.h"
> +#include "amdgpu.h"
> +
> +#ifndef __AMDGPU_SWRING_H__
> +#define __AMDGPU_SWRING_H__
> +
> +u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring);
> +u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring);
> +void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring);
> +
> +void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
> +void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring);
> +void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring);
> +
> +#endif


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/4] drm/amdgpu: Add software ring callbacks for gfx9 (v6)
  2022-09-23 13:16 ` [PATCH 2/4] drm/amdgpu: Add software ring callbacks for gfx9 (v6) jiadong.zhu
@ 2022-09-26  6:43   ` Christian König
  0 siblings, 0 replies; 15+ messages in thread
From: Christian König @ 2022-09-26  6:43 UTC (permalink / raw)
  To: jiadong.zhu, amd-gfx; +Cc: Andrey Grodzovsky, Luben Tuikov

Am 23.09.22 um 15:16 schrieb jiadong.zhu@amd.com:
> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>
> Set ring functions with software ring callbacks on gfx9.
>
> The software ring could be tested by debugfs_test_ib case.
>
> v2: Set sw_ring 2 to enable software ring by default.
> v3: Remove the parameter for software ring enablement.
> v4: Use amdgpu_ring_init/fini for software rings.
> v5: Update for code format. Fix conflict.
> v6: Remove unnecessary checks and enable software ring on gfx9 by default.
>
> Acked-by: Luben Tuikov <luben.tuikov@amd.com>
> Cc: Christian Koenig <Christian.Koenig@amd.com>
> Cc: Luben Tuikov <Luben.Tuikov@amd.com>
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |   1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 108 ++++++++++++++++++++++-
>   3 files changed, 109 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 9996dadb39f7..4fdfc3ec134a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -348,6 +348,7 @@ struct amdgpu_gfx {
>   
>   	bool				is_poweron;
>   
> +	struct amdgpu_ring		sw_gfx_ring[AMDGPU_MAX_SW_GFX_RINGS];
>   	struct amdgpu_ring_mux          muxer;
>   };
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 40b1277b4f0c..f08ee1ac281c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -39,6 +39,7 @@ struct amdgpu_vm;
>   #define AMDGPU_MAX_RINGS		28
>   #define AMDGPU_MAX_HWIP_RINGS		8
>   #define AMDGPU_MAX_GFX_RINGS		2
> +#define AMDGPU_MAX_SW_GFX_RINGS         2
>   #define AMDGPU_MAX_COMPUTE_RINGS	8
>   #define AMDGPU_MAX_VCE_RINGS		3
>   #define AMDGPU_MAX_UVD_ENC_RINGS	2
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 5349ca4d19e3..e688665cd1e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -47,6 +47,7 @@
>   
>   #include "amdgpu_ras.h"
>   
> +#include "amdgpu_sw_ring.h"
>   #include "gfx_v9_4.h"
>   #include "gfx_v9_0.h"
>   #include "gfx_v9_4_2.h"
> @@ -56,6 +57,7 @@
>   #include "asic_reg/gc/gc_9_0_default.h"
>   
>   #define GFX9_NUM_GFX_RINGS     1
> +#define GFX9_NUM_SW_GFX_RINGS  2
>   #define GFX9_MEC_HPD_SIZE 4096
>   #define RLCG_UCODE_LOADING_START_ADDRESS 0x00002000L
>   #define RLC_SAVE_RESTORE_ADDR_STARTING_OFFSET 0x00000000L
> @@ -2273,6 +2275,7 @@ static int gfx_v9_0_sw_init(void *handle)
>   	struct amdgpu_ring *ring;
>   	struct amdgpu_kiq *kiq;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +	unsigned int hw_prio;
>   
>   	switch (adev->ip_versions[GC_HWIP][0]) {
>   	case IP_VERSION(9, 0, 1):
> @@ -2356,6 +2359,9 @@ static int gfx_v9_0_sw_init(void *handle)
>   			sprintf(ring->name, "gfx_%d", i);
>   		ring->use_doorbell = true;
>   		ring->doorbell_index = adev->doorbell_index.gfx_ring0 << 1;
> +
> +		/* disable scheduler on the real ring */
> +		ring->no_scheduler = true;
>   		r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq,
>   				     AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP,
>   				     AMDGPU_RING_PRIO_DEFAULT, NULL);
> @@ -2363,6 +2369,42 @@ static int gfx_v9_0_sw_init(void *handle)
>   			return r;
>   	}
>   
> +	/* set up the software rings */
> +	for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++) {
> +		ring = &adev->gfx.sw_gfx_ring[i];
> +		ring->ring_obj = NULL;
> +		if (!i)
> +			sprintf(ring->name, "gfx_sw");
> +		else
> +			sprintf(ring->name, "gfx_sw_%d", i);

I think we should use something like gfx_low/gfx_high for the ring name 
here.

That this is implemented by a sw muxer is pretty much irrelevant for 
userspace.

Maybe use a static array for the names or something like this.

Apart from that looks good to me.

Regards,
Christian.

> +		ring->use_doorbell = true;
> +		ring->doorbell_index = adev->doorbell_index.gfx_ring0 << 1;
> +		ring->is_sw_ring = true;
> +		hw_prio = (i == 1) ? AMDGPU_RING_PRIO_2 :
> +			AMDGPU_RING_PRIO_DEFAULT;
> +		r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq,
> +				     AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP, hw_prio,
> +				     NULL);
> +		if (r)
> +			return r;
> +		ring->wptr = 0;
> +	}
> +
> +	/* init the muxer and add software rings */
> +	r = amdgpu_ring_mux_init(&adev->gfx.muxer, &adev->gfx.gfx_ring[0],
> +				 GFX9_NUM_SW_GFX_RINGS);
> +	if (r) {
> +		DRM_ERROR("amdgpu_ring_mux_init failed(%d)\n", r);
> +		return r;
> +	}
> +	for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++) {
> +		r = amdgpu_ring_mux_add_sw_ring(&adev->gfx.muxer, &adev->gfx.sw_gfx_ring[i]);
> +		if (r) {
> +			DRM_ERROR("amdgpu_ring_mux_add_sw_ring failed(%d)\n", r);
> +			return r;
> +		}
> +	}
> +
>   	/* set up the compute queues - allocate horizontally across pipes */
>   	ring_id = 0;
>   	for (i = 0; i < adev->gfx.mec.num_mec; ++i) {
> @@ -2413,6 +2455,10 @@ static int gfx_v9_0_sw_fini(void *handle)
>   	int i;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
> +		amdgpu_ring_fini(&adev->gfx.sw_gfx_ring[i]);
> +	amdgpu_ring_mux_fini(&adev->gfx.muxer);
> +
>   	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>   		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
>   	for (i = 0; i < adev->gfx.num_compute_rings; i++)
> @@ -5877,7 +5923,9 @@ static int gfx_v9_0_eop_irq(struct amdgpu_device *adev,
>   
>   	switch (me_id) {
>   	case 0:
> -		amdgpu_fence_process(&adev->gfx.gfx_ring[0]);
> +		/* Fence signals are handled on the software rings*/
> +		for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
> +			amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
>   		break;
>   	case 1:
>   	case 2:
> @@ -6882,6 +6930,61 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>   	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
>   };
>   
> +static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
> +	.type = AMDGPU_RING_TYPE_GFX,
> +	.align_mask = 0xff,
> +	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
> +	.support_64bit_ptrs = true,
> +	.secure_submission_supported = true,
> +	.vmhub = AMDGPU_GFXHUB_0,
> +	.get_rptr = amdgpu_sw_ring_get_rptr_gfx,
> +	.get_wptr = amdgpu_sw_ring_get_wptr_gfx,
> +	.set_wptr = amdgpu_sw_ring_set_wptr_gfx,
> +	.emit_frame_size = /* totally 242 maximum if 16 IBs */
> +		5 +  /* COND_EXEC */
> +		7 +  /* PIPELINE_SYNC */
> +		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		2 + /* VM_FLUSH */
> +		8 +  /* FENCE for VM_FLUSH */
> +		20 + /* GDS switch */
> +		4 + /* double SWITCH_BUFFER,
> +		     * the first COND_EXEC jump to the place just
> +		     * prior to this double SWITCH_BUFFER
> +		     */
> +		5 + /* COND_EXEC */
> +		7 +	 /*	HDP_flush */
> +		4 +	 /*	VGT_flush */
> +		14 + /*	CE_META */
> +		31 + /*	DE_META */
> +		3 + /* CNTX_CTRL */
> +		5 + /* HDP_INVL */
> +		8 + 8 + /* FENCE x2 */
> +		2 + /* SWITCH_BUFFER */
> +		7, /* gfx_v9_0_emit_mem_sync */
> +	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_gfx */
> +	.emit_ib = gfx_v9_0_ring_emit_ib_gfx,
> +	.emit_fence = gfx_v9_0_ring_emit_fence,
> +	.emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
> +	.emit_vm_flush = gfx_v9_0_ring_emit_vm_flush,
> +	.emit_gds_switch = gfx_v9_0_ring_emit_gds_switch,
> +	.emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
> +	.test_ring = gfx_v9_0_ring_test_ring,
> +	.test_ib = gfx_v9_0_ring_test_ib,
> +	.insert_nop = amdgpu_sw_ring_insert_nop,
> +	.pad_ib = amdgpu_ring_generic_pad_ib,
> +	.emit_switch_buffer = gfx_v9_ring_emit_sb,
> +	.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
> +	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
> +	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
> +	.emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
> +	.emit_wreg = gfx_v9_0_ring_emit_wreg,
> +	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> +	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> +	.soft_recovery = gfx_v9_0_ring_soft_recovery,
> +	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
> +};
> +
>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   	.type = AMDGPU_RING_TYPE_COMPUTE,
>   	.align_mask = 0xff,
> @@ -6959,6 +7062,9 @@ static void gfx_v9_0_set_ring_funcs(struct amdgpu_device *adev)
>   	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>   		adev->gfx.gfx_ring[i].funcs = &gfx_v9_0_ring_funcs_gfx;
>   
> +	for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
> +		adev->gfx.sw_gfx_ring[i].funcs = &gfx_v9_0_sw_ring_funcs_gfx;
> +
>   	for (i = 0; i < adev->gfx.num_compute_rings; i++)
>   		adev->gfx.compute_ring[i].funcs = &gfx_v9_0_ring_funcs_compute;
>   }


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)
  2022-09-23 13:16 ` [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6) jiadong.zhu
@ 2022-09-26  6:49   ` Christian König
  2022-09-27  3:18     ` Zhu, Jiadong
  0 siblings, 1 reply; 15+ messages in thread
From: Christian König @ 2022-09-26  6:49 UTC (permalink / raw)
  To: jiadong.zhu, amd-gfx; +Cc: Andrey Grodzovsky, Luben Tuikov, Christian Koenig

Am 23.09.22 um 15:16 schrieb jiadong.zhu@amd.com:
> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>
> Trigger Mid-Command Buffer Preemption according to the priority of the software
> rings and the hw fence signalling condition.
>
> The muxer saves the locations of the indirect buffer frames from the software
> ring together with the fence sequence number in its fifo queue, and pops out
> those records when the fences are signalled. The locations are used to resubmit
> packages in preemption scenarios by coping the chunks from the software ring.
>
> v2: Update comment style.
> v3: Fix conflict caused by previous modifications.
> v4: Remove unnecessary prints.
> v5: Fix corner cases for resubmission cases.
> v6: Refactor functions for resubmission, calling fence_process in irq handler.
>
> Cc: Christian Koenig <Christian.Koenig@amd.com>
> Cc: Luben Tuikov <Luben.Tuikov@amd.com>
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
> Acked-by: Luben Tuikov <luben.tuikov@amd.com>
> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>

I need more time for an in-depth review of this, but from the one-mile-high 
view it looks correct to me now.

Can we do some pre-commit qa testing with this?

Thanks,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c       |   2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c     |  13 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   3 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 323 ++++++++++++++++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  30 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c  |  26 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c       |   2 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c        |  10 +-
>   8 files changed, 368 insertions(+), 41 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 258cffe3c06a..af86d87e2f3b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -211,6 +211,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   		}
>   	}
>   
> +	amdgpu_ring_ib_begin(ring);
>   	if (job && ring->funcs->init_cond_exec)
>   		patch_offset = amdgpu_ring_init_cond_exec(ring);
>   
> @@ -285,6 +286,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   	    ring->hw_prio == AMDGPU_GFX_PIPE_PRIO_HIGH)
>   		ring->funcs->emit_wave_limit(ring, false);
>   
> +	amdgpu_ring_ib_end(ring);
>   	amdgpu_ring_commit(ring);
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 13db99d653bd..84b0b3c7d40f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -33,6 +33,7 @@
>   
>   #include <drm/amdgpu_drm.h>
>   #include "amdgpu.h"
> +#include "amdgpu_sw_ring.h"
>   #include "atom.h"
>   
>   /*
> @@ -569,3 +570,15 @@ int amdgpu_ring_init_mqd(struct amdgpu_ring *ring)
>   
>   	return mqd_mgr->init_mqd(adev, ring->mqd_ptr, &prop);
>   }
> +
> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring)
> +{
> +	if (ring->is_sw_ring)
> +		amdgpu_sw_ring_ib_begin(ring);
> +}
> +
> +void amdgpu_ring_ib_end(struct amdgpu_ring *ring)
> +{
> +	if (ring->is_sw_ring)
> +		amdgpu_sw_ring_ib_end(ring);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index e90d327a589e..6fbc1627dab7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -312,6 +312,9 @@ struct amdgpu_ring {
>   #define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r)
>   
>   int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring);
> +void amdgpu_ring_ib_end(struct amdgpu_ring *ring);
> +
>   void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
>   void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
>   void amdgpu_ring_commit(struct amdgpu_ring *ring);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> index 662aadebf111..788567e3b743 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> @@ -28,23 +28,146 @@
>   
>   #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
>   
> +static struct kmem_cache *amdgpu_mux_chunk_slab;
> +
> +static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
> +								struct amdgpu_ring *ring)
> +{
> +	return ring->entry_index < mux->ring_entry_size ?
> +			&mux->ring_entry[ring->entry_index] : NULL;
> +}
> +
> +/* copy packages on sw ring range[begin, end) */
> +static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
> +						  struct amdgpu_ring *ring,
> +						  u64 s_start, u64 s_end)
> +{
> +	u64 start, end;
> +	struct amdgpu_ring *real_ring = mux->real_ring;
> +
> +	start = s_start & ring->buf_mask;
> +	end = s_end & ring->buf_mask;
> +
> +	if (start == end) {
> +		DRM_ERROR("no more data copied from sw ring\n");
> +		return;
> +	}
> +	if (start > end) {
> +		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
> +					   (ring->ring_size >> 2) - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
> +	} else {
> +		amdgpu_ring_alloc(real_ring, end - start);
> +		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
> +	}
> +}
> +
> +static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux, bool is_fallback)
> +{
> +	struct amdgpu_mux_entry *e = NULL;
> +	struct amdgpu_mux_chunk *chunk;
> +	uint32_t seq, last_seq;
> +	int i;
> +
> +	if (is_fallback) {
> +		if (!spin_trylock(&mux->lock)) {
> +			amdgpu_ring_mux_schedule_resubmit(mux);
> +			DRM_ERROR("reschedule resubmit\n");
> +			return;
> +		}
> +	} else {
> +		spin_lock(&mux->lock);
> +	}
> +
> +	/*find low priority entries:*/
> +	if (!mux->s_resubmit) {
> +		spin_unlock(&mux->lock);
> +		return;
> +	}
> +
> +	for (i = 0; i < mux->num_ring_entries; i++) {
> +		if (mux->ring_entry[i].ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
> +			e = &mux->ring_entry[i];
> +			break;
> +		}
> +	}
> +
> +	if (!e) {
> +		DRM_ERROR("%s no low priority ring found\n", __func__);
> +		spin_unlock(&mux->lock);
> +		return;
> +	}
> +
> +	last_seq = atomic_read(&e->ring->fence_drv.last_seq);
> +	seq = mux->seqno_to_resubmit;
> +	if (last_seq < seq) {
> +		/*resubmit all the fences between (last_seq, seq]*/
> +		list_for_each_entry(chunk, &e->list, entry) {
> +			if (chunk->sync_seq > last_seq && chunk->sync_seq <= seq) {
> +				amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, e->ring,
> +								      chunk->start,
> +								      chunk->end);
> +				mux->wptr_resubmit = chunk->end;
> +				amdgpu_ring_commit(mux->real_ring);
> +			}
> +		}
> +	}
> +
> +	del_timer(&mux->resubmit_timer);
> +	mux->s_resubmit = false;
> +	spin_unlock(&mux->lock);
> +}
> +
> +static void amdgpu_mux_resubmit_fallback(struct timer_list *t)
> +{
> +	struct amdgpu_ring_mux *mux = from_timer(mux, t, resubmit_timer);
> +
> +	DRM_INFO("calling %s\n", __func__);
> +	amdgpu_mux_resubmit_chunks(mux, true);
> +}
> +
>   int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
>   			 unsigned int entry_size)
>   {
>   	mux->real_ring = ring;
>   	mux->num_ring_entries = 0;
> +
>   	mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
>   	if (!mux->ring_entry)
>   		return -ENOMEM;
>   
>   	mux->ring_entry_size = entry_size;
> +	mux->s_resubmit = false;
> +
> +	amdgpu_mux_chunk_slab = kmem_cache_create("amdgpu_mux_chunk",
> +						  sizeof(struct amdgpu_mux_chunk), 0,
> +						  SLAB_HWCACHE_ALIGN, NULL);
> +	if (!amdgpu_mux_chunk_slab) {
> +		DRM_ERROR("create amdgpu_mux_chunk cache failed\n");
> +		return -ENOMEM;
> +	}
> +
>   	spin_lock_init(&mux->lock);
> +	timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback, 0);
>   
>   	return 0;
>   }
>   
>   void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
>   {
> +	struct amdgpu_mux_entry *e;
> +	struct amdgpu_mux_chunk *chunk, *chunk2;
> +	int i;
> +
> +	for (i = 0; i < mux->num_ring_entries; i++) {
> +		e = &mux->ring_entry[i];
> +		list_for_each_entry_safe(chunk, chunk2, &e->list, entry) {
> +			list_del(&chunk->entry);
> +			kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
> +		}
> +	}
> +	kmem_cache_destroy(amdgpu_mux_chunk_slab);
>   	kfree(mux->ring_entry);
>   	mux->ring_entry = NULL;
>   	mux->num_ring_entries = 0;
> @@ -64,62 +187,46 @@ int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring
>   	ring->entry_index = mux->num_ring_entries;
>   	e->ring = ring;
>   
> +	INIT_LIST_HEAD(&e->list);
>   	mux->num_ring_entries += 1;
>   	return 0;
>   }
>   
> -static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
> -								struct amdgpu_ring *ring)
> -{
> -	return ring->entry_index < mux->ring_entry_size ?
> -			&mux->ring_entry[ring->entry_index] : NULL;
> -}
> -
> -/* copy packages on sw ring range[begin, end) */
> -static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
> -						  struct amdgpu_ring *ring,
> -						  u64 s_start, u64 s_end)
> -{
> -	u64 start, end;
> -	struct amdgpu_ring *real_ring = mux->real_ring;
> -
> -	start = s_start & ring->buf_mask;
> -	end = s_end & ring->buf_mask;
> -
> -	if (start == end) {
> -		DRM_ERROR("no more data copied from sw ring\n");
> -		return;
> -	}
> -	if (start > end) {
> -		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
> -		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
> -					   (ring->ring_size >> 2) - start);
> -		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
> -	} else {
> -		amdgpu_ring_alloc(real_ring, end - start);
> -		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
> -	}
> -}
> -
>   void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
>   {
>   	struct amdgpu_mux_entry *e;
>   
> +	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
> +		amdgpu_mux_resubmit_chunks(mux, false);
> +
>   	e = amdgpu_ring_mux_sw_entry(mux, ring);
>   	if (!e) {
>   		DRM_ERROR("cannot find entry for sw ring\n");
>   		return;
>   	}
>   
> +	/* We could skip this set wptr as preemption in process. */
> +	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && mux->pending_trailing_fence_signaled) {
> +		DRM_ERROR("amdgpu_ring_mux_set_wptr skipped\n");
> +		return;
> +	}
> +
>   	spin_lock(&mux->lock);
>   	e->sw_cptr = e->sw_wptr;
> +	/* Update cptr if the package already copied in resubmit functions */
> +	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && e->sw_cptr < mux->wptr_resubmit)
> +		e->sw_cptr = mux->wptr_resubmit;
>   	e->sw_wptr = wptr;
>   	e->start_ptr_in_hw_ring = mux->real_ring->wptr;
>   
> -	amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
> -	e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> -	amdgpu_ring_commit(mux->real_ring);
> -
> +	/* Skip copying for the packages already resubmitted.*/
> +	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT || mux->wptr_resubmit < wptr) {
> +		amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
> +		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> +		amdgpu_ring_commit(mux->real_ring);
> +	} else {
> +		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> +	}
>   	spin_unlock(&mux->lock);
>   }
>   
> @@ -181,3 +288,145 @@ u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ri
>   
>   	return e->sw_rptr;
>   }
> +
> +void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux)
> +{
> +	mod_timer(&mux->resubmit_timer, jiffies + AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT);
> +}
> +
> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_mux_entry *e;
> +	struct amdgpu_mux_chunk *chunk;
> +
> +	amdgpu_mux_resubmit_chunks(mux, false);
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("cannot find entry!\n");
> +		return;
> +	}
> +
> +	chunk = kmem_cache_alloc(amdgpu_mux_chunk_slab, GFP_KERNEL);
> +	if (!chunk) {
> +		DRM_ERROR("alloc amdgpu_mux_chunk_slab failed\n");
> +		return;
> +	}
> +
> +	chunk->start = ring->wptr;
> +	list_add_tail(&chunk->entry, &e->list);
> +}
> +
> +static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	uint32_t last_seq, size = 0;
> +	struct amdgpu_mux_entry *e;
> +	struct amdgpu_mux_chunk *chunk, *tmp;
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("cannot find entry!\n");
> +		return;
> +	}
> +
> +	last_seq = atomic_read(&ring->fence_drv.last_seq);
> +
> +	list_for_each_entry_safe(chunk, tmp, &e->list, entry) {
> +		if (chunk->sync_seq <= last_seq) {
> +			list_del(&chunk->entry);
> +			kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
> +		} else {
> +			size++;
> +		}
> +	}
> +}
> +
> +void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_mux_entry *e;
> +	struct amdgpu_mux_chunk *chunk;
> +
> +	e = amdgpu_ring_mux_sw_entry(mux, ring);
> +	if (!e) {
> +		DRM_ERROR("cannot find entry!\n");
> +		return;
> +	}
> +
> +	chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);
> +	if (!chunk) {
> +		DRM_ERROR("cannot find chunk!\n");
> +		return;
> +	}
> +
> +	chunk->end = ring->wptr;
> +	chunk->sync_seq = READ_ONCE(ring->fence_drv.sync_seq);
> +
> +	scan_and_remove_signaled_chunk(mux, ring);
> +}
> +
> +/* Trigger Mid-Command Buffer Preemption (MCBP) and find if we need to resubmit. */
> +int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux)
> +{
> +	int r;
> +
> +	spin_lock(&mux->lock);
> +	mux->pending_trailing_fence_signaled = true;
> +	r = amdgpu_ring_preempt_ib(mux->real_ring);
> +	spin_unlock(&mux->lock);
> +	return r;
> +}
> +
> +bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux)
> +{
> +	struct amdgpu_mux_entry *e;
> +	struct amdgpu_ring *ring = NULL;
> +	int i;
> +
> +	if (!mux->pending_trailing_fence_signaled)
> +		return false;
> +
> +	if (mux->real_ring->trail_seq != le32_to_cpu(*mux->real_ring->trail_fence_cpu_addr))
> +		return false;
> +
> +	for (i = 0; i < mux->num_ring_entries; i++) {
> +		e = &mux->ring_entry[i];
> +		if (e->ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
> +			ring = e->ring;
> +			break;
> +		}
> +	}
> +
> +	if (!ring) {
> +		DRM_ERROR("cannot find low priority ring\n");
> +		return false;
> +	}
> +
> +	amdgpu_fence_process(ring);
> +	if (amdgpu_fence_count_emitted(ring) > 0) {
> +		mux->s_resubmit = true;
> +		mux->seqno_to_resubmit = ring->fence_drv.sync_seq;
> +		amdgpu_ring_mux_schedule_resubmit(mux);
> +	}
> +
> +	mux->pending_trailing_fence_signaled = false;
> +	return true;
> +}
> +
> +/*scan on low prio rings to have unsignaled fence and high ring has no fence.*/
> +int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux)
> +{
> +	struct amdgpu_ring *ring;
> +	int i, need_preempt;
> +
> +	need_preempt = 0;
> +	for (i = 0; i < mux->num_ring_entries; i++) {
> +		ring = mux->ring_entry[i].ring;
> +		if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT &&
> +			amdgpu_fence_count_emitted(ring) > 0)
> +			return 0;
> +		if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT &&
> +			amdgpu_fence_count_emitted(ring) > 0)
> +			need_preempt = 1;
> +	}
> +	return need_preempt && !mux->s_resubmit;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> index 8c1691e11b1c..bf8f5ca61605 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> @@ -29,6 +29,7 @@
>   #include "amdgpu_ring.h"
>   
>   struct amdgpu_ring;
> +
>   /**
>    * struct amdgpu_mux_entry - the entry recording software rings copying information.
>    * @ring: the pointer to the software ring.
> @@ -37,6 +38,7 @@ struct amdgpu_ring;
>    * @sw_cptr: the position of the copy pointer in the sw ring.
>    * @sw_rptr: the read pointer in software ring.
>    * @sw_wptr: the write pointer in software ring.
> + * @list: list head for amdgpu_mux_chunk
>    */
>   struct amdgpu_mux_entry {
>   	struct                  amdgpu_ring *ring;
> @@ -45,6 +47,7 @@ struct amdgpu_mux_entry {
>   	u64                     sw_cptr;
>   	u64                     sw_rptr;
>   	u64                     sw_wptr;
> +	struct list_head        list;
>   };
>   
>   struct amdgpu_ring_mux {
> @@ -55,6 +58,26 @@ struct amdgpu_ring_mux {
>   	unsigned int            ring_entry_size;
>   	/*the lock for copy data from different software rings*/
>   	spinlock_t              lock;
> +	bool                    s_resubmit;
> +	uint32_t                seqno_to_resubmit;
> +	u64                     wptr_resubmit;
> +	struct timer_list       resubmit_timer;
> +
> +	bool                    pending_trailing_fence_signaled;
> +};
> +
> +/**
> + * struct amdgpu_mux_chunk - save the location of indirect buffer's package on softare rings.
> + * @entry: the list entry.
> + * @sync_seq: the fence seqno related with the saved IB.
> + * @start:- start location on the software ring.
> + * @end:- end location on the software ring.
> + */
> +struct amdgpu_mux_chunk {
> +	struct list_head        entry;
> +	uint32_t                sync_seq;
> +	u64                     start;
> +	u64                     end;
>   };
>   
>   int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
> @@ -65,4 +88,11 @@ void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
>   u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
>   u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
>   
> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
> +void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux);
> +
> +int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux);
> +int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux);
> +bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux);
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
> index 5ae12d6641ca..a3ec7bdf72a6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
> @@ -59,3 +59,29 @@ void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
>   {
>   	WARN_ON(!ring->is_sw_ring);
>   }
> +
> +void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +	WARN_ON(!ring->is_sw_ring);
> +	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
> +		if (amdgpu_mcbp_scan(mux) > 0)
> +			amdgpu_mcbp_trigger_preempt(mux);
> +		return;
> +	}
> +
> +	amdgpu_ring_mux_start_ib(mux, ring);
> +}
> +
> +void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +	WARN_ON(!ring->is_sw_ring);
> +	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
> +		return;
> +	amdgpu_ring_mux_end_ib(mux, ring);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 9596c22fded6..b7e94553f4fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -601,6 +601,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>   	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>   		return 0;
>   
> +	amdgpu_ring_ib_begin(ring);
>   	if (ring->funcs->init_cond_exec)
>   		patch_offset = amdgpu_ring_init_cond_exec(ring);
>   
> @@ -661,6 +662,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>   		amdgpu_ring_emit_switch_buffer(ring);
>   		amdgpu_ring_emit_switch_buffer(ring);
>   	}
> +	amdgpu_ring_ib_end(ring);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 669532f658da..1620300f0dde 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -5619,7 +5619,7 @@ static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
>   	ring->trail_seq += 1;
>   	amdgpu_ring_alloc(ring, 13);
>   	gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
> -				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
> +				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC | AMDGPU_FENCE_FLAG_INT);
>   	/*reset the CP_VMID_PREEMPT after trailing fence*/
>   	amdgpu_ring_emit_wreg(ring,
>   			      SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT),
> @@ -6045,9 +6045,11 @@ static int gfx_v9_0_eop_irq(struct amdgpu_device *adev,
>   
>   	switch (me_id) {
>   	case 0:
> -		/* Fence signals are handled on the software rings*/
> -		for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
> -			amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
> +		if (!amdgpu_mcbp_handle_trailing_fence_irq(&adev->gfx.muxer)) {
> +			/* Fence signals are handled on the software rings*/
> +			for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
> +				amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
> +		}
>   		break;
>   	case 1:
>   	case 2:


^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)
  2022-09-26  6:49   ` Christian König
@ 2022-09-27  3:18     ` Zhu, Jiadong
  2022-09-27  6:06       ` Christian König
  0 siblings, 1 reply; 15+ messages in thread
From: Zhu, Jiadong @ 2022-09-27  3:18 UTC (permalink / raw)
  To: Christian König, amd-gfx
  Cc: Grodzovsky, Andrey, Tuikov, Luben, Koenig, Christian

[AMD Official Use Only - General]

>I need more time for an in deep review of this, but form the one mile high view it looks correct to me now.

>Can we do some pre-commit qa testing with this?

I changed the drm test "Command submission Test (GFX)" to send high-priority IBs while running Manhattan on screen / Unigine Heaven in the foreground, checking that mcbp/resubmit is triggered via cat /sys/kernel/debug/dri/0/amdgpu_fence_info

I have kept this scenario running for two days and one night; no hangs have happened yet (lots of hangs were fixed by the previous patches).

I will ask QA team to do more test.

Thanks,
Jiadong

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Monday, September 26, 2022 2:49 PM
To: Zhu, Jiadong <Jiadong.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: Re: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)

Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.


Am 23.09.22 um 15:16 schrieb jiadong.zhu@amd.com:
> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>
> Trigger Mid-Command Buffer Preemption according to the priority of the
> software rings and the hw fence signalling condition.
>
> The muxer saves the locations of the indirect buffer frames from the
> software ring together with the fence sequence number in its fifo
> queue, and pops out those records when the fences are signalled. The
> locations are used to resubmit packages in preemption scenarios by coping the chunks from the software ring.
>
> v2: Update comment style.
> v3: Fix conflict caused by previous modifications.
> v4: Remove unnecessary prints.
> v5: Fix corner cases for resubmission cases.
> v6: Refactor functions for resubmission, calling fence_process in irq handler.
>
> Cc: Christian Koenig <Christian.Koenig@amd.com>
> Cc: Luben Tuikov <Luben.Tuikov@amd.com>
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
> Acked-by: Luben Tuikov <luben.tuikov@amd.com>
> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>

I need more time for an in deep review of this, but form the one mile high view it looks correct to me now.

Can we do some pre-commit qa testing with this?

Thanks,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c       |   2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c     |  13 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   3 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 323 ++++++++++++++++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  30 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c  |  26 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c       |   2 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c        |  10 +-
>   8 files changed, 368 insertions(+), 41 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 258cffe3c06a..af86d87e2f3b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -211,6 +211,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>               }
>       }
>
> +     amdgpu_ring_ib_begin(ring);
>       if (job && ring->funcs->init_cond_exec)
>               patch_offset = amdgpu_ring_init_cond_exec(ring);
>
> @@ -285,6 +286,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>           ring->hw_prio == AMDGPU_GFX_PIPE_PRIO_HIGH)
>               ring->funcs->emit_wave_limit(ring, false);
>
> +     amdgpu_ring_ib_end(ring);
>       amdgpu_ring_commit(ring);
>       return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 13db99d653bd..84b0b3c7d40f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -33,6 +33,7 @@
>
>   #include <drm/amdgpu_drm.h>
>   #include "amdgpu.h"
> +#include "amdgpu_sw_ring.h"
>   #include "atom.h"
>
>   /*
> @@ -569,3 +570,15 @@ int amdgpu_ring_init_mqd(struct amdgpu_ring
> *ring)
>
>       return mqd_mgr->init_mqd(adev, ring->mqd_ptr, &prop);
>   }
> +
> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring) {
> +     if (ring->is_sw_ring)
> +             amdgpu_sw_ring_ib_begin(ring); }
> +
> +void amdgpu_ring_ib_end(struct amdgpu_ring *ring) {
> +     if (ring->is_sw_ring)
> +             amdgpu_sw_ring_ib_end(ring); }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index e90d327a589e..6fbc1627dab7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -312,6 +312,9 @@ struct amdgpu_ring {
>   #define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r)
>
>   int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring); void
> +amdgpu_ring_ib_end(struct amdgpu_ring *ring);
> +
>   void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
>   void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
>   void amdgpu_ring_commit(struct amdgpu_ring *ring); diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> index 662aadebf111..788567e3b743 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> @@ -28,23 +28,146 @@
>
>   #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
>
> +static struct kmem_cache *amdgpu_mux_chunk_slab;
> +
> +static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
> +                                                             struct
> +amdgpu_ring *ring) {
> +     return ring->entry_index < mux->ring_entry_size ?
> +                     &mux->ring_entry[ring->entry_index] : NULL; }
> +
> +/* copy packages on sw ring range[begin, end) */ static void
> +amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
> +                                               struct amdgpu_ring *ring,
> +                                               u64 s_start, u64
> +s_end) {
> +     u64 start, end;
> +     struct amdgpu_ring *real_ring = mux->real_ring;
> +
> +     start = s_start & ring->buf_mask;
> +     end = s_end & ring->buf_mask;
> +
> +     if (start == end) {
> +             DRM_ERROR("no more data copied from sw ring\n");
> +             return;
> +     }
> +     if (start > end) {
> +             amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
> +             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
> +                                        (ring->ring_size >> 2) - start);
> +             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
> +     } else {
> +             amdgpu_ring_alloc(real_ring, end - start);
> +             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
> +     }
> +}
> +
> +static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux,
> +bool is_fallback) {
> +     struct amdgpu_mux_entry *e = NULL;
> +     struct amdgpu_mux_chunk *chunk;
> +     uint32_t seq, last_seq;
> +     int i;
> +
> +     if (is_fallback) {
> +             if (!spin_trylock(&mux->lock)) {
> +                     amdgpu_ring_mux_schedule_resubmit(mux);
> +                     DRM_ERROR("reschedule resubmit\n");
> +                     return;
> +             }
> +     } else {
> +             spin_lock(&mux->lock);
> +     }
> +
> +     /*find low priority entries:*/
> +     if (!mux->s_resubmit) {
> +             spin_unlock(&mux->lock);
> +             return;
> +     }
> +
> +     for (i = 0; i < mux->num_ring_entries; i++) {
> +             if (mux->ring_entry[i].ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
> +                     e = &mux->ring_entry[i];
> +                     break;
> +             }
> +     }
> +
> +     if (!e) {
> +             DRM_ERROR("%s no low priority ring found\n", __func__);
> +             spin_unlock(&mux->lock);
> +             return;
> +     }
> +
> +     last_seq = atomic_read(&e->ring->fence_drv.last_seq);
> +     seq = mux->seqno_to_resubmit;
> +     if (last_seq < seq) {
> +             /*resubmit all the fences between (last_seq, seq]*/
> +             list_for_each_entry(chunk, &e->list, entry) {
> +                     if (chunk->sync_seq > last_seq && chunk->sync_seq <= seq) {
> +                             amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, e->ring,
> +                                                                   chunk->start,
> +                                                                   chunk->end);
> +                             mux->wptr_resubmit = chunk->end;
> +                             amdgpu_ring_commit(mux->real_ring);
> +                     }
> +             }
> +     }
> +
> +     del_timer(&mux->resubmit_timer);
> +     mux->s_resubmit = false;
> +     spin_unlock(&mux->lock);
> +}
> +
> +static void amdgpu_mux_resubmit_fallback(struct timer_list *t) {
> +     struct amdgpu_ring_mux *mux = from_timer(mux, t,
> +resubmit_timer);
> +
> +     DRM_INFO("calling %s\n", __func__);
> +     amdgpu_mux_resubmit_chunks(mux, true); }
> +
>   int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
>                        unsigned int entry_size)
>   {
>       mux->real_ring = ring;
>       mux->num_ring_entries = 0;
> +
>       mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
>       if (!mux->ring_entry)
>               return -ENOMEM;
>
>       mux->ring_entry_size = entry_size;
> +     mux->s_resubmit = false;
> +
> +     amdgpu_mux_chunk_slab = kmem_cache_create("amdgpu_mux_chunk",
> +                                               sizeof(struct amdgpu_mux_chunk), 0,
> +                                               SLAB_HWCACHE_ALIGN, NULL);
> +     if (!amdgpu_mux_chunk_slab) {
> +             DRM_ERROR("create amdgpu_mux_chunk cache failed\n");
> +             return -ENOMEM;
> +     }
> +
>       spin_lock_init(&mux->lock);
> +     timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback,
> + 0);
>
>       return 0;
>   }
>
>   void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
>   {
> +     struct amdgpu_mux_entry *e;
> +     struct amdgpu_mux_chunk *chunk, *chunk2;
> +     int i;
> +
> +     for (i = 0; i < mux->num_ring_entries; i++) {
> +             e = &mux->ring_entry[i];
> +             list_for_each_entry_safe(chunk, chunk2, &e->list, entry) {
> +                     list_del(&chunk->entry);
> +                     kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
> +             }
> +     }
> +     kmem_cache_destroy(amdgpu_mux_chunk_slab);
>       kfree(mux->ring_entry);
>       mux->ring_entry = NULL;
>       mux->num_ring_entries = 0;
> @@ -64,62 +187,46 @@ int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring
>       ring->entry_index = mux->num_ring_entries;
>       e->ring = ring;
>
> +     INIT_LIST_HEAD(&e->list);
>       mux->num_ring_entries += 1;
>       return 0;
>   }
>
> -static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
> -                                                             struct amdgpu_ring *ring)
> -{
> -     return ring->entry_index < mux->ring_entry_size ?
> -                     &mux->ring_entry[ring->entry_index] : NULL;
> -}
> -
> -/* copy packages on sw ring range[begin, end) */ -static void
> amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
> -                                               struct amdgpu_ring *ring,
> -                                               u64 s_start, u64 s_end)
> -{
> -     u64 start, end;
> -     struct amdgpu_ring *real_ring = mux->real_ring;
> -
> -     start = s_start & ring->buf_mask;
> -     end = s_end & ring->buf_mask;
> -
> -     if (start == end) {
> -             DRM_ERROR("no more data copied from sw ring\n");
> -             return;
> -     }
> -     if (start > end) {
> -             amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
> -             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
> -                                        (ring->ring_size >> 2) - start);
> -             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
> -     } else {
> -             amdgpu_ring_alloc(real_ring, end - start);
> -             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
> -     }
> -}
> -
>   void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
>   {
>       struct amdgpu_mux_entry *e;
>
> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
> +             amdgpu_mux_resubmit_chunks(mux, false);
> +
>       e = amdgpu_ring_mux_sw_entry(mux, ring);
>       if (!e) {
>               DRM_ERROR("cannot find entry for sw ring\n");
>               return;
>       }
>
> +     /* We could skip this set wptr as preemption in process. */
> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && mux->pending_trailing_fence_signaled) {
> +             DRM_ERROR("amdgpu_ring_mux_set_wptr skipped\n");
> +             return;
> +     }
> +
>       spin_lock(&mux->lock);
>       e->sw_cptr = e->sw_wptr;
> +     /* Update cptr if the package already copied in resubmit functions */
> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && e->sw_cptr < mux->wptr_resubmit)
> +             e->sw_cptr = mux->wptr_resubmit;
>       e->sw_wptr = wptr;
>       e->start_ptr_in_hw_ring = mux->real_ring->wptr;
>
> -     amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
> -     e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> -     amdgpu_ring_commit(mux->real_ring);
> -
> +     /* Skip copying for the packages already resubmitted.*/
> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT || mux->wptr_resubmit < wptr) {
> +             amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
> +             e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> +             amdgpu_ring_commit(mux->real_ring);
> +     } else {
> +             e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> +     }
>       spin_unlock(&mux->lock);
>   }
>
> @@ -181,3 +288,145 @@ u64 amdgpu_ring_mux_get_rptr(struct
> amdgpu_ring_mux *mux, struct amdgpu_ring *ri
>
>       return e->sw_rptr;
>   }
> +
> +void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux) {
> +     mod_timer(&mux->resubmit_timer, jiffies +
> +AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT);
> +}
> +
> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct
> +amdgpu_ring *ring) {
> +     struct amdgpu_mux_entry *e;
> +     struct amdgpu_mux_chunk *chunk;
> +
> +     amdgpu_mux_resubmit_chunks(mux, false);
> +
> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
> +     if (!e) {
> +             DRM_ERROR("cannot find entry!\n");
> +             return;
> +     }
> +
> +     chunk = kmem_cache_alloc(amdgpu_mux_chunk_slab, GFP_KERNEL);
> +     if (!chunk) {
> +             DRM_ERROR("alloc amdgpu_mux_chunk_slab failed\n");
> +             return;
> +     }
> +
> +     chunk->start = ring->wptr;
> +     list_add_tail(&chunk->entry, &e->list); }
> +
> +static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux
> +*mux, struct amdgpu_ring *ring) {
> +     uint32_t last_seq, size = 0;
> +     struct amdgpu_mux_entry *e;
> +     struct amdgpu_mux_chunk *chunk, *tmp;
> +
> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
> +     if (!e) {
> +             DRM_ERROR("cannot find entry!\n");
> +             return;
> +     }
> +
> +     last_seq = atomic_read(&ring->fence_drv.last_seq);
> +
> +     list_for_each_entry_safe(chunk, tmp, &e->list, entry) {
> +             if (chunk->sync_seq <= last_seq) {
> +                     list_del(&chunk->entry);
> +                     kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
> +             } else {
> +                     size++;
> +             }
> +     }
> +}
> +
> +void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct
> +amdgpu_ring *ring) {
> +     struct amdgpu_mux_entry *e;
> +     struct amdgpu_mux_chunk *chunk;
> +
> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
> +     if (!e) {
> +             DRM_ERROR("cannot find entry!\n");
> +             return;
> +     }
> +
> +     chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);
> +     if (!chunk) {
> +             DRM_ERROR("cannot find chunk!\n");
> +             return;
> +     }
> +
> +     chunk->end = ring->wptr;
> +     chunk->sync_seq = READ_ONCE(ring->fence_drv.sync_seq);
> +
> +     scan_and_remove_signaled_chunk(mux, ring); }
> +
> +/* Trigger Mid-Command Buffer Preemption (MCBP) and find if we need
> +to resubmit. */ int amdgpu_mcbp_trigger_preempt(struct
> +amdgpu_ring_mux *mux) {
> +     int r;
> +
> +     spin_lock(&mux->lock);
> +     mux->pending_trailing_fence_signaled = true;
> +     r = amdgpu_ring_preempt_ib(mux->real_ring);
> +     spin_unlock(&mux->lock);
> +     return r;
> +}
> +
> +bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux
> +*mux) {
> +     struct amdgpu_mux_entry *e;
> +     struct amdgpu_ring *ring = NULL;
> +     int i;
> +
> +     if (!mux->pending_trailing_fence_signaled)
> +             return false;
> +
> +     if (mux->real_ring->trail_seq != le32_to_cpu(*mux->real_ring->trail_fence_cpu_addr))
> +             return false;
> +
> +     for (i = 0; i < mux->num_ring_entries; i++) {
> +             e = &mux->ring_entry[i];
> +             if (e->ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
> +                     ring = e->ring;
> +                     break;
> +             }
> +     }
> +
> +     if (!ring) {
> +             DRM_ERROR("cannot find low priority ring\n");
> +             return false;
> +     }
> +
> +     amdgpu_fence_process(ring);
> +     if (amdgpu_fence_count_emitted(ring) > 0) {
> +             mux->s_resubmit = true;
> +             mux->seqno_to_resubmit = ring->fence_drv.sync_seq;
> +             amdgpu_ring_mux_schedule_resubmit(mux);
> +     }
> +
> +     mux->pending_trailing_fence_signaled = false;
> +     return true;
> +}
> +
> +/*scan on low prio rings to have unsignaled fence and high ring has
> +no fence.*/ int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux) {
> +     struct amdgpu_ring *ring;
> +     int i, need_preempt;
> +
> +     need_preempt = 0;
> +     for (i = 0; i < mux->num_ring_entries; i++) {
> +             ring = mux->ring_entry[i].ring;
> +             if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT &&
> +                     amdgpu_fence_count_emitted(ring) > 0)
> +                     return 0;
> +             if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT &&
> +                     amdgpu_fence_count_emitted(ring) > 0)
> +                     need_preempt = 1;
> +     }
> +     return need_preempt && !mux->s_resubmit; }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> index 8c1691e11b1c..bf8f5ca61605 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> @@ -29,6 +29,7 @@
>   #include "amdgpu_ring.h"
>
>   struct amdgpu_ring;
> +
>   /**
>    * struct amdgpu_mux_entry - the entry recording software rings copying information.
>    * @ring: the pointer to the software ring.
> @@ -37,6 +38,7 @@ struct amdgpu_ring;
>    * @sw_cptr: the position of the copy pointer in the sw ring.
>    * @sw_rptr: the read pointer in software ring.
>    * @sw_wptr: the write pointer in software ring.
> + * @list: list head for amdgpu_mux_chunk
>    */
>   struct amdgpu_mux_entry {
>       struct                  amdgpu_ring *ring;
> @@ -45,6 +47,7 @@ struct amdgpu_mux_entry {
>       u64                     sw_cptr;
>       u64                     sw_rptr;
>       u64                     sw_wptr;
> +     struct list_head        list;
>   };
>
>   struct amdgpu_ring_mux {
> @@ -55,6 +58,26 @@ struct amdgpu_ring_mux {
>       unsigned int            ring_entry_size;
>       /*the lock for copy data from different software rings*/
>       spinlock_t              lock;
> +     bool                    s_resubmit;
> +     uint32_t                seqno_to_resubmit;
> +     u64                     wptr_resubmit;
> +     struct timer_list       resubmit_timer;
> +
> +     bool                    pending_trailing_fence_signaled;
> +};
> +
> +/**
> + * struct amdgpu_mux_chunk - save the location of indirect buffer's package on softare rings.
> + * @entry: the list entry.
> + * @sync_seq: the fence seqno related with the saved IB.
> + * @start:- start location on the software ring.
> + * @end:- end location on the software ring.
> + */
> +struct amdgpu_mux_chunk {
> +     struct list_head        entry;
> +     uint32_t                sync_seq;
> +     u64                     start;
> +     u64                     end;
>   };
>
>   int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct
> amdgpu_ring *ring, @@ -65,4 +88,11 @@ void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
>   u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
>   u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct
> amdgpu_ring *ring);
>
> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct
> +amdgpu_ring *ring); void amdgpu_ring_mux_end_ib(struct
> +amdgpu_ring_mux *mux, struct amdgpu_ring *ring); void
> +amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux);
> +
> +int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux); int
> +amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux); bool
> +amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux);
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
> index 5ae12d6641ca..a3ec7bdf72a6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
> @@ -59,3 +59,29 @@ void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
>   {
>       WARN_ON(!ring->is_sw_ring);
>   }
> +
> +void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring) {
> +     struct amdgpu_device *adev = ring->adev;
> +     struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +     WARN_ON(!ring->is_sw_ring);
> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
> +             if (amdgpu_mcbp_scan(mux) > 0)
> +                     amdgpu_mcbp_trigger_preempt(mux);
> +             return;
> +     }
> +
> +     amdgpu_ring_mux_start_ib(mux, ring); }
> +
> +void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring) {
> +     struct amdgpu_device *adev = ring->adev;
> +     struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +     WARN_ON(!ring->is_sw_ring);
> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
> +             return;
> +     amdgpu_ring_mux_end_ib(mux, ring); }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 9596c22fded6..b7e94553f4fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -601,6 +601,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>       if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>               return 0;
>
> +     amdgpu_ring_ib_begin(ring);
>       if (ring->funcs->init_cond_exec)
>               patch_offset = amdgpu_ring_init_cond_exec(ring);
>
> @@ -661,6 +662,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>               amdgpu_ring_emit_switch_buffer(ring);
>               amdgpu_ring_emit_switch_buffer(ring);
>       }
> +     amdgpu_ring_ib_end(ring);
>       return 0;
>   }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 669532f658da..1620300f0dde 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -5619,7 +5619,7 @@ static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
>       ring->trail_seq += 1;
>       amdgpu_ring_alloc(ring, 13);
>       gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
> -                              ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
> +                              ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC
> + | AMDGPU_FENCE_FLAG_INT);
>       /*reset the CP_VMID_PREEMPT after trailing fence*/
>       amdgpu_ring_emit_wreg(ring,
>                             SOC15_REG_OFFSET(GC, 0,
> mmCP_VMID_PREEMPT), @@ -6045,9 +6045,11 @@ static int
> gfx_v9_0_eop_irq(struct amdgpu_device *adev,
>
>       switch (me_id) {
>       case 0:
> -             /* Fence signals are handled on the software rings*/
> -             for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
> -                     amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
> +             if (!amdgpu_mcbp_handle_trailing_fence_irq(&adev->gfx.muxer)) {
> +                     /* Fence signals are handled on the software rings*/
> +                     for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
> +                             amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
> +             }
>               break;
>       case 1:
>       case 2:


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)
  2022-09-27  3:18     ` Zhu, Jiadong
@ 2022-09-27  6:06       ` Christian König
  2022-09-27 17:49         ` Michel Dänzer
  0 siblings, 1 reply; 15+ messages in thread
From: Christian König @ 2022-09-27  6:06 UTC (permalink / raw)
  To: amd-gfx, Michel Dänzer
  Cc: Grodzovsky, Andrey, Tuikov, Luben, Zhu, Jiadong

Hey Michel,

Jiadong is working on exposing high/low priority gfx queues for gfx9 and 
older hw generations by using mid-command-buffer preemption.

I know that you have been working on GNOME Mutter to make use of this from 
userspace. Do you have time to run some tests with that?

Thanks,
Christian.

Am 27.09.22 um 05:18 schrieb Zhu, Jiadong:
> [AMD Official Use Only - General]
>
>> I need more time for an in-depth review of this, but from the one-mile-high view it looks correct to me now.
>> Can we do some pre-commit qa testing with this?
> I changed the drm test "Command submission Test (GFX)" to send high-priority IBs while running Manhattan on Screen/Unigine Heaven in the foreground, checking mcbp/resubmit triggered by cat /sys/kernel/debug/dri/0/amdgpu_fence_info
>
> I have continued running this scenario for two days and one night; no hangs have happened yet (lots of hangs were fixed in the previous patches).
>
> I will ask QA team to do more test.
>
> Thanks,
> Jiadong
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Monday, September 26, 2022 2:49 PM
> To: Zhu, Jiadong <Jiadong.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Subject: Re: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)
>
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> Am 23.09.22 um 15:16 schrieb jiadong.zhu@amd.com:
>> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>>
>> Trigger Mid-Command Buffer Preemption according to the priority of the
>> software rings and the hw fence signalling condition.
>>
>> The muxer saves the locations of the indirect buffer frames from the
>> software ring together with the fence sequence number in its fifo
>> queue, and pops out those records when the fences are signalled. The
>> locations are used to resubmit packages in preemption scenarios by coping the chunks from the software ring.
>>
>> v2: Update comment style.
>> v3: Fix conflict caused by previous modifications.
>> v4: Remove unnecessary prints.
>> v5: Fix corner cases for resubmission cases.
>> v6: Refactor functions for resubmission, calling fence_process in irq handler.
>>
>> Cc: Christian Koenig <Christian.Koenig@amd.com>
>> Cc: Luben Tuikov <Luben.Tuikov@amd.com>
>> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
>> Acked-by: Luben Tuikov <luben.tuikov@amd.com>
>> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
> I need more time for an in-depth review of this, but from the one-mile-high view it looks correct to me now.
>
> Can we do some pre-commit qa testing with this?
>
> Thanks,
> Christian.
>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c       |   2 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c     |  13 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   3 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 323 ++++++++++++++++---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  30 ++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c  |  26 ++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c       |   2 +
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c        |  10 +-
>>    8 files changed, 368 insertions(+), 41 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> index 258cffe3c06a..af86d87e2f3b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> @@ -211,6 +211,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>                }
>>        }
>>
>> +     amdgpu_ring_ib_begin(ring);
>>        if (job && ring->funcs->init_cond_exec)
>>                patch_offset = amdgpu_ring_init_cond_exec(ring);
>>
>> @@ -285,6 +286,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>            ring->hw_prio == AMDGPU_GFX_PIPE_PRIO_HIGH)
>>                ring->funcs->emit_wave_limit(ring, false);
>>
>> +     amdgpu_ring_ib_end(ring);
>>        amdgpu_ring_commit(ring);
>>        return 0;
>>    }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index 13db99d653bd..84b0b3c7d40f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -33,6 +33,7 @@
>>
>>    #include <drm/amdgpu_drm.h>
>>    #include "amdgpu.h"
>> +#include "amdgpu_sw_ring.h"
>>    #include "atom.h"
>>
>>    /*
>> @@ -569,3 +570,15 @@ int amdgpu_ring_init_mqd(struct amdgpu_ring
>> *ring)
>>
>>        return mqd_mgr->init_mqd(adev, ring->mqd_ptr, &prop);
>>    }
>> +
>> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring) {
>> +     if (ring->is_sw_ring)
>> +             amdgpu_sw_ring_ib_begin(ring); }
>> +
>> +void amdgpu_ring_ib_end(struct amdgpu_ring *ring) {
>> +     if (ring->is_sw_ring)
>> +             amdgpu_sw_ring_ib_end(ring); }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index e90d327a589e..6fbc1627dab7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -312,6 +312,9 @@ struct amdgpu_ring {
>>    #define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r)
>>
>>    int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
>> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring); void
>> +amdgpu_ring_ib_end(struct amdgpu_ring *ring);
>> +
>>    void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
>>    void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
>>    void amdgpu_ring_commit(struct amdgpu_ring *ring); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>> index 662aadebf111..788567e3b743 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>> @@ -28,23 +28,146 @@
>>
>>    #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
>>
>> +static struct kmem_cache *amdgpu_mux_chunk_slab;
>> +
>> +static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
>> +                                                             struct
>> +amdgpu_ring *ring) {
>> +     return ring->entry_index < mux->ring_entry_size ?
>> +                     &mux->ring_entry[ring->entry_index] : NULL; }
>> +
>> +/* copy packages on sw ring range[begin, end) */ static void
>> +amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
>> +                                               struct amdgpu_ring *ring,
>> +                                               u64 s_start, u64
>> +s_end) {
>> +     u64 start, end;
>> +     struct amdgpu_ring *real_ring = mux->real_ring;
>> +
>> +     start = s_start & ring->buf_mask;
>> +     end = s_end & ring->buf_mask;
>> +
>> +     if (start == end) {
>> +             DRM_ERROR("no more data copied from sw ring\n");
>> +             return;
>> +     }
>> +     if (start > end) {
>> +             amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
>> +             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
>> +                                        (ring->ring_size >> 2) - start);
>> +             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
>> +     } else {
>> +             amdgpu_ring_alloc(real_ring, end - start);
>> +             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
>> +     }
>> +}
>> +
>> +static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux,
>> +bool is_fallback) {
>> +     struct amdgpu_mux_entry *e = NULL;
>> +     struct amdgpu_mux_chunk *chunk;
>> +     uint32_t seq, last_seq;
>> +     int i;
>> +
>> +     if (is_fallback) {
>> +             if (!spin_trylock(&mux->lock)) {
>> +                     amdgpu_ring_mux_schedule_resubmit(mux);
>> +                     DRM_ERROR("reschedule resubmit\n");
>> +                     return;
>> +             }
>> +     } else {
>> +             spin_lock(&mux->lock);
>> +     }
>> +
>> +     /*find low priority entries:*/
>> +     if (!mux->s_resubmit) {
>> +             spin_unlock(&mux->lock);
>> +             return;
>> +     }
>> +
>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>> +             if (mux->ring_entry[i].ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
>> +                     e = &mux->ring_entry[i];
>> +                     break;
>> +             }
>> +     }
>> +
>> +     if (!e) {
>> +             DRM_ERROR("%s no low priority ring found\n", __func__);
>> +             spin_unlock(&mux->lock);
>> +             return;
>> +     }
>> +
>> +     last_seq = atomic_read(&e->ring->fence_drv.last_seq);
>> +     seq = mux->seqno_to_resubmit;
>> +     if (last_seq < seq) {
>> +             /*resubmit all the fences between (last_seq, seq]*/
>> +             list_for_each_entry(chunk, &e->list, entry) {
>> +                     if (chunk->sync_seq > last_seq && chunk->sync_seq <= seq) {
>> +                             amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, e->ring,
>> +                                                                   chunk->start,
>> +                                                                   chunk->end);
>> +                             mux->wptr_resubmit = chunk->end;
>> +                             amdgpu_ring_commit(mux->real_ring);
>> +                     }
>> +             }
>> +     }
>> +
>> +     del_timer(&mux->resubmit_timer);
>> +     mux->s_resubmit = false;
>> +     spin_unlock(&mux->lock);
>> +}
>> +
>> +static void amdgpu_mux_resubmit_fallback(struct timer_list *t) {
>> +     struct amdgpu_ring_mux *mux = from_timer(mux, t,
>> +resubmit_timer);
>> +
>> +     DRM_INFO("calling %s\n", __func__);
>> +     amdgpu_mux_resubmit_chunks(mux, true); }
>> +
>>    int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
>>                         unsigned int entry_size)
>>    {
>>        mux->real_ring = ring;
>>        mux->num_ring_entries = 0;
>> +
>>        mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
>>        if (!mux->ring_entry)
>>                return -ENOMEM;
>>
>>        mux->ring_entry_size = entry_size;
>> +     mux->s_resubmit = false;
>> +
>> +     amdgpu_mux_chunk_slab = kmem_cache_create("amdgpu_mux_chunk",
>> +                                               sizeof(struct amdgpu_mux_chunk), 0,
>> +                                               SLAB_HWCACHE_ALIGN, NULL);
>> +     if (!amdgpu_mux_chunk_slab) {
>> +             DRM_ERROR("create amdgpu_mux_chunk cache failed\n");
>> +             return -ENOMEM;
>> +     }
>> +
>>        spin_lock_init(&mux->lock);
>> +     timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback,
>> + 0);
>>
>>        return 0;
>>    }
>>
>>    void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
>>    {
>> +     struct amdgpu_mux_entry *e;
>> +     struct amdgpu_mux_chunk *chunk, *chunk2;
>> +     int i;
>> +
>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>> +             e = &mux->ring_entry[i];
>> +             list_for_each_entry_safe(chunk, chunk2, &e->list, entry) {
>> +                     list_del(&chunk->entry);
>> +                     kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
>> +             }
>> +     }
>> +     kmem_cache_destroy(amdgpu_mux_chunk_slab);
>>        kfree(mux->ring_entry);
>>        mux->ring_entry = NULL;
>>        mux->num_ring_entries = 0;
>> @@ -64,62 +187,46 @@ int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring
>>        ring->entry_index = mux->num_ring_entries;
>>        e->ring = ring;
>>
>> +     INIT_LIST_HEAD(&e->list);
>>        mux->num_ring_entries += 1;
>>        return 0;
>>    }
>>
>> -static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
>> -                                                             struct amdgpu_ring *ring)
>> -{
>> -     return ring->entry_index < mux->ring_entry_size ?
>> -                     &mux->ring_entry[ring->entry_index] : NULL;
>> -}
>> -
>> -/* copy packages on sw ring range[begin, end) */ -static void
>> amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
>> -                                               struct amdgpu_ring *ring,
>> -                                               u64 s_start, u64 s_end)
>> -{
>> -     u64 start, end;
>> -     struct amdgpu_ring *real_ring = mux->real_ring;
>> -
>> -     start = s_start & ring->buf_mask;
>> -     end = s_end & ring->buf_mask;
>> -
>> -     if (start == end) {
>> -             DRM_ERROR("no more data copied from sw ring\n");
>> -             return;
>> -     }
>> -     if (start > end) {
>> -             amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
>> -             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
>> -                                        (ring->ring_size >> 2) - start);
>> -             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
>> -     } else {
>> -             amdgpu_ring_alloc(real_ring, end - start);
>> -             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
>> -     }
>> -}
>> -
>>    void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
>>    {
>>        struct amdgpu_mux_entry *e;
>>
>> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
>> +             amdgpu_mux_resubmit_chunks(mux, false);
>> +
>>        e = amdgpu_ring_mux_sw_entry(mux, ring);
>>        if (!e) {
>>                DRM_ERROR("cannot find entry for sw ring\n");
>>                return;
>>        }
>>
>> +     /* We could skip this set wptr as preemption in process. */
>> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && mux->pending_trailing_fence_signaled) {
>> +             DRM_ERROR("amdgpu_ring_mux_set_wptr skipped\n");
>> +             return;
>> +     }
>> +
>>        spin_lock(&mux->lock);
>>        e->sw_cptr = e->sw_wptr;
>> +     /* Update cptr if the package already copied in resubmit functions */
>> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && e->sw_cptr < mux->wptr_resubmit)
>> +             e->sw_cptr = mux->wptr_resubmit;
>>        e->sw_wptr = wptr;
>>        e->start_ptr_in_hw_ring = mux->real_ring->wptr;
>>
>> -     amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
>> -     e->end_ptr_in_hw_ring = mux->real_ring->wptr;
>> -     amdgpu_ring_commit(mux->real_ring);
>> -
>> +     /* Skip copying for the packages already resubmitted.*/
>> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT || mux->wptr_resubmit < wptr) {
>> +             amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
>> +             e->end_ptr_in_hw_ring = mux->real_ring->wptr;
>> +             amdgpu_ring_commit(mux->real_ring);
>> +     } else {
>> +             e->end_ptr_in_hw_ring = mux->real_ring->wptr;
>> +     }
>>        spin_unlock(&mux->lock);
>>    }
>>
>> @@ -181,3 +288,145 @@ u64 amdgpu_ring_mux_get_rptr(struct
>> amdgpu_ring_mux *mux, struct amdgpu_ring *ri
>>
>>        return e->sw_rptr;
>>    }
>> +
>> +void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux) {
>> +     mod_timer(&mux->resubmit_timer, jiffies +
>> +AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT);
>> +}
>> +
>> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct
>> +amdgpu_ring *ring) {
>> +     struct amdgpu_mux_entry *e;
>> +     struct amdgpu_mux_chunk *chunk;
>> +
>> +     amdgpu_mux_resubmit_chunks(mux, false);
>> +
>> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
>> +     if (!e) {
>> +             DRM_ERROR("cannot find entry!\n");
>> +             return;
>> +     }
>> +
>> +     chunk = kmem_cache_alloc(amdgpu_mux_chunk_slab, GFP_KERNEL);
>> +     if (!chunk) {
>> +             DRM_ERROR("alloc amdgpu_mux_chunk_slab failed\n");
>> +             return;
>> +     }
>> +
>> +     chunk->start = ring->wptr;
>> +     list_add_tail(&chunk->entry, &e->list); }
>> +
>> +static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux
>> +*mux, struct amdgpu_ring *ring) {
>> +     uint32_t last_seq, size = 0;
>> +     struct amdgpu_mux_entry *e;
>> +     struct amdgpu_mux_chunk *chunk, *tmp;
>> +
>> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
>> +     if (!e) {
>> +             DRM_ERROR("cannot find entry!\n");
>> +             return;
>> +     }
>> +
>> +     last_seq = atomic_read(&ring->fence_drv.last_seq);
>> +
>> +     list_for_each_entry_safe(chunk, tmp, &e->list, entry) {
>> +             if (chunk->sync_seq <= last_seq) {
>> +                     list_del(&chunk->entry);
>> +                     kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
>> +             } else {
>> +                     size++;
>> +             }
>> +     }
>> +}
>> +
>> +void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct
>> +amdgpu_ring *ring) {
>> +     struct amdgpu_mux_entry *e;
>> +     struct amdgpu_mux_chunk *chunk;
>> +
>> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
>> +     if (!e) {
>> +             DRM_ERROR("cannot find entry!\n");
>> +             return;
>> +     }
>> +
>> +     chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);
>> +     if (!chunk) {
>> +             DRM_ERROR("cannot find chunk!\n");
>> +             return;
>> +     }
>> +
>> +     chunk->end = ring->wptr;
>> +     chunk->sync_seq = READ_ONCE(ring->fence_drv.sync_seq);
>> +
>> +     scan_and_remove_signaled_chunk(mux, ring); }
>> +
>> +/* Trigger Mid-Command Buffer Preemption (MCBP) and find if we need
>> +to resubmit. */ int amdgpu_mcbp_trigger_preempt(struct
>> +amdgpu_ring_mux *mux) {
>> +     int r;
>> +
>> +     spin_lock(&mux->lock);
>> +     mux->pending_trailing_fence_signaled = true;
>> +     r = amdgpu_ring_preempt_ib(mux->real_ring);
>> +     spin_unlock(&mux->lock);
>> +     return r;
>> +}
>> +
>> +bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux
>> +*mux) {
>> +     struct amdgpu_mux_entry *e;
>> +     struct amdgpu_ring *ring = NULL;
>> +     int i;
>> +
>> +     if (!mux->pending_trailing_fence_signaled)
>> +             return false;
>> +
>> +     if (mux->real_ring->trail_seq != le32_to_cpu(*mux->real_ring->trail_fence_cpu_addr))
>> +             return false;
>> +
>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>> +             e = &mux->ring_entry[i];
>> +             if (e->ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
>> +                     ring = e->ring;
>> +                     break;
>> +             }
>> +     }
>> +
>> +     if (!ring) {
>> +             DRM_ERROR("cannot find low priority ring\n");
>> +             return false;
>> +     }
>> +
>> +     amdgpu_fence_process(ring);
>> +     if (amdgpu_fence_count_emitted(ring) > 0) {
>> +             mux->s_resubmit = true;
>> +             mux->seqno_to_resubmit = ring->fence_drv.sync_seq;
>> +             amdgpu_ring_mux_schedule_resubmit(mux);
>> +     }
>> +
>> +     mux->pending_trailing_fence_signaled = false;
>> +     return true;
>> +}
>> +
>> +/*scan on low prio rings to have unsignaled fence and high ring has
>> +no fence.*/ int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux) {
>> +     struct amdgpu_ring *ring;
>> +     int i, need_preempt;
>> +
>> +     need_preempt = 0;
>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>> +             ring = mux->ring_entry[i].ring;
>> +             if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT &&
>> +                     amdgpu_fence_count_emitted(ring) > 0)
>> +                     return 0;
>> +             if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT &&
>> +                     amdgpu_fence_count_emitted(ring) > 0)
>> +                     need_preempt = 1;
>> +     }
>> +     return need_preempt && !mux->s_resubmit; }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>> index 8c1691e11b1c..bf8f5ca61605 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>> @@ -29,6 +29,7 @@
>>    #include "amdgpu_ring.h"
>>
>>    struct amdgpu_ring;
>> +
>>    /**
>>     * struct amdgpu_mux_entry - the entry recording software rings copying information.
>>     * @ring: the pointer to the software ring.
>> @@ -37,6 +38,7 @@ struct amdgpu_ring;
>>     * @sw_cptr: the position of the copy pointer in the sw ring.
>>     * @sw_rptr: the read pointer in software ring.
>>     * @sw_wptr: the write pointer in software ring.
>> + * @list: list head for amdgpu_mux_chunk
>>     */
>>    struct amdgpu_mux_entry {
>>        struct                  amdgpu_ring *ring;
>> @@ -45,6 +47,7 @@ struct amdgpu_mux_entry {
>>        u64                     sw_cptr;
>>        u64                     sw_rptr;
>>        u64                     sw_wptr;
>> +     struct list_head        list;
>>    };
>>
>>    struct amdgpu_ring_mux {
>> @@ -55,6 +58,26 @@ struct amdgpu_ring_mux {
>>        unsigned int            ring_entry_size;
>>        /*the lock for copy data from different software rings*/
>>        spinlock_t              lock;
>> +     bool                    s_resubmit;
>> +     uint32_t                seqno_to_resubmit;
>> +     u64                     wptr_resubmit;
>> +     struct timer_list       resubmit_timer;
>> +
>> +     bool                    pending_trailing_fence_signaled;
>> +};
>> +
>> +/**
>> + * struct amdgpu_mux_chunk - save the location of indirect buffer's package on softare rings.
>> + * @entry: the list entry.
>> + * @sync_seq: the fence seqno related with the saved IB.
>> + * @start:- start location on the software ring.
>> + * @end:- end location on the software ring.
>> + */
>> +struct amdgpu_mux_chunk {
>> +     struct list_head        entry;
>> +     uint32_t                sync_seq;
>> +     u64                     start;
>> +     u64                     end;
>>    };
>>
>>    int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct
>> amdgpu_ring *ring, @@ -65,4 +88,11 @@ void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
>>    u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
>>    u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct
>> amdgpu_ring *ring);
>>
>> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct
>> +amdgpu_ring *ring); void amdgpu_ring_mux_end_ib(struct
>> +amdgpu_ring_mux *mux, struct amdgpu_ring *ring); void
>> +amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux);
>> +
>> +int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux); int
>> +amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux); bool
>> +amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux);
>>    #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>> index 5ae12d6641ca..a3ec7bdf72a6 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>> @@ -59,3 +59,29 @@ void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
>>    {
>>        WARN_ON(!ring->is_sw_ring);
>>    }
>> +
>> +void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring) {
>> +     struct amdgpu_device *adev = ring->adev;
>> +     struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
>> +
>> +     WARN_ON(!ring->is_sw_ring);
>> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
>> +             if (amdgpu_mcbp_scan(mux) > 0)
>> +                     amdgpu_mcbp_trigger_preempt(mux);
>> +             return;
>> +     }
>> +
>> +     amdgpu_ring_mux_start_ib(mux, ring); }
>> +
>> +void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring) {
>> +     struct amdgpu_device *adev = ring->adev;
>> +     struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
>> +
>> +     WARN_ON(!ring->is_sw_ring);
>> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
>> +             return;
>> +     amdgpu_ring_mux_end_ib(mux, ring); }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 9596c22fded6..b7e94553f4fb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -601,6 +601,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>>        if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>>                return 0;
>>
>> +     amdgpu_ring_ib_begin(ring);
>>        if (ring->funcs->init_cond_exec)
>>                patch_offset = amdgpu_ring_init_cond_exec(ring);
>>
>> @@ -661,6 +662,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>>                amdgpu_ring_emit_switch_buffer(ring);
>>                amdgpu_ring_emit_switch_buffer(ring);
>>        }
>> +     amdgpu_ring_ib_end(ring);
>>        return 0;
>>    }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 669532f658da..1620300f0dde 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -5619,7 +5619,7 @@ static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
>>        ring->trail_seq += 1;
>>        amdgpu_ring_alloc(ring, 13);
>>        gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
>> -                              ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
>> +                              ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC
>> + | AMDGPU_FENCE_FLAG_INT);
>>        /*reset the CP_VMID_PREEMPT after trailing fence*/
>>        amdgpu_ring_emit_wreg(ring,
>>                              SOC15_REG_OFFSET(GC, 0,
>> mmCP_VMID_PREEMPT), @@ -6045,9 +6045,11 @@ static int
>> gfx_v9_0_eop_irq(struct amdgpu_device *adev,
>>
>>        switch (me_id) {
>>        case 0:
>> -             /* Fence signals are handled on the software rings*/
>> -             for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
>> -                     amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
>> +             if (!amdgpu_mcbp_handle_trailing_fence_irq(&adev->gfx.muxer)) {
>> +                     /* Fence signals are handled on the software rings*/
>> +                     for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
>> +                             amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
>> +             }
>>                break;
>>        case 1:
>>        case 2:


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)
  2022-09-27  6:06       ` Christian König
@ 2022-09-27 17:49         ` Michel Dänzer
  2022-09-27 18:20           ` Christian König
  0 siblings, 1 reply; 15+ messages in thread
From: Michel Dänzer @ 2022-09-27 17:49 UTC (permalink / raw)
  To: Christian König
  Cc: Grodzovsky, Andrey, Tuikov, Luben, Zhu, Jiadong, amd-gfx

On 2022-09-27 08:06, Christian König wrote:
> Hey Michel,
> 
> Jiadong is working on exposing high/low priority gfx queues for gfx9 and older hw generations by using mid command buffer preemption.

Yeah, I've been keeping an eye on these patches. I'm looking forward to this working.


> I know that you have been working on Gnome Mutter to make use from userspace for this. Do you have time to run some tests with that?

I just tested the v8 series (first without amdgpu.mcbp=1 on the kernel command line, then with it, since I wasn't sure if it's needed) with https://gitlab.gnome.org/GNOME/mutter/-/merge_requests/1880 on Navi 14.

Unfortunately, I'm not seeing any change in behaviour. Even though mutter uses a high priority context via the EGL_IMG_context_priority extension, it's unable to reach a higher frame rate than a GPU-limited client[0]. The "Last preempted" line of /sys/kernel/debug/dri/0/amdgpu_fence_info remains at 0x00000000.

Did I miss a step?


[0] I used the GpuTest pixmark piano & plot3d benchmarks. With an Intel iGPU, mutter can achieve a higher frame rate than plot3d, though not than pixmark piano (presumably due to limited GPU preemption granularity).

> Am 27.09.22 um 05:18 schrieb Zhu, Jiadong:
>> [AMD Official Use Only - General]
>>
>>> I need more time for an in-depth review of this, but from the one-mile-high view it looks correct to me now.
>>> Can we do some pre-commit qa testing with this?
>> I changed the drm test "Command submission Test (GFX)" to send high priority ibs while running Manhattan on Screen/Unigine Heaven in the foreground, checking that mcbp/resubmit is triggered via cat /sys/kernel/debug/dri/0/amdgpu_fence_info
>>
>> I have continued running this scenario for 2 daytimes and 1 night; no hangs have happened yet (lots of hangs have been fixed in the previous patches).
>>
>> I will ask QA team to do more test.
>>
>> Thanks,
>> Jiadong
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Monday, September 26, 2022 2:49 PM
>> To: Zhu, Jiadong <Jiadong.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> Subject: Re: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)
>>
>> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>>
>>
>> Am 23.09.22 um 15:16 schrieb jiadong.zhu@amd.com:
>>> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>>>
>>> Trigger Mid-Command Buffer Preemption according to the priority of the
>>> software rings and the hw fence signalling condition.
>>>
>>> The muxer saves the locations of the indirect buffer frames from the
>>> software ring together with the fence sequence number in its fifo
>>> queue, and pops out those records when the fences are signalled. The
>>> locations are used to resubmit packages in preemption scenarios by coping the chunks from the software ring.
>>>
>>> v2: Update comment style.
>>> v3: Fix conflict caused by previous modifications.
>>> v4: Remove unnecessary prints.
>>> v5: Fix corner cases for resubmission cases.
>>> v6: Refactor functions for resubmission, calling fence_process in irq handler.
>>>
>>> Cc: Christian Koenig <Christian.Koenig@amd.com>
>>> Cc: Luben Tuikov <Luben.Tuikov@amd.com>
>>> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
>>> Acked-by: Luben Tuikov <luben.tuikov@amd.com>
>>> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
>> I need more time for an in-depth review of this, but from the one-mile-high view it looks correct to me now.
>>
>> Can we do some pre-commit qa testing with this?
>>
>> Thanks,
>> Christian.
>>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c       |   2 +
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c     |  13 +
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   3 +
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 323 ++++++++++++++++---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  30 ++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c  |  26 ++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c       |   2 +
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c        |  10 +-
>>>    8 files changed, 368 insertions(+), 41 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> index 258cffe3c06a..af86d87e2f3b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> @@ -211,6 +211,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>                }
>>>        }
>>>
>>> +     amdgpu_ring_ib_begin(ring);
>>>        if (job && ring->funcs->init_cond_exec)
>>>                patch_offset = amdgpu_ring_init_cond_exec(ring);
>>>
>>> @@ -285,6 +286,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>            ring->hw_prio == AMDGPU_GFX_PIPE_PRIO_HIGH)
>>>                ring->funcs->emit_wave_limit(ring, false);
>>>
>>> +     amdgpu_ring_ib_end(ring);
>>>        amdgpu_ring_commit(ring);
>>>        return 0;
>>>    }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> index 13db99d653bd..84b0b3c7d40f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> @@ -33,6 +33,7 @@
>>>
>>>    #include <drm/amdgpu_drm.h>
>>>    #include "amdgpu.h"
>>> +#include "amdgpu_sw_ring.h"
>>>    #include "atom.h"
>>>
>>>    /*
>>> @@ -569,3 +570,15 @@ int amdgpu_ring_init_mqd(struct amdgpu_ring
>>> *ring)
>>>
>>>        return mqd_mgr->init_mqd(adev, ring->mqd_ptr, &prop);
>>>    }
>>> +
>>> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring) {
>>> +     if (ring->is_sw_ring)
>>> +             amdgpu_sw_ring_ib_begin(ring); }
>>> +
>>> +void amdgpu_ring_ib_end(struct amdgpu_ring *ring) {
>>> +     if (ring->is_sw_ring)
>>> +             amdgpu_sw_ring_ib_end(ring); }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index e90d327a589e..6fbc1627dab7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -312,6 +312,9 @@ struct amdgpu_ring {
>>>    #define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r)
>>>
>>>    int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
>>> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring); void
>>> +amdgpu_ring_ib_end(struct amdgpu_ring *ring);
>>> +
>>>    void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
>>>    void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
>>>    void amdgpu_ring_commit(struct amdgpu_ring *ring); diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>>> index 662aadebf111..788567e3b743 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>>> @@ -28,23 +28,146 @@
>>>
>>>    #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
>>>
>>> +static struct kmem_cache *amdgpu_mux_chunk_slab;
>>> +
>>> +static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
>>> +                                                             struct
>>> +amdgpu_ring *ring) {
>>> +     return ring->entry_index < mux->ring_entry_size ?
>>> +                     &mux->ring_entry[ring->entry_index] : NULL; }
>>> +
>>> +/* copy packages on sw ring range[begin, end) */ static void
>>> +amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
>>> +                                               struct amdgpu_ring *ring,
>>> +                                               u64 s_start, u64
>>> +s_end) {
>>> +     u64 start, end;
>>> +     struct amdgpu_ring *real_ring = mux->real_ring;
>>> +
>>> +     start = s_start & ring->buf_mask;
>>> +     end = s_end & ring->buf_mask;
>>> +
>>> +     if (start == end) {
>>> +             DRM_ERROR("no more data copied from sw ring\n");
>>> +             return;
>>> +     }
>>> +     if (start > end) {
>>> +             amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
>>> +             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
>>> +                                        (ring->ring_size >> 2) - start);
>>> +             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
>>> +     } else {
>>> +             amdgpu_ring_alloc(real_ring, end - start);
>>> +             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
>>> +     }
>>> +}
>>> +
>>> +static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux,
>>> +bool is_fallback) {
>>> +     struct amdgpu_mux_entry *e = NULL;
>>> +     struct amdgpu_mux_chunk *chunk;
>>> +     uint32_t seq, last_seq;
>>> +     int i;
>>> +
>>> +     if (is_fallback) {
>>> +             if (!spin_trylock(&mux->lock)) {
>>> +                     amdgpu_ring_mux_schedule_resubmit(mux);
>>> +                     DRM_ERROR("reschedule resubmit\n");
>>> +                     return;
>>> +             }
>>> +     } else {
>>> +             spin_lock(&mux->lock);
>>> +     }
>>> +
>>> +     /*find low priority entries:*/
>>> +     if (!mux->s_resubmit) {
>>> +             spin_unlock(&mux->lock);
>>> +             return;
>>> +     }
>>> +
>>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>>> +             if (mux->ring_entry[i].ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
>>> +                     e = &mux->ring_entry[i];
>>> +                     break;
>>> +             }
>>> +     }
>>> +
>>> +     if (!e) {
>>> +             DRM_ERROR("%s no low priority ring found\n", __func__);
>>> +             spin_unlock(&mux->lock);
>>> +             return;
>>> +     }
>>> +
>>> +     last_seq = atomic_read(&e->ring->fence_drv.last_seq);
>>> +     seq = mux->seqno_to_resubmit;
>>> +     if (last_seq < seq) {
>>> +             /*resubmit all the fences between (last_seq, seq]*/
>>> +             list_for_each_entry(chunk, &e->list, entry) {
>>> +                     if (chunk->sync_seq > last_seq && chunk->sync_seq <= seq) {
>>> +                             amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, e->ring,
>>> +                                                                   chunk->start,
>>> +                                                                   chunk->end);
>>> +                             mux->wptr_resubmit = chunk->end;
>>> +                             amdgpu_ring_commit(mux->real_ring);
>>> +                     }
>>> +             }
>>> +     }
>>> +
>>> +     del_timer(&mux->resubmit_timer);
>>> +     mux->s_resubmit = false;
>>> +     spin_unlock(&mux->lock);
>>> +}
>>> +
>>> +static void amdgpu_mux_resubmit_fallback(struct timer_list *t) {
>>> +     struct amdgpu_ring_mux *mux = from_timer(mux, t,
>>> +resubmit_timer);
>>> +
>>> +     DRM_INFO("calling %s\n", __func__);
>>> +     amdgpu_mux_resubmit_chunks(mux, true); }
>>> +
>>>    int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
>>>                         unsigned int entry_size)
>>>    {
>>>        mux->real_ring = ring;
>>>        mux->num_ring_entries = 0;
>>> +
>>>        mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
>>>        if (!mux->ring_entry)
>>>                return -ENOMEM;
>>>
>>>        mux->ring_entry_size = entry_size;
>>> +     mux->s_resubmit = false;
>>> +
>>> +     amdgpu_mux_chunk_slab = kmem_cache_create("amdgpu_mux_chunk",
>>> +                                               sizeof(struct amdgpu_mux_chunk), 0,
>>> +                                               SLAB_HWCACHE_ALIGN, NULL);
>>> +     if (!amdgpu_mux_chunk_slab) {
>>> +             DRM_ERROR("create amdgpu_mux_chunk cache failed\n");
>>> +             return -ENOMEM;
>>> +     }
>>> +
>>>        spin_lock_init(&mux->lock);
>>> +     timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback,
>>> + 0);
>>>
>>>        return 0;
>>>    }
>>>
>>>    void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
>>>    {
>>> +     struct amdgpu_mux_entry *e;
>>> +     struct amdgpu_mux_chunk *chunk, *chunk2;
>>> +     int i;
>>> +
>>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>>> +             e = &mux->ring_entry[i];
>>> +             list_for_each_entry_safe(chunk, chunk2, &e->list, entry) {
>>> +                     list_del(&chunk->entry);
>>> +                     kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
>>> +             }
>>> +     }
>>> +     kmem_cache_destroy(amdgpu_mux_chunk_slab);
>>>        kfree(mux->ring_entry);
>>>        mux->ring_entry = NULL;
>>>        mux->num_ring_entries = 0;
>>> @@ -64,62 +187,46 @@ int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring
>>>        ring->entry_index = mux->num_ring_entries;
>>>        e->ring = ring;
>>>
>>> +     INIT_LIST_HEAD(&e->list);
>>>        mux->num_ring_entries += 1;
>>>        return 0;
>>>    }
>>>
>>> -static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
>>> -                                                             struct amdgpu_ring *ring)
>>> -{
>>> -     return ring->entry_index < mux->ring_entry_size ?
>>> -                     &mux->ring_entry[ring->entry_index] : NULL;
>>> -}
>>> -
>>> -/* copy packages on sw ring range[begin, end) */ -static void
>>> amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
>>> -                                               struct amdgpu_ring *ring,
>>> -                                               u64 s_start, u64 s_end)
>>> -{
>>> -     u64 start, end;
>>> -     struct amdgpu_ring *real_ring = mux->real_ring;
>>> -
>>> -     start = s_start & ring->buf_mask;
>>> -     end = s_end & ring->buf_mask;
>>> -
>>> -     if (start == end) {
>>> -             DRM_ERROR("no more data copied from sw ring\n");
>>> -             return;
>>> -     }
>>> -     if (start > end) {
>>> -             amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
>>> -             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
>>> -                                        (ring->ring_size >> 2) - start);
>>> -             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
>>> -     } else {
>>> -             amdgpu_ring_alloc(real_ring, end - start);
>>> -             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
>>> -     }
>>> -}
>>> -
>>>    void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
>>>    {
>>>        struct amdgpu_mux_entry *e;
>>>
>>> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
>>> +             amdgpu_mux_resubmit_chunks(mux, false);
>>> +
>>>        e = amdgpu_ring_mux_sw_entry(mux, ring);
>>>        if (!e) {
>>>                DRM_ERROR("cannot find entry for sw ring\n");
>>>                return;
>>>        }
>>>
>>> +     /* We could skip this set wptr as preemption in process. */
>>> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && mux->pending_trailing_fence_signaled) {
>>> +             DRM_ERROR("amdgpu_ring_mux_set_wptr skipped\n");
>>> +             return;
>>> +     }
>>> +
>>>        spin_lock(&mux->lock);
>>>        e->sw_cptr = e->sw_wptr;
>>> +     /* Update cptr if the package already copied in resubmit functions */
>>> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && e->sw_cptr < mux->wptr_resubmit)
>>> +             e->sw_cptr = mux->wptr_resubmit;
>>>        e->sw_wptr = wptr;
>>>        e->start_ptr_in_hw_ring = mux->real_ring->wptr;
>>>
>>> -     amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
>>> -     e->end_ptr_in_hw_ring = mux->real_ring->wptr;
>>> -     amdgpu_ring_commit(mux->real_ring);
>>> -
>>> +     /* Skip copying for the packages already resubmitted.*/
>>> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT || mux->wptr_resubmit < wptr) {
>>> +             amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
>>> +             e->end_ptr_in_hw_ring = mux->real_ring->wptr;
>>> +             amdgpu_ring_commit(mux->real_ring);
>>> +     } else {
>>> +             e->end_ptr_in_hw_ring = mux->real_ring->wptr;
>>> +     }
>>>        spin_unlock(&mux->lock);
>>>    }
>>>
>>> @@ -181,3 +288,145 @@ u64 amdgpu_ring_mux_get_rptr(struct
>>> amdgpu_ring_mux *mux, struct amdgpu_ring *ri
>>>
>>>        return e->sw_rptr;
>>>    }
>>> +
>>> +void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux) {
>>> +     mod_timer(&mux->resubmit_timer, jiffies +
>>> +AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT);
>>> +}
>>> +
>>> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct
>>> +amdgpu_ring *ring) {
>>> +     struct amdgpu_mux_entry *e;
>>> +     struct amdgpu_mux_chunk *chunk;
>>> +
>>> +     amdgpu_mux_resubmit_chunks(mux, false);
>>> +
>>> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
>>> +     if (!e) {
>>> +             DRM_ERROR("cannot find entry!\n");
>>> +             return;
>>> +     }
>>> +
>>> +     chunk = kmem_cache_alloc(amdgpu_mux_chunk_slab, GFP_KERNEL);
>>> +     if (!chunk) {
>>> +             DRM_ERROR("alloc amdgpu_mux_chunk_slab failed\n");
>>> +             return;
>>> +     }
>>> +
>>> +     chunk->start = ring->wptr;
>>> +     list_add_tail(&chunk->entry, &e->list); }
>>> +
>>> +static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux
>>> +*mux, struct amdgpu_ring *ring) {
>>> +     uint32_t last_seq, size = 0;
>>> +     struct amdgpu_mux_entry *e;
>>> +     struct amdgpu_mux_chunk *chunk, *tmp;
>>> +
>>> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
>>> +     if (!e) {
>>> +             DRM_ERROR("cannot find entry!\n");
>>> +             return;
>>> +     }
>>> +
>>> +     last_seq = atomic_read(&ring->fence_drv.last_seq);
>>> +
>>> +     list_for_each_entry_safe(chunk, tmp, &e->list, entry) {
>>> +             if (chunk->sync_seq <= last_seq) {
>>> +                     list_del(&chunk->entry);
>>> +                     kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
>>> +             } else {
>>> +                     size++;
>>> +             }
>>> +     }
>>> +}
>>> +
>>> +void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct
>>> +amdgpu_ring *ring) {
>>> +     struct amdgpu_mux_entry *e;
>>> +     struct amdgpu_mux_chunk *chunk;
>>> +
>>> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
>>> +     if (!e) {
>>> +             DRM_ERROR("cannot find entry!\n");
>>> +             return;
>>> +     }
>>> +
>>> +     chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);
>>> +     if (!chunk) {
>>> +             DRM_ERROR("cannot find chunk!\n");
>>> +             return;
>>> +     }
>>> +
>>> +     chunk->end = ring->wptr;
>>> +     chunk->sync_seq = READ_ONCE(ring->fence_drv.sync_seq);
>>> +
>>> +     scan_and_remove_signaled_chunk(mux, ring); }
>>> +
>>> +/* Trigger Mid-Command Buffer Preemption (MCBP) and find if we need
>>> +to resubmit. */ int amdgpu_mcbp_trigger_preempt(struct
>>> +amdgpu_ring_mux *mux) {
>>> +     int r;
>>> +
>>> +     spin_lock(&mux->lock);
>>> +     mux->pending_trailing_fence_signaled = true;
>>> +     r = amdgpu_ring_preempt_ib(mux->real_ring);
>>> +     spin_unlock(&mux->lock);
>>> +     return r;
>>> +}
>>> +
>>> +bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux
>>> +*mux) {
>>> +     struct amdgpu_mux_entry *e;
>>> +     struct amdgpu_ring *ring = NULL;
>>> +     int i;
>>> +
>>> +     if (!mux->pending_trailing_fence_signaled)
>>> +             return false;
>>> +
>>> +     if (mux->real_ring->trail_seq != le32_to_cpu(*mux->real_ring->trail_fence_cpu_addr))
>>> +             return false;
>>> +
>>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>>> +             e = &mux->ring_entry[i];
>>> +             if (e->ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
>>> +                     ring = e->ring;
>>> +                     break;
>>> +             }
>>> +     }
>>> +
>>> +     if (!ring) {
>>> +             DRM_ERROR("cannot find low priority ring\n");
>>> +             return false;
>>> +     }
>>> +
>>> +     amdgpu_fence_process(ring);
>>> +     if (amdgpu_fence_count_emitted(ring) > 0) {
>>> +             mux->s_resubmit = true;
>>> +             mux->seqno_to_resubmit = ring->fence_drv.sync_seq;
>>> +             amdgpu_ring_mux_schedule_resubmit(mux);
>>> +     }
>>> +
>>> +     mux->pending_trailing_fence_signaled = false;
>>> +     return true;
>>> +}
>>> +
>>> +/*scan on low prio rings to have unsignaled fence and high ring has
>>> +no fence.*/ int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux) {
>>> +     struct amdgpu_ring *ring;
>>> +     int i, need_preempt;
>>> +
>>> +     need_preempt = 0;
>>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>>> +             ring = mux->ring_entry[i].ring;
>>> +             if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT &&
>>> +                     amdgpu_fence_count_emitted(ring) > 0)
>>> +                     return 0;
>>> +             if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT &&
>>> +                     amdgpu_fence_count_emitted(ring) > 0)
>>> +                     need_preempt = 1;
>>> +     }
>>> +     return need_preempt && !mux->s_resubmit; }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>>> index 8c1691e11b1c..bf8f5ca61605 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>>> @@ -29,6 +29,7 @@
>>>    #include "amdgpu_ring.h"
>>>
>>>    struct amdgpu_ring;
>>> +
>>>    /**
>>>     * struct amdgpu_mux_entry - the entry recording software rings copying information.
>>>     * @ring: the pointer to the software ring.
>>> @@ -37,6 +38,7 @@ struct amdgpu_ring;
>>>     * @sw_cptr: the position of the copy pointer in the sw ring.
>>>     * @sw_rptr: the read pointer in software ring.
>>>     * @sw_wptr: the write pointer in software ring.
>>> + * @list: list head for amdgpu_mux_chunk
>>>     */
>>>    struct amdgpu_mux_entry {
>>>        struct                  amdgpu_ring *ring;
>>> @@ -45,6 +47,7 @@ struct amdgpu_mux_entry {
>>>        u64                     sw_cptr;
>>>        u64                     sw_rptr;
>>>        u64                     sw_wptr;
>>> +     struct list_head        list;
>>>    };
>>>
>>>    struct amdgpu_ring_mux {
>>> @@ -55,6 +58,26 @@ struct amdgpu_ring_mux {
>>>        unsigned int            ring_entry_size;
>>>        /*the lock for copy data from different software rings*/
>>>        spinlock_t              lock;
>>> +     bool                    s_resubmit;
>>> +     uint32_t                seqno_to_resubmit;
>>> +     u64                     wptr_resubmit;
>>> +     struct timer_list       resubmit_timer;
>>> +
>>> +     bool                    pending_trailing_fence_signaled;
>>> +};
>>> +
>>> +/**
>>> + * struct amdgpu_mux_chunk - save the location of indirect buffer's package on software rings.
>>> + * @entry: the list entry.
>>> + * @sync_seq: the fence seqno related with the saved IB.
>>> + * @start:- start location on the software ring.
>>> + * @end:- end location on the software ring.
>>> + */
>>> +struct amdgpu_mux_chunk {
>>> +     struct list_head        entry;
>>> +     uint32_t                sync_seq;
>>> +     u64                     start;
>>> +     u64                     end;
>>>    };
>>>
>>>    int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct
>>> amdgpu_ring *ring, @@ -65,4 +88,11 @@ void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
>>>    u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
>>>    u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct
>>> amdgpu_ring *ring);
>>>
>>> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct
>>> +amdgpu_ring *ring); void amdgpu_ring_mux_end_ib(struct
>>> +amdgpu_ring_mux *mux, struct amdgpu_ring *ring); void
>>> +amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux);
>>> +
>>> +int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux); int
>>> +amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux); bool
>>> +amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux);
>>>    #endif
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>>> index 5ae12d6641ca..a3ec7bdf72a6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>>> @@ -59,3 +59,29 @@ void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
>>>    {
>>>        WARN_ON(!ring->is_sw_ring);
>>>    }
>>> +
>>> +void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring) {
>>> +     struct amdgpu_device *adev = ring->adev;
>>> +     struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
>>> +
>>> +     WARN_ON(!ring->is_sw_ring);
>>> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
>>> +             if (amdgpu_mcbp_scan(mux) > 0)
>>> +                     amdgpu_mcbp_trigger_preempt(mux);
>>> +             return;
>>> +     }
>>> +
>>> +     amdgpu_ring_mux_start_ib(mux, ring); }
>>> +
>>> +void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring) {
>>> +     struct amdgpu_device *adev = ring->adev;
>>> +     struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
>>> +
>>> +     WARN_ON(!ring->is_sw_ring);
>>> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
>>> +             return;
>>> +     amdgpu_ring_mux_end_ib(mux, ring); }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index 9596c22fded6..b7e94553f4fb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -601,6 +601,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>>>        if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>>>                return 0;
>>>
>>> +     amdgpu_ring_ib_begin(ring);
>>>        if (ring->funcs->init_cond_exec)
>>>                patch_offset = amdgpu_ring_init_cond_exec(ring);
>>>
>>> @@ -661,6 +662,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>>>                amdgpu_ring_emit_switch_buffer(ring);
>>>                amdgpu_ring_emit_switch_buffer(ring);
>>>        }
>>> +     amdgpu_ring_ib_end(ring);
>>>        return 0;
>>>    }
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 669532f658da..1620300f0dde 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -5619,7 +5619,7 @@ static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
>>>        ring->trail_seq += 1;
>>>        amdgpu_ring_alloc(ring, 13);
>>>        gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
>>> -                              ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
>>> +                              ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC
>>> + | AMDGPU_FENCE_FLAG_INT);
>>>        /*reset the CP_VMID_PREEMPT after trailing fence*/
>>>        amdgpu_ring_emit_wreg(ring,
>>>                              SOC15_REG_OFFSET(GC, 0,
>>> mmCP_VMID_PREEMPT), @@ -6045,9 +6045,11 @@ static int
>>> gfx_v9_0_eop_irq(struct amdgpu_device *adev,
>>>
>>>        switch (me_id) {
>>>        case 0:
>>> -             /* Fence signals are handled on the software rings*/
>>> -             for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
>>> -                     amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
>>> +             if (!amdgpu_mcbp_handle_trailing_fence_irq(&adev->gfx.muxer)) {
>>> +                     /* Fence signals are handled on the software rings*/
>>> +                     for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
>>> +                             amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
>>> +             }
>>>                break;
>>>        case 1:
>>>        case 2:
> 

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)
  2022-09-27 17:49         ` Michel Dänzer
@ 2022-09-27 18:20           ` Christian König
  2022-09-28  1:01             ` Zhu, Jiadong
  0 siblings, 1 reply; 15+ messages in thread
From: Christian König @ 2022-09-27 18:20 UTC (permalink / raw)
  To: Michel Dänzer
  Cc: Grodzovsky, Andrey, Tuikov, Luben, Zhu, Jiadong, amd-gfx

This work is solely for gfx9 (e.g. Vega) and older.

Navi has a completely separate high priority gfx queue we can use for this.

Thanks,
Christian.

Am 27.09.22 um 19:49 schrieb Michel Dänzer:
> On 2022-09-27 08:06, Christian König wrote:
>> Hey Michel,
>>
>> JIadong is working on exposing high/low priority gfx queues for gfx9 and older hw generations by using mid command buffer preemption.
> Yeah, I've been keeping an eye on these patches. I'm looking forward to this working.
>
>
>> I know that you have been working on Gnome Mutter to make use from userspace for this. Do you have time to run some tests with that?
> I just tested the v8 series (first without amdgpu.mcbp=1 on the kernel command line, then with it, since I wasn't sure if it's needed) with https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.gnome.org%2FGNOME%2Fmutter%2F-%2Fmerge_requests%2F1880&amp;data=05%7C01%7Cchristian.koenig%40amd.com%7Cc6345d9230004549ba4d08daa0b0abcd%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637998977913548768%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=P1Qo2AwDmfmPrxJe2SxTFsVjdJ9vjabK8s84ZVz%2Beh8%3D&amp;reserved=0 on Navi 14.
>
> Unfortunately, I'm not seeing any change in behaviour. Even though mutter uses a high priority context via the EGL_IMG_context_priority extension, it's unable to reach a higher frame rate than a GPU-limited client[0]. The "Last preempted" line of /sys/kernel/debug/dri/0/amdgpu_fence_info remains at 0x00000000.
>
> Did I miss a step?
>
>
> [0] I used the GpuTest pixmark piano & plot3d benchmarks. With an Intel iGPU, mutter can achieve a higher frame rate than plot3d, though not than pixmark piano (presumably due to limited GPU preemption granularity).
>
>> Am 27.09.22 um 05:18 schrieb Zhu, Jiadong:
>>> [AMD Official Use Only - General]
>>>
>>>> I need more time for an in-depth review of this, but from the one mile high view it looks correct to me now.
>>>> Can we do some pre-commit qa testing with this?
>>> I changed drm test "Command submission Test (GFX)" to send high priority ibs meanwhile running Manhattan on Screen/Unigine heaven foreground, checking mcbp/resubmit triggered by cat /sys/kernel/debug/dri/0/amdgpu_fence_info
>>>
>>> I have continued running this scenario for 2 daytimes and 1 night, and no hangs have happened yet (lots of hangs have been fixed in the previous patches).
>>>
>>> I will ask QA team to do more test.
>>>
>>> Thanks,
>>> JIadong
>>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Monday, September 26, 2022 2:49 PM
>>> To: Zhu, Jiadong <Jiadong.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>> Subject: Re: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)
>>>
>>> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>>>
>>>
>>> Am 23.09.22 um 15:16 schrieb jiadong.zhu@amd.com:
>>>> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>>>>
>>>> Trigger Mid-Command Buffer Preemption according to the priority of the
>>>> software rings and the hw fence signalling condition.
>>>>
>>>> The muxer saves the locations of the indirect buffer frames from the
>>>> software ring together with the fence sequence number in its fifo
>>>> queue, and pops out those records when the fences are signalled. The
>>>> locations are used to resubmit packages in preemption scenarios by copying the chunks from the software ring.
>>>>
>>>> v2: Update comment style.
>>>> v3: Fix conflict caused by previous modifications.
>>>> v4: Remove unnecessary prints.
>>>> v5: Fix corner cases for resubmission cases.
>>>> v6: Refactor functions for resubmission, calling fence_process in irq handler.
>>>>
>>>> Cc: Christian Koenig <Christian.Koenig@amd.com>
>>>> Cc: Luben Tuikov <Luben.Tuikov@amd.com>
>>>> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
>>>> Acked-by: Luben Tuikov <luben.tuikov@amd.com>
>>>> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
>>> I need more time for an in-depth review of this, but from the one mile high view it looks correct to me now.
>>>
>>> Can we do some pre-commit qa testing with this?
>>>
>>> Thanks,
>>> Christian.
>>>
>>>> ---
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c       |   2 +
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c     |  13 +
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   3 +
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 323 ++++++++++++++++---
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  30 ++
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c  |  26 ++
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c       |   2 +
>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c        |  10 +-
>>>>     8 files changed, 368 insertions(+), 41 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> index 258cffe3c06a..af86d87e2f3b 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> @@ -211,6 +211,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>                 }
>>>>         }
>>>>
>>>> +     amdgpu_ring_ib_begin(ring);
>>>>         if (job && ring->funcs->init_cond_exec)
>>>>                 patch_offset = amdgpu_ring_init_cond_exec(ring);
>>>>
>>>> @@ -285,6 +286,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>             ring->hw_prio == AMDGPU_GFX_PIPE_PRIO_HIGH)
>>>>                 ring->funcs->emit_wave_limit(ring, false);
>>>>
>>>> +     amdgpu_ring_ib_end(ring);
>>>>         amdgpu_ring_commit(ring);
>>>>         return 0;
>>>>     }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>> index 13db99d653bd..84b0b3c7d40f 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>> @@ -33,6 +33,7 @@
>>>>
>>>>     #include <drm/amdgpu_drm.h>
>>>>     #include "amdgpu.h"
>>>> +#include "amdgpu_sw_ring.h"
>>>>     #include "atom.h"
>>>>
>>>>     /*
>>>> @@ -569,3 +570,15 @@ int amdgpu_ring_init_mqd(struct amdgpu_ring
>>>> *ring)
>>>>
>>>>         return mqd_mgr->init_mqd(adev, ring->mqd_ptr, &prop);
>>>>     }
>>>> +
>>>> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring) {
>>>> +     if (ring->is_sw_ring)
>>>> +             amdgpu_sw_ring_ib_begin(ring); }
>>>> +
>>>> +void amdgpu_ring_ib_end(struct amdgpu_ring *ring) {
>>>> +     if (ring->is_sw_ring)
>>>> +             amdgpu_sw_ring_ib_end(ring); }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> index e90d327a589e..6fbc1627dab7 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> @@ -312,6 +312,9 @@ struct amdgpu_ring {
>>>>     #define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r)
>>>>
>>>>     int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
>>>> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring); void
>>>> +amdgpu_ring_ib_end(struct amdgpu_ring *ring);
>>>> +
>>>>     void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
>>>>     void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
>>>>     void amdgpu_ring_commit(struct amdgpu_ring *ring); diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>>>> index 662aadebf111..788567e3b743 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>>>> @@ -28,23 +28,146 @@
>>>>
>>>>     #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
>>>>
>>>> +static struct kmem_cache *amdgpu_mux_chunk_slab;
>>>> +
>>>> +static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
>>>> +                                                             struct
>>>> +amdgpu_ring *ring) {
>>>> +     return ring->entry_index < mux->ring_entry_size ?
>>>> +                     &mux->ring_entry[ring->entry_index] : NULL; }
>>>> +
>>>> +/* copy packages on sw ring range[begin, end) */ static void
>>>> +amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
>>>> +                                               struct amdgpu_ring *ring,
>>>> +                                               u64 s_start, u64
>>>> +s_end) {
>>>> +     u64 start, end;
>>>> +     struct amdgpu_ring *real_ring = mux->real_ring;
>>>> +
>>>> +     start = s_start & ring->buf_mask;
>>>> +     end = s_end & ring->buf_mask;
>>>> +
>>>> +     if (start == end) {
>>>> +             DRM_ERROR("no more data copied from sw ring\n");
>>>> +             return;
>>>> +     }
>>>> +     if (start > end) {
>>>> +             amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
>>>> +             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
>>>> +                                        (ring->ring_size >> 2) - start);
>>>> +             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
>>>> +     } else {
>>>> +             amdgpu_ring_alloc(real_ring, end - start);
>>>> +             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
>>>> +     }
>>>> +}
>>>> +
>>>> +static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux,
>>>> +bool is_fallback) {
>>>> +     struct amdgpu_mux_entry *e = NULL;
>>>> +     struct amdgpu_mux_chunk *chunk;
>>>> +     uint32_t seq, last_seq;
>>>> +     int i;
>>>> +
>>>> +     if (is_fallback) {
>>>> +             if (!spin_trylock(&mux->lock)) {
>>>> +                     amdgpu_ring_mux_schedule_resubmit(mux);
>>>> +                     DRM_ERROR("reschedule resubmit\n");
>>>> +                     return;
>>>> +             }
>>>> +     } else {
>>>> +             spin_lock(&mux->lock);
>>>> +     }
>>>> +
>>>> +     /*find low priority entries:*/
>>>> +     if (!mux->s_resubmit) {
>>>> +             spin_unlock(&mux->lock);
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>>>> +             if (mux->ring_entry[i].ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
>>>> +                     e = &mux->ring_entry[i];
>>>> +                     break;
>>>> +             }
>>>> +     }
>>>> +
>>>> +     if (!e) {
>>>> +             DRM_ERROR("%s no low priority ring found\n", __func__);
>>>> +             spin_unlock(&mux->lock);
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     last_seq = atomic_read(&e->ring->fence_drv.last_seq);
>>>> +     seq = mux->seqno_to_resubmit;
>>>> +     if (last_seq < seq) {
>>>> +             /*resubmit all the fences between (last_seq, seq]*/
>>>> +             list_for_each_entry(chunk, &e->list, entry) {
>>>> +                     if (chunk->sync_seq > last_seq && chunk->sync_seq <= seq) {
>>>> +                             amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, e->ring,
>>>> +                                                                   chunk->start,
>>>> +                                                                   chunk->end);
>>>> +                             mux->wptr_resubmit = chunk->end;
>>>> +                             amdgpu_ring_commit(mux->real_ring);
>>>> +                     }
>>>> +             }
>>>> +     }
>>>> +
>>>> +     del_timer(&mux->resubmit_timer);
>>>> +     mux->s_resubmit = false;
>>>> +     spin_unlock(&mux->lock);
>>>> +}
>>>> +
>>>> +static void amdgpu_mux_resubmit_fallback(struct timer_list *t) {
>>>> +     struct amdgpu_ring_mux *mux = from_timer(mux, t,
>>>> +resubmit_timer);
>>>> +
>>>> +     DRM_INFO("calling %s\n", __func__);
>>>> +     amdgpu_mux_resubmit_chunks(mux, true); }
>>>> +
>>>>     int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
>>>>                          unsigned int entry_size)
>>>>     {
>>>>         mux->real_ring = ring;
>>>>         mux->num_ring_entries = 0;
>>>> +
>>>>         mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
>>>>         if (!mux->ring_entry)
>>>>                 return -ENOMEM;
>>>>
>>>>         mux->ring_entry_size = entry_size;
>>>> +     mux->s_resubmit = false;
>>>> +
>>>> +     amdgpu_mux_chunk_slab = kmem_cache_create("amdgpu_mux_chunk",
>>>> +                                               sizeof(struct amdgpu_mux_chunk), 0,
>>>> +                                               SLAB_HWCACHE_ALIGN, NULL);
>>>> +     if (!amdgpu_mux_chunk_slab) {
>>>> +             DRM_ERROR("create amdgpu_mux_chunk cache failed\n");
>>>> +             return -ENOMEM;
>>>> +     }
>>>> +
>>>>         spin_lock_init(&mux->lock);
>>>> +     timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback,
>>>> + 0);
>>>>
>>>>         return 0;
>>>>     }
>>>>
>>>>     void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
>>>>     {
>>>> +     struct amdgpu_mux_entry *e;
>>>> +     struct amdgpu_mux_chunk *chunk, *chunk2;
>>>> +     int i;
>>>> +
>>>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>>>> +             e = &mux->ring_entry[i];
>>>> +             list_for_each_entry_safe(chunk, chunk2, &e->list, entry) {
>>>> +                     list_del(&chunk->entry);
>>>> +                     kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
>>>> +             }
>>>> +     }
>>>> +     kmem_cache_destroy(amdgpu_mux_chunk_slab);
>>>>         kfree(mux->ring_entry);
>>>>         mux->ring_entry = NULL;
>>>>         mux->num_ring_entries = 0;
>>>> @@ -64,62 +187,46 @@ int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring
>>>>         ring->entry_index = mux->num_ring_entries;
>>>>         e->ring = ring;
>>>>
>>>> +     INIT_LIST_HEAD(&e->list);
>>>>         mux->num_ring_entries += 1;
>>>>         return 0;
>>>>     }
>>>>
>>>> -static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
>>>> -                                                             struct amdgpu_ring *ring)
>>>> -{
>>>> -     return ring->entry_index < mux->ring_entry_size ?
>>>> -                     &mux->ring_entry[ring->entry_index] : NULL;
>>>> -}
>>>> -
>>>> -/* copy packages on sw ring range[begin, end) */ -static void
>>>> amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
>>>> -                                               struct amdgpu_ring *ring,
>>>> -                                               u64 s_start, u64 s_end)
>>>> -{
>>>> -     u64 start, end;
>>>> -     struct amdgpu_ring *real_ring = mux->real_ring;
>>>> -
>>>> -     start = s_start & ring->buf_mask;
>>>> -     end = s_end & ring->buf_mask;
>>>> -
>>>> -     if (start == end) {
>>>> -             DRM_ERROR("no more data copied from sw ring\n");
>>>> -             return;
>>>> -     }
>>>> -     if (start > end) {
>>>> -             amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
>>>> -             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
>>>> -                                        (ring->ring_size >> 2) - start);
>>>> -             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
>>>> -     } else {
>>>> -             amdgpu_ring_alloc(real_ring, end - start);
>>>> -             amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
>>>> -     }
>>>> -}
>>>> -
>>>>     void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
>>>>     {
>>>>         struct amdgpu_mux_entry *e;
>>>>
>>>> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
>>>> +             amdgpu_mux_resubmit_chunks(mux, false);
>>>> +
>>>>         e = amdgpu_ring_mux_sw_entry(mux, ring);
>>>>         if (!e) {
>>>>                 DRM_ERROR("cannot find entry for sw ring\n");
>>>>                 return;
>>>>         }
>>>>
>>>> +     /* We could skip this set wptr as preemption in process. */
>>>> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && mux->pending_trailing_fence_signaled) {
>>>> +             DRM_ERROR("amdgpu_ring_mux_set_wptr skipped\n");
>>>> +             return;
>>>> +     }
>>>> +
>>>>         spin_lock(&mux->lock);
>>>>         e->sw_cptr = e->sw_wptr;
>>>> +     /* Update cptr if the package already copied in resubmit functions */
>>>> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && e->sw_cptr < mux->wptr_resubmit)
>>>> +             e->sw_cptr = mux->wptr_resubmit;
>>>>         e->sw_wptr = wptr;
>>>>         e->start_ptr_in_hw_ring = mux->real_ring->wptr;
>>>>
>>>> -     amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
>>>> -     e->end_ptr_in_hw_ring = mux->real_ring->wptr;
>>>> -     amdgpu_ring_commit(mux->real_ring);
>>>> -
>>>> +     /* Skip copying for the packages already resubmitted.*/
>>>> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT || mux->wptr_resubmit < wptr) {
>>>> +             amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
>>>> +             e->end_ptr_in_hw_ring = mux->real_ring->wptr;
>>>> +             amdgpu_ring_commit(mux->real_ring);
>>>> +     } else {
>>>> +             e->end_ptr_in_hw_ring = mux->real_ring->wptr;
>>>> +     }
>>>>         spin_unlock(&mux->lock);
>>>>     }
>>>>
>>>> @@ -181,3 +288,145 @@ u64 amdgpu_ring_mux_get_rptr(struct
>>>> amdgpu_ring_mux *mux, struct amdgpu_ring *ri
>>>>
>>>>         return e->sw_rptr;
>>>>     }
>>>> +
>>>> +void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux) {
>>>> +     mod_timer(&mux->resubmit_timer, jiffies +
>>>> +AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT);
>>>> +}
>>>> +
>>>> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct
>>>> +amdgpu_ring *ring) {
>>>> +     struct amdgpu_mux_entry *e;
>>>> +     struct amdgpu_mux_chunk *chunk;
>>>> +
>>>> +     amdgpu_mux_resubmit_chunks(mux, false);
>>>> +
>>>> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
>>>> +     if (!e) {
>>>> +             DRM_ERROR("cannot find entry!\n");
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     chunk = kmem_cache_alloc(amdgpu_mux_chunk_slab, GFP_KERNEL);
>>>> +     if (!chunk) {
>>>> +             DRM_ERROR("alloc amdgpu_mux_chunk_slab failed\n");
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     chunk->start = ring->wptr;
>>>> +     list_add_tail(&chunk->entry, &e->list); }
>>>> +
>>>> +static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux
>>>> +*mux, struct amdgpu_ring *ring) {
>>>> +     uint32_t last_seq, size = 0;
>>>> +     struct amdgpu_mux_entry *e;
>>>> +     struct amdgpu_mux_chunk *chunk, *tmp;
>>>> +
>>>> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
>>>> +     if (!e) {
>>>> +             DRM_ERROR("cannot find entry!\n");
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     last_seq = atomic_read(&ring->fence_drv.last_seq);
>>>> +
>>>> +     list_for_each_entry_safe(chunk, tmp, &e->list, entry) {
>>>> +             if (chunk->sync_seq <= last_seq) {
>>>> +                     list_del(&chunk->entry);
>>>> +                     kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
>>>> +             } else {
>>>> +                     size++;
>>>> +             }
>>>> +     }
>>>> +}
>>>> +
>>>> +void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct
>>>> +amdgpu_ring *ring) {
>>>> +     struct amdgpu_mux_entry *e;
>>>> +     struct amdgpu_mux_chunk *chunk;
>>>> +
>>>> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
>>>> +     if (!e) {
>>>> +             DRM_ERROR("cannot find entry!\n");
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);
>>>> +     if (!chunk) {
>>>> +             DRM_ERROR("cannot find chunk!\n");
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     chunk->end = ring->wptr;
>>>> +     chunk->sync_seq = READ_ONCE(ring->fence_drv.sync_seq);
>>>> +
>>>> +     scan_and_remove_signaled_chunk(mux, ring); }
>>>> +
>>>> +/* Trigger Mid-Command Buffer Preemption (MCBP) and find if we need
>>>> +to resubmit. */ int amdgpu_mcbp_trigger_preempt(struct
>>>> +amdgpu_ring_mux *mux) {
>>>> +     int r;
>>>> +
>>>> +     spin_lock(&mux->lock);
>>>> +     mux->pending_trailing_fence_signaled = true;
>>>> +     r = amdgpu_ring_preempt_ib(mux->real_ring);
>>>> +     spin_unlock(&mux->lock);
>>>> +     return r;
>>>> +}
>>>> +
>>>> +bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux
>>>> +*mux) {
>>>> +     struct amdgpu_mux_entry *e;
>>>> +     struct amdgpu_ring *ring = NULL;
>>>> +     int i;
>>>> +
>>>> +     if (!mux->pending_trailing_fence_signaled)
>>>> +             return false;
>>>> +
>>>> +     if (mux->real_ring->trail_seq != le32_to_cpu(*mux->real_ring->trail_fence_cpu_addr))
>>>> +             return false;
>>>> +
>>>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>>>> +             e = &mux->ring_entry[i];
>>>> +             if (e->ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
>>>> +                     ring = e->ring;
>>>> +                     break;
>>>> +             }
>>>> +     }
>>>> +
>>>> +     if (!ring) {
>>>> +             DRM_ERROR("cannot find low priority ring\n");
>>>> +             return false;
>>>> +     }
>>>> +
>>>> +     amdgpu_fence_process(ring);
>>>> +     if (amdgpu_fence_count_emitted(ring) > 0) {
>>>> +             mux->s_resubmit = true;
>>>> +             mux->seqno_to_resubmit = ring->fence_drv.sync_seq;
>>>> +             amdgpu_ring_mux_schedule_resubmit(mux);
>>>> +     }
>>>> +
>>>> +     mux->pending_trailing_fence_signaled = false;
>>>> +     return true;
>>>> +}
>>>> +
>>>> +/*scan on low prio rings to have unsignaled fence and high ring has
>>>> +no fence.*/ int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux) {
>>>> +     struct amdgpu_ring *ring;
>>>> +     int i, need_preempt;
>>>> +
>>>> +     need_preempt = 0;
>>>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>>>> +             ring = mux->ring_entry[i].ring;
>>>> +             if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT &&
>>>> +                     amdgpu_fence_count_emitted(ring) > 0)
>>>> +                     return 0;
>>>> +             if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT &&
>>>> +                     amdgpu_fence_count_emitted(ring) > 0)
>>>> +                     need_preempt = 1;
>>>> +     }
>>>> +     return need_preempt && !mux->s_resubmit; }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>>>> index 8c1691e11b1c..bf8f5ca61605 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>>>> @@ -29,6 +29,7 @@
>>>>     #include "amdgpu_ring.h"
>>>>
>>>>     struct amdgpu_ring;
>>>> +
>>>>     /**
>>>>      * struct amdgpu_mux_entry - the entry recording software rings copying information.
>>>>      * @ring: the pointer to the software ring.
>>>> @@ -37,6 +38,7 @@ struct amdgpu_ring;
>>>>      * @sw_cptr: the position of the copy pointer in the sw ring.
>>>>      * @sw_rptr: the read pointer in software ring.
>>>>      * @sw_wptr: the write pointer in software ring.
>>>> + * @list: list head for amdgpu_mux_chunk
>>>>      */
>>>>     struct amdgpu_mux_entry {
>>>>         struct                  amdgpu_ring *ring;
>>>> @@ -45,6 +47,7 @@ struct amdgpu_mux_entry {
>>>>         u64                     sw_cptr;
>>>>         u64                     sw_rptr;
>>>>         u64                     sw_wptr;
>>>> +     struct list_head        list;
>>>>     };
>>>>
>>>>     struct amdgpu_ring_mux {
>>>> @@ -55,6 +58,26 @@ struct amdgpu_ring_mux {
>>>>         unsigned int            ring_entry_size;
>>>>         /*the lock for copy data from different software rings*/
>>>>         spinlock_t              lock;
>>>> +     bool                    s_resubmit;
>>>> +     uint32_t                seqno_to_resubmit;
>>>> +     u64                     wptr_resubmit;
>>>> +     struct timer_list       resubmit_timer;
>>>> +
>>>> +     bool                    pending_trailing_fence_signaled;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct amdgpu_mux_chunk - save the location of indirect buffer's package on softare rings.
>>>> + * @entry: the list entry.
>>>> + * @sync_seq: the fence seqno related with the saved IB.
>>>> + * @start:- start location on the software ring.
>>>> + * @end:- end location on the software ring.
>>>> + */
>>>> +struct amdgpu_mux_chunk {
>>>> +     struct list_head        entry;
>>>> +     uint32_t                sync_seq;
>>>> +     u64                     start;
>>>> +     u64                     end;
>>>>     };
>>>>
>>>>     int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct
>>>> amdgpu_ring *ring, @@ -65,4 +88,11 @@ void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
>>>>     u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
>>>>     u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct
>>>> amdgpu_ring *ring);
>>>>
>>>> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct
>>>> +amdgpu_ring *ring); void amdgpu_ring_mux_end_ib(struct
>>>> +amdgpu_ring_mux *mux, struct amdgpu_ring *ring); void
>>>> +amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux);
>>>> +
>>>> +int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux); int
>>>> +amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux); bool
>>>> +amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux);
>>>>     #endif
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>>>> index 5ae12d6641ca..a3ec7bdf72a6 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>>>> @@ -59,3 +59,29 @@ void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
>>>>     {
>>>>         WARN_ON(!ring->is_sw_ring);
>>>>     }
>>>> +
>>>> +void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring) {
>>>> +     struct amdgpu_device *adev = ring->adev;
>>>> +     struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
>>>> +
>>>> +     WARN_ON(!ring->is_sw_ring);
>>>> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
>>>> +             if (amdgpu_mcbp_scan(mux) > 0)
>>>> +                     amdgpu_mcbp_trigger_preempt(mux);
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     amdgpu_ring_mux_start_ib(mux, ring); }
>>>> +
>>>> +void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring) {
>>>> +     struct amdgpu_device *adev = ring->adev;
>>>> +     struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
>>>> +
>>>> +     WARN_ON(!ring->is_sw_ring);
>>>> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
>>>> +             return;
>>>> +     amdgpu_ring_mux_end_ib(mux, ring); }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index 9596c22fded6..b7e94553f4fb 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -601,6 +601,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>>>>         if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>>>>                 return 0;
>>>>
>>>> +     amdgpu_ring_ib_begin(ring);
>>>>         if (ring->funcs->init_cond_exec)
>>>>                 patch_offset = amdgpu_ring_init_cond_exec(ring);
>>>>
>>>> @@ -661,6 +662,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>>>>                 amdgpu_ring_emit_switch_buffer(ring);
>>>>                 amdgpu_ring_emit_switch_buffer(ring);
>>>>         }
>>>> +     amdgpu_ring_ib_end(ring);
>>>>         return 0;
>>>>     }
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index 669532f658da..1620300f0dde 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -5619,7 +5619,7 @@ static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
>>>>         ring->trail_seq += 1;
>>>>         amdgpu_ring_alloc(ring, 13);
>>>>         gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
>>>> -                              ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
>>>> +                              ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC
>>>> + | AMDGPU_FENCE_FLAG_INT);
>>>>         /*reset the CP_VMID_PREEMPT after trailing fence*/
>>>>         amdgpu_ring_emit_wreg(ring,
>>>>                               SOC15_REG_OFFSET(GC, 0,
>>>> mmCP_VMID_PREEMPT), @@ -6045,9 +6045,11 @@ static int
>>>> gfx_v9_0_eop_irq(struct amdgpu_device *adev,
>>>>
>>>>         switch (me_id) {
>>>>         case 0:
>>>> -             /* Fence signals are handled on the software rings*/
>>>> -             for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
>>>> -                     amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
>>>> +             if (!amdgpu_mcbp_handle_trailing_fence_irq(&adev->gfx.muxer)) {
>>>> +                     /* Fence signals are handled on the software rings*/
>>>> +                     for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
>>>> +                             amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
>>>> +             }
>>>>                 break;
>>>>         case 1:
>>>>         case 2:


^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)
  2022-09-27 18:20           ` Christian König
@ 2022-09-28  1:01             ` Zhu, Jiadong
  2022-09-28 13:52               ` Michel Dänzer
  0 siblings, 1 reply; 15+ messages in thread
From: Zhu, Jiadong @ 2022-09-28  1:01 UTC (permalink / raw)
  To: Koenig, Christian, Michel Dänzer
  Cc: Grodzovsky, Andrey, Tuikov, Luben, amd-gfx

[AMD Official Use Only - General]

Hi Michel,

Please make sure the UMD is calling the libdrm function to create contexts with different priorities,
amdgpu_cs_ctx_create2(device_handle, AMDGPU_CTX_PRIORITY_HIGH, &context_handle).

Here is the behavior we could see:
1. After modprobe amdgpu, two software rings named gfx_high/gfx_low (named gfx_sw in a previous patch) are visible in UMR. We could check the wptr/rptr to see if they are being used.
2. MCBP happens while the two different priority ibs are submitted at the same time. We could check fence info as below:
"Last signaled trailing fence" increments when the MCBP is triggered by the KMD. "Last preempted" may not increase, as the MCBP is not triggered from the CP.

--- ring 0 (gfx) ---
Last signaled fence          0x00000001
Last emitted                 0x00000001
Last signaled trailing fence 0x0022eb84
Last emitted                 0x0022eb84
Last preempted               0x00000000
Last reset                   0x00000000

Thanks,
Jiadong
-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com>
Sent: Wednesday, September 28, 2022 2:20 AM
To: Michel Dänzer <michel@daenzer.net>
Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Zhu, Jiadong <Jiadong.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)

This work is solely for gfx9 (e.g. Vega) and older.

Navi has a completely separate high priority gfx queue we can use for this.

Thanks,
Christian.

Am 27.09.22 um 19:49 schrieb Michel Dänzer:
> On 2022-09-27 08:06, Christian König wrote:
>> Hey Michel,
>>
>> JIadong is working on exposing high/low priority gfx queues for gfx9 and older hw generations by using mid command buffer preemption.
> Yeah, I've been keeping an eye on these patches. I'm looking forward to this working.
>
>
>> I know that you have been working on Gnome Mutter to make use from userspace for this. Do you have time to run some tests with that?
> I just tested the v8 series (first without amdgpu.mcbp=1 on the kernel command line, then with it, since I wasn't sure if it's needed) with https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.gnome.org%2FGNOME%2Fmutter%2F-%2Fmerge_requests%2F1880&amp;data=05%7C01%7Cchristian.koenig%40amd.com%7Cc6345d9230004549ba4d08daa0b0abcd%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637998977913548768%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=P1Qo2AwDmfmPrxJe2SxTFsVjdJ9vjabK8s84ZVz%2Beh8%3D&amp;reserved=0 on Navi 14.
>
> Unfortunately, I'm not seeing any change in behaviour. Even though mutter uses a high priority context via the EGL_IMG_context_priority extension, it's unable to reach a higher frame rate than a GPU-limited client[0]. The "Last preempted" line of /sys/kernel/debug/dri/0/amdgpu_fence_info remains at 0x00000000.
>
> Did I miss a step?
>
>
> [0] I used the GpuTest pixmark piano & plot3d benchmarks. With an Intel iGPU, mutter can achieve a higher frame rate than plot3d, though not than pixmark piano (presumably due to limited GPU preemption granularity).
>
>> Am 27.09.22 um 05:18 schrieb Zhu, Jiadong:
>>> [AMD Official Use Only - General]
>>>
>>>> I need more time for an in deep review of this, but form the one mile high view it looks correct to me now.
>>>> Can we do some pre-commit qa testing with this?
>>> I changed drm test "Command submission Test (GFX)" to send high
>>> priority ibs meanwhile running Manhattan on Screen/Unigine heaven
>>> foreground, checking mcbp/resubmit triggered by cat
>>> /sys/kernel/debug/dri/0/amdgpu_fence_info
>>>
>>> I have continued running this scenario for 2 daytimes and 1 night; no hangs have happened yet (lots of hangs have been fixed in the previous patches).
>>>
>>> I will ask QA team to do more test.
>>>
>>> Thanks,
>>> JIadong
>>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Monday, September 26, 2022 2:49 PM
>>> To: Zhu, Jiadong <Jiadong.Zhu@amd.com>;
>>> amd-gfx@lists.freedesktop.org
>>> Cc: Tuikov, Luben <Luben.Tuikov@amd.com>; Koenig, Christian
>>> <Christian.Koenig@amd.com>; Grodzovsky, Andrey
>>> <Andrey.Grodzovsky@amd.com>
>>> Subject: Re: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler
>>> (v6)
>>>
>>> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>>>
>>>
>>> Am 23.09.22 um 15:16 schrieb jiadong.zhu@amd.com:
>>>> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>>>>
>>>> Trigger Mid-Command Buffer Preemption according to the priority of
>>>> the software rings and the hw fence signalling condition.
>>>>
>>>> The muxer saves the locations of the indirect buffer frames from
>>>> the software ring together with the fence sequence number in its
>>>> fifo queue, and pops out those records when the fences are
>>>> signalled. The locations are used to resubmit packages in preemption scenarios by copying the chunks from the software ring.
>>>>
>>>> v2: Update comment style.
>>>> v3: Fix conflict caused by previous modifications.
>>>> v4: Remove unnecessary prints.
>>>> v5: Fix corner cases for resubmission cases.
>>>> v6: Refactor functions for resubmission, calling fence_process in irq handler.
>>>>
>>>> Cc: Christian Koenig <Christian.Koenig@amd.com>
>>>> Cc: Luben Tuikov <Luben.Tuikov@amd.com>
>>>> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
>>>> Acked-by: Luben Tuikov <luben.tuikov@amd.com>
>>>> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
>>> I need more time for an in deep review of this, but form the one mile high view it looks correct to me now.
>>>
>>> Can we do some pre-commit qa testing with this?
>>>
>>> Thanks,
>>> Christian.
>>>
>>>> ---
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c       |   2 +
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c     |  13 +
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   3 +
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 323
>>>> ++++++++++++++++---
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  30 ++
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c  |  26 ++
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c       |   2 +
>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c        |  10 +-
>>>>     8 files changed, 368 insertions(+), 41 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> index 258cffe3c06a..af86d87e2f3b 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> @@ -211,6 +211,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring
>>>> *ring, unsigned num_ibs,
>>>>                 }
>>>>         }
>>>>
>>>> +     amdgpu_ring_ib_begin(ring);
>>>>         if (job && ring->funcs->init_cond_exec)
>>>>                 patch_offset = amdgpu_ring_init_cond_exec(ring);
>>>>
>>>> @@ -285,6 +286,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring
>>>> *ring, unsigned num_ibs,
>>>>             ring->hw_prio == AMDGPU_GFX_PIPE_PRIO_HIGH)
>>>>                 ring->funcs->emit_wave_limit(ring, false);
>>>>
>>>> +     amdgpu_ring_ib_end(ring);
>>>>         amdgpu_ring_commit(ring);
>>>>         return 0;
>>>>     }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>> index 13db99d653bd..84b0b3c7d40f 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>> @@ -33,6 +33,7 @@
>>>>
>>>>     #include <drm/amdgpu_drm.h>
>>>>     #include "amdgpu.h"
>>>> +#include "amdgpu_sw_ring.h"
>>>>     #include "atom.h"
>>>>
>>>>     /*
>>>> @@ -569,3 +570,15 @@ int amdgpu_ring_init_mqd(struct amdgpu_ring
>>>> *ring)
>>>>
>>>>         return mqd_mgr->init_mqd(adev, ring->mqd_ptr, &prop);
>>>>     }
>>>> +
>>>> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring) {
>>>> +     if (ring->is_sw_ring)
>>>> +             amdgpu_sw_ring_ib_begin(ring); }
>>>> +
>>>> +void amdgpu_ring_ib_end(struct amdgpu_ring *ring) {
>>>> +     if (ring->is_sw_ring)
>>>> +             amdgpu_sw_ring_ib_end(ring); }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> index e90d327a589e..6fbc1627dab7 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> @@ -312,6 +312,9 @@ struct amdgpu_ring {
>>>>     #define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r)
>>>>
>>>>     int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
>>>> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring); void
>>>> +amdgpu_ring_ib_end(struct amdgpu_ring *ring);
>>>> +
>>>>     void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t
>>>> count);
>>>>     void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring,
>>>> struct amdgpu_ib *ib);
>>>>     void amdgpu_ring_commit(struct amdgpu_ring *ring); diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>>>> index 662aadebf111..788567e3b743 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
>>>> @@ -28,23 +28,146 @@
>>>>
>>>>     #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
>>>>
>>>> +static struct kmem_cache *amdgpu_mux_chunk_slab;
>>>> +
>>>> +static inline struct amdgpu_mux_entry
>>>> +*amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
>>>> +
>>>> +struct amdgpu_ring *ring) {
>>>> +     return ring->entry_index < mux->ring_entry_size ?
>>>> +                     &mux->ring_entry[ring->entry_index] : NULL; }
>>>> +
>>>> +/* copy packages on sw ring range[begin, end) */ static void
>>>> +amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
>>>> +                                               struct amdgpu_ring
>>>> +*ring,
>>>> +                                               u64 s_start, u64
>>>> +s_end) {
>>>> +     u64 start, end;
>>>> +     struct amdgpu_ring *real_ring = mux->real_ring;
>>>> +
>>>> +     start = s_start & ring->buf_mask;
>>>> +     end = s_end & ring->buf_mask;
>>>> +
>>>> +     if (start == end) {
>>>> +             DRM_ERROR("no more data copied from sw ring\n");
>>>> +             return;
>>>> +     }
>>>> +     if (start > end) {
>>>> +             amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) +
>>>> +end - start);
>>>> +             amdgpu_ring_write_multiple(real_ring, (void
>>>> +*)&ring->ring[start],
>>>> +                                        (ring->ring_size >> 2) -
>>>> +start);
>>>> +             amdgpu_ring_write_multiple(real_ring, (void
>>>> +*)&ring->ring[0], end);
>>>> +     } else {
>>>> +             amdgpu_ring_alloc(real_ring, end - start);
>>>> +             amdgpu_ring_write_multiple(real_ring, (void
>>>> +*)&ring->ring[start], end - start);
>>>> +     }
>>>> +}
>>>> +
>>>> +static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux
>>>> +*mux, bool is_fallback) {
>>>> +     struct amdgpu_mux_entry *e = NULL;
>>>> +     struct amdgpu_mux_chunk *chunk;
>>>> +     uint32_t seq, last_seq;
>>>> +     int i;
>>>> +
>>>> +     if (is_fallback) {
>>>> +             if (!spin_trylock(&mux->lock)) {
>>>> +                     amdgpu_ring_mux_schedule_resubmit(mux);
>>>> +                     DRM_ERROR("reschedule resubmit\n");
>>>> +                     return;
>>>> +             }
>>>> +     } else {
>>>> +             spin_lock(&mux->lock);
>>>> +     }
>>>> +
>>>> +     /*find low priority entries:*/
>>>> +     if (!mux->s_resubmit) {
>>>> +             spin_unlock(&mux->lock);
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>>>> +             if (mux->ring_entry[i].ring->hw_prio <=
>>>> +AMDGPU_RING_PRIO_DEFAULT) {
>>>> +                     e = &mux->ring_entry[i];
>>>> +                     break;
>>>> +             }
>>>> +     }
>>>> +
>>>> +     if (!e) {
>>>> +             DRM_ERROR("%s no low priority ring found\n",
>>>> +__func__);
>>>> +             spin_unlock(&mux->lock);
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     last_seq = atomic_read(&e->ring->fence_drv.last_seq);
>>>> +     seq = mux->seqno_to_resubmit;
>>>> +     if (last_seq < seq) {
>>>> +             /*resubmit all the fences between (last_seq, seq]*/
>>>> +             list_for_each_entry(chunk, &e->list, entry) {
>>>> +                     if (chunk->sync_seq > last_seq &&
>>>> +chunk->sync_seq <= seq) {
>>>> +
>>>> +amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, e->ring,
>>>> +
>>>> +chunk->start,
>>>> +
>>>> +chunk->end);
>>>> +                             mux->wptr_resubmit = chunk->end;
>>>> +                             amdgpu_ring_commit(mux->real_ring);
>>>> +                     }
>>>> +             }
>>>> +     }
>>>> +
>>>> +     del_timer(&mux->resubmit_timer);
>>>> +     mux->s_resubmit = false;
>>>> +     spin_unlock(&mux->lock);
>>>> +}
>>>> +
>>>> +static void amdgpu_mux_resubmit_fallback(struct timer_list *t) {
>>>> +     struct amdgpu_ring_mux *mux = from_timer(mux, t,
>>>> +resubmit_timer);
>>>> +
>>>> +     DRM_INFO("calling %s\n", __func__);
>>>> +     amdgpu_mux_resubmit_chunks(mux, true); }
>>>> +
>>>>     int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct
>>>> amdgpu_ring *ring,
>>>>                          unsigned int entry_size)
>>>>     {
>>>>         mux->real_ring = ring;
>>>>         mux->num_ring_entries = 0;
>>>> +
>>>>         mux->ring_entry = kcalloc(entry_size, sizeof(struct
>>>> amdgpu_mux_entry), GFP_KERNEL);
>>>>         if (!mux->ring_entry)
>>>>                 return -ENOMEM;
>>>>
>>>>         mux->ring_entry_size = entry_size;
>>>> +     mux->s_resubmit = false;
>>>> +
>>>> +     amdgpu_mux_chunk_slab = kmem_cache_create("amdgpu_mux_chunk",
>>>> +                                               sizeof(struct
>>>> +amdgpu_mux_chunk), 0,
>>>> +                                               SLAB_HWCACHE_ALIGN,
>>>> +NULL);
>>>> +     if (!amdgpu_mux_chunk_slab) {
>>>> +             DRM_ERROR("create amdgpu_mux_chunk cache failed\n");
>>>> +             return -ENOMEM;
>>>> +     }
>>>> +
>>>>         spin_lock_init(&mux->lock);
>>>> +     timer_setup(&mux->resubmit_timer,
>>>> +amdgpu_mux_resubmit_fallback,  0);
>>>>
>>>>         return 0;
>>>>     }
>>>>
>>>>     void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
>>>>     {
>>>> +     struct amdgpu_mux_entry *e;
>>>> +     struct amdgpu_mux_chunk *chunk, *chunk2;
>>>> +     int i;
>>>> +
>>>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>>>> +             e = &mux->ring_entry[i];
>>>> +             list_for_each_entry_safe(chunk, chunk2, &e->list,
>>>> +entry) {
>>>> +                     list_del(&chunk->entry);
>>>> +                     kmem_cache_free(amdgpu_mux_chunk_slab,
>>>> +chunk);
>>>> +             }
>>>> +     }
>>>> +     kmem_cache_destroy(amdgpu_mux_chunk_slab);
>>>>         kfree(mux->ring_entry);
>>>>         mux->ring_entry = NULL;
>>>>         mux->num_ring_entries = 0;
>>>> @@ -64,62 +187,46 @@ int amdgpu_ring_mux_add_sw_ring(struct
>>>> amdgpu_ring_mux *mux, struct amdgpu_ring
>>>>         ring->entry_index = mux->num_ring_entries;
>>>>         e->ring = ring;
>>>>
>>>> +     INIT_LIST_HEAD(&e->list);
>>>>         mux->num_ring_entries += 1;
>>>>         return 0;
>>>>     }
>>>>
>>>> -static inline struct amdgpu_mux_entry
>>>> *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
>>>> -
>>>> struct amdgpu_ring *ring) -{
>>>> -     return ring->entry_index < mux->ring_entry_size ?
>>>> -                     &mux->ring_entry[ring->entry_index] : NULL;
>>>> -}
>>>> -
>>>> -/* copy packages on sw ring range[begin, end) */ -static void
>>>> amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
>>>> -                                               struct amdgpu_ring
>>>> *ring,
>>>> -                                               u64 s_start, u64
>>>> s_end) -{
>>>> -     u64 start, end;
>>>> -     struct amdgpu_ring *real_ring = mux->real_ring;
>>>> -
>>>> -     start = s_start & ring->buf_mask;
>>>> -     end = s_end & ring->buf_mask;
>>>> -
>>>> -     if (start == end) {
>>>> -             DRM_ERROR("no more data copied from sw ring\n");
>>>> -             return;
>>>> -     }
>>>> -     if (start > end) {
>>>> -             amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) +
>>>> end - start);
>>>> -             amdgpu_ring_write_multiple(real_ring, (void
>>>> *)&ring->ring[start],
>>>> -                                        (ring->ring_size >> 2) -
>>>> start);
>>>> -             amdgpu_ring_write_multiple(real_ring, (void
>>>> *)&ring->ring[0], end);
>>>> -     } else {
>>>> -             amdgpu_ring_alloc(real_ring, end - start);
>>>> -             amdgpu_ring_write_multiple(real_ring, (void
>>>> *)&ring->ring[start], end - start);
>>>> -     }
>>>> -}
>>>> -
>>>>     void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux,
>>>> struct amdgpu_ring *ring, u64 wptr)
>>>>     {
>>>>         struct amdgpu_mux_entry *e;
>>>>
>>>> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
>>>> +             amdgpu_mux_resubmit_chunks(mux, false);
>>>> +
>>>>         e = amdgpu_ring_mux_sw_entry(mux, ring);
>>>>         if (!e) {
>>>>                 DRM_ERROR("cannot find entry for sw ring\n");
>>>>                 return;
>>>>         }
>>>>
>>>> +     /* We could skip this set wptr as preemption in process. */
>>>> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT &&
>>>> +mux->pending_trailing_fence_signaled) {
>>>> +             DRM_ERROR("amdgpu_ring_mux_set_wptr skipped\n");
>>>> +             return;
>>>> +     }
>>>> +
>>>>         spin_lock(&mux->lock);
>>>>         e->sw_cptr = e->sw_wptr;
>>>> +     /* Update cptr if the package already copied in resubmit
>>>> +functions */
>>>> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && e->sw_cptr <
>>>> +mux->wptr_resubmit)
>>>> +             e->sw_cptr = mux->wptr_resubmit;
>>>>         e->sw_wptr = wptr;
>>>>         e->start_ptr_in_hw_ring = mux->real_ring->wptr;
>>>>
>>>> -     amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr,
>>>> wptr);
>>>> -     e->end_ptr_in_hw_ring = mux->real_ring->wptr;
>>>> -     amdgpu_ring_commit(mux->real_ring);
>>>> -
>>>> +     /* Skip copying for the packages already resubmitted.*/
>>>> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT ||
>>>> +mux->wptr_resubmit < wptr) {
>>>> +             amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring,
>>>> +e->sw_cptr, wptr);
>>>> +             e->end_ptr_in_hw_ring = mux->real_ring->wptr;
>>>> +             amdgpu_ring_commit(mux->real_ring);
>>>> +     } else {
>>>> +             e->end_ptr_in_hw_ring = mux->real_ring->wptr;
>>>> +     }
>>>>         spin_unlock(&mux->lock);
>>>>     }
>>>>
>>>> @@ -181,3 +288,145 @@ u64 amdgpu_ring_mux_get_rptr(struct
>>>> amdgpu_ring_mux *mux, struct amdgpu_ring *ri
>>>>
>>>>         return e->sw_rptr;
>>>>     }
>>>> +
>>>> +void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux
>>>> +*mux) {
>>>> +     mod_timer(&mux->resubmit_timer, jiffies +
>>>> +AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT);
>>>> +}
>>>> +
>>>> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct
>>>> +amdgpu_ring *ring) {
>>>> +     struct amdgpu_mux_entry *e;
>>>> +     struct amdgpu_mux_chunk *chunk;
>>>> +
>>>> +     amdgpu_mux_resubmit_chunks(mux, false);
>>>> +
>>>> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
>>>> +     if (!e) {
>>>> +             DRM_ERROR("cannot find entry!\n");
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     chunk = kmem_cache_alloc(amdgpu_mux_chunk_slab, GFP_KERNEL);
>>>> +     if (!chunk) {
>>>> +             DRM_ERROR("alloc amdgpu_mux_chunk_slab failed\n");
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     chunk->start = ring->wptr;
>>>> +     list_add_tail(&chunk->entry, &e->list); }
>>>> +
>>>> +static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux
>>>> +*mux, struct amdgpu_ring *ring) {
>>>> +     uint32_t last_seq, size = 0;
>>>> +     struct amdgpu_mux_entry *e;
>>>> +     struct amdgpu_mux_chunk *chunk, *tmp;
>>>> +
>>>> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
>>>> +     if (!e) {
>>>> +             DRM_ERROR("cannot find entry!\n");
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     last_seq = atomic_read(&ring->fence_drv.last_seq);
>>>> +
>>>> +     list_for_each_entry_safe(chunk, tmp, &e->list, entry) {
>>>> +             if (chunk->sync_seq <= last_seq) {
>>>> +                     list_del(&chunk->entry);
>>>> +                     kmem_cache_free(amdgpu_mux_chunk_slab,
>>>> +chunk);
>>>> +             } else {
>>>> +                     size++;
>>>> +             }
>>>> +     }
>>>> +}
>>>> +
>>>> +void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct
>>>> +amdgpu_ring *ring) {
>>>> +     struct amdgpu_mux_entry *e;
>>>> +     struct amdgpu_mux_chunk *chunk;
>>>> +
>>>> +     e = amdgpu_ring_mux_sw_entry(mux, ring);
>>>> +     if (!e) {
>>>> +             DRM_ERROR("cannot find entry!\n");
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk,
>>>> +entry);
>>>> +     if (!chunk) {
>>>> +             DRM_ERROR("cannot find chunk!\n");
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     chunk->end = ring->wptr;
>>>> +     chunk->sync_seq = READ_ONCE(ring->fence_drv.sync_seq);
>>>> +
>>>> +     scan_and_remove_signaled_chunk(mux, ring); }
>>>> +
>>>> +/* Trigger Mid-Command Buffer Preemption (MCBP) and find if we
>>>> +need to resubmit. */ int amdgpu_mcbp_trigger_preempt(struct
>>>> +amdgpu_ring_mux *mux) {
>>>> +     int r;
>>>> +
>>>> +     spin_lock(&mux->lock);
>>>> +     mux->pending_trailing_fence_signaled = true;
>>>> +     r = amdgpu_ring_preempt_ib(mux->real_ring);
>>>> +     spin_unlock(&mux->lock);
>>>> +     return r;
>>>> +}
>>>> +
>>>> +bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux
>>>> +*mux) {
>>>> +     struct amdgpu_mux_entry *e;
>>>> +     struct amdgpu_ring *ring = NULL;
>>>> +     int i;
>>>> +
>>>> +     if (!mux->pending_trailing_fence_signaled)
>>>> +             return false;
>>>> +
>>>> +     if (mux->real_ring->trail_seq !=
>>>> +le32_to_cpu(*mux->real_ring->trail_fence_cpu_addr))
>>>> +             return false;
>>>> +
>>>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>>>> +             e = &mux->ring_entry[i];
>>>> +             if (e->ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
>>>> +                     ring = e->ring;
>>>> +                     break;
>>>> +             }
>>>> +     }
>>>> +
>>>> +     if (!ring) {
>>>> +             DRM_ERROR("cannot find low priority ring\n");
>>>> +             return false;
>>>> +     }
>>>> +
>>>> +     amdgpu_fence_process(ring);
>>>> +     if (amdgpu_fence_count_emitted(ring) > 0) {
>>>> +             mux->s_resubmit = true;
>>>> +             mux->seqno_to_resubmit = ring->fence_drv.sync_seq;
>>>> +             amdgpu_ring_mux_schedule_resubmit(mux);
>>>> +     }
>>>> +
>>>> +     mux->pending_trailing_fence_signaled = false;
>>>> +     return true;
>>>> +}
>>>> +
>>>> +/*scan on low prio rings to have unsignaled fence and high ring
>>>> +has no fence.*/ int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux)
>>>> +{
>>>> +     struct amdgpu_ring *ring;
>>>> +     int i, need_preempt;
>>>> +
>>>> +     need_preempt = 0;
>>>> +     for (i = 0; i < mux->num_ring_entries; i++) {
>>>> +             ring = mux->ring_entry[i].ring;
>>>> +             if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT &&
>>>> +                     amdgpu_fence_count_emitted(ring) > 0)
>>>> +                     return 0;
>>>> +             if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT &&
>>>> +                     amdgpu_fence_count_emitted(ring) > 0)
>>>> +                     need_preempt = 1;
>>>> +     }
>>>> +     return need_preempt && !mux->s_resubmit; }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>>>> index 8c1691e11b1c..bf8f5ca61605 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
>>>> @@ -29,6 +29,7 @@
>>>>     #include "amdgpu_ring.h"
>>>>
>>>>     struct amdgpu_ring;
>>>> +
>>>>     /**
>>>>      * struct amdgpu_mux_entry - the entry recording software rings copying information.
>>>>      * @ring: the pointer to the software ring.
>>>> @@ -37,6 +38,7 @@ struct amdgpu_ring;
>>>>      * @sw_cptr: the position of the copy pointer in the sw ring.
>>>>      * @sw_rptr: the read pointer in software ring.
>>>>      * @sw_wptr: the write pointer in software ring.
>>>> + * @list: list head for amdgpu_mux_chunk
>>>>      */
>>>>     struct amdgpu_mux_entry {
>>>>         struct                  amdgpu_ring *ring; @@ -45,6 +47,7
>>>> @@ struct amdgpu_mux_entry {
>>>>         u64                     sw_cptr;
>>>>         u64                     sw_rptr;
>>>>         u64                     sw_wptr;
>>>> +     struct list_head        list;
>>>>     };
>>>>
>>>>     struct amdgpu_ring_mux {
>>>> @@ -55,6 +58,26 @@ struct amdgpu_ring_mux {
>>>>         unsigned int            ring_entry_size;
>>>>         /*the lock for copy data from different software rings*/
>>>>         spinlock_t              lock;
>>>> +     bool                    s_resubmit;
>>>> +     uint32_t                seqno_to_resubmit;
>>>> +     u64                     wptr_resubmit;
>>>> +     struct timer_list       resubmit_timer;
>>>> +
>>>> +     bool                    pending_trailing_fence_signaled; };
>>>> +
>>>> +/**
>>>> + * struct amdgpu_mux_chunk - save the location of indirect buffer's package on softare rings.
>>>> + * @entry: the list entry.
>>>> + * @sync_seq: the fence seqno related with the saved IB.
>>>> + * @start:- start location on the software ring.
>>>> + * @end:- end location on the software ring.
>>>> + */
>>>> +struct amdgpu_mux_chunk {
>>>> +     struct list_head        entry;
>>>> +     uint32_t                sync_seq;
>>>> +     u64                     start;
>>>> +     u64                     end;
>>>>     };
>>>>
>>>>     int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct
>>>> amdgpu_ring *ring, @@ -65,4 +88,11 @@ void
>>>> amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct
>>>> amdgpu_ring *r
>>>>     u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux,
>>>> struct amdgpu_ring *ring);
>>>>     u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux,
>>>> struct amdgpu_ring *ring);
>>>>
>>>> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct
>>>> +amdgpu_ring *ring); void amdgpu_ring_mux_end_ib(struct
>>>> +amdgpu_ring_mux *mux, struct amdgpu_ring *ring); void
>>>> +amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux);
>>>> +
>>>> +int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux); int
>>>> +amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux); bool
>>>> +amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux
>>>> +*mux);
>>>>     #endif
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>>>> index 5ae12d6641ca..a3ec7bdf72a6 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
>>>> @@ -59,3 +59,29 @@ void amdgpu_sw_ring_insert_nop(struct
>>>> amdgpu_ring *ring, uint32_t count)
>>>>     {
>>>>         WARN_ON(!ring->is_sw_ring);
>>>>     }
>>>> +
>>>> +void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring) {
>>>> +     struct amdgpu_device *adev = ring->adev;
>>>> +     struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
>>>> +
>>>> +     WARN_ON(!ring->is_sw_ring);
>>>> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
>>>> +             if (amdgpu_mcbp_scan(mux) > 0)
>>>> +                     amdgpu_mcbp_trigger_preempt(mux);
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     amdgpu_ring_mux_start_ib(mux, ring); }
>>>> +
>>>> +void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring) {
>>>> +     struct amdgpu_device *adev = ring->adev;
>>>> +     struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
>>>> +
>>>> +     WARN_ON(!ring->is_sw_ring);
>>>> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
>>>> +             return;
>>>> +     amdgpu_ring_mux_end_ib(mux, ring); }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index 9596c22fded6..b7e94553f4fb 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -601,6 +601,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring,
>>>> struct amdgpu_job *job,
>>>>         if (!vm_flush_needed && !gds_switch_needed &&
>>>> !need_pipe_sync)
>>>>                 return 0;
>>>>
>>>> +     amdgpu_ring_ib_begin(ring);
>>>>         if (ring->funcs->init_cond_exec)
>>>>                 patch_offset = amdgpu_ring_init_cond_exec(ring);
>>>>
>>>> @@ -661,6 +662,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring,
>>>> struct amdgpu_job *job,
>>>>                 amdgpu_ring_emit_switch_buffer(ring);
>>>>                 amdgpu_ring_emit_switch_buffer(ring);
>>>>         }
>>>> +     amdgpu_ring_ib_end(ring);
>>>>         return 0;
>>>>     }
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index 669532f658da..1620300f0dde 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -5619,7 +5619,7 @@ static int gfx_v9_0_ring_preempt_ib(struct
>>>> amdgpu_ring *ring)
>>>>         ring->trail_seq += 1;
>>>>         amdgpu_ring_alloc(ring, 13);
>>>>         gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
>>>> -                              ring->trail_seq,
>>>> AMDGPU_FENCE_FLAG_EXEC);
>>>> +                              ring->trail_seq,
>>>> +AMDGPU_FENCE_FLAG_EXEC
>>>> + | AMDGPU_FENCE_FLAG_INT);
>>>>         /*reset the CP_VMID_PREEMPT after trailing fence*/
>>>>         amdgpu_ring_emit_wreg(ring,
>>>>                               SOC15_REG_OFFSET(GC, 0,
>>>> mmCP_VMID_PREEMPT), @@ -6045,9 +6045,11 @@ static int
>>>> gfx_v9_0_eop_irq(struct amdgpu_device *adev,
>>>>
>>>>         switch (me_id) {
>>>>         case 0:
>>>> -             /* Fence signals are handled on the software rings*/
>>>> -             for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
>>>> -
>>>> amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
>>>> +             if
>>>> +(!amdgpu_mcbp_handle_trailing_fence_irq(&adev->gfx.muxer)) {
>>>> +                     /* Fence signals are handled on the software
>>>> +rings*/
>>>> +                     for (i = 0; i < GFX9_NUM_SW_GFX_RINGS; i++)
>>>> +
>>>> +amdgpu_fence_process(&adev->gfx.sw_gfx_ring[i]);
>>>> +             }
>>>>                 break;
>>>>         case 1:
>>>>         case 2:


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)
  2022-09-28  1:01             ` Zhu, Jiadong
@ 2022-09-28 13:52               ` Michel Dänzer
  2022-09-28 14:46                 ` Christian König
  0 siblings, 1 reply; 15+ messages in thread
From: Michel Dänzer @ 2022-09-28 13:52 UTC (permalink / raw)
  To: Zhu, Jiadong, Koenig, Christian
  Cc: Grodzovsky, Andrey, Tuikov, Luben, amd-gfx

On 2022-09-28 03:01, Zhu, Jiadong wrote:> 
> Please make sure umd is calling the libdrm function to create context with different priories,
> amdgpu_cs_ctx_create2(device_handle, AMDGPU_CTX_PRIORITY_HIGH, &context_handle).

Yes, I double-checked that, and that it returns success.


> Here is the behavior we could see:
> 1. After modprobe amdgpu, two software rings named gfx_high/gfx_low (named gfx_sw in the previous patch revision) are visible in UMR. We could check the wptr/rptr to see if they are being used.
> 2. MCBP happens while the two different priority ibs are submitted at the same time. We could check fence info as below:
> "Last signaled trailing fence" increments when the MCBP is triggered by the KMD. "Last preempted" may not increase, as the MCBP is not triggered from the CP.
> 
> --- ring 0 (gfx) ---
> Last signaled fence          0x00000001
> Last emitted                 0x00000001
> Last signaled trailing fence 0x0022eb84
> Last emitted                 0x0022eb84
> Last preempted               0x00000000
> Last reset                   0x00000000

I've now tested on this Picasso (GFX9) laptop as well. The "Last signaled trailing fence" line is changing here (seems to always match the "Last emitted" line).

However, mutter's frame rate still cannot exceed that of GPU-limited clients. BTW, you can test with a GNOME Wayland session, even without my MR referenced below. Preemption will just be less effective without that MR. mutter has used a high priority context when possible for a long time.

I'm also seeing intermittent freezes, where not even the mouse cursor moves for up to around one second, e.g. when interacting with the GNOME top bar. I'm not sure yet if these are related to this patch series, but I never noticed it before. I wonder if the freezes might occur when GPU preemption is attempted.


> From: Koenig, Christian <Christian.Koenig@amd.com>
> 
> > This work is solely for gfx9 (e.g. Vega) and older.
> > 
> > Navi has a completely separate high priority gfx queue we can use for this.

Right, but 4c7631800e6b ("drm/amd/amdgpu: add pipe1 hardware support") was for Sienna Cichlid only, and turned out to be unstable, so it had to be reverted.

It would be nice to make the SW ring solution take effect by default whenever there is no separate high priority HW gfx queue available (and any other requirements are met).


> Am 27.09.22 um 19:49 schrieb Michel Dänzer:
>> On 2022-09-27 08:06, Christian König wrote:
>>> Hey Michel,
>>>
>>> JIadong is working on exposing high/low priority gfx queues for gfx9 and older hw generations by using mid command buffer preemption.
>> Yeah, I've been keeping an eye on these patches. I'm looking forward to this working.
>>
>>
>>> I know that you have been working on Gnome Mutter to make use from userspace for this. Do you have time to run some tests with that?
>> I just tested the v8 series (first without amdgpu.mcbp=1 on the kernel command line, then with it, since I wasn't sure if it's needed) with https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.gnome.org%2FGNOME%2Fmutter%2F-%2Fmerge_requests%2F1880&amp;data=05%7C01%7Cchristian.koenig%40amd.com%7Cc6345d9230004549ba4d08daa0b0abcd%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637998977913548768%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=P1Qo2AwDmfmPrxJe2SxTFsVjdJ9vjabK8s84ZVz%2Beh8%3D&amp;reserved=0 on Navi 14.
>>
>> Unfortunately, I'm not seeing any change in behaviour. Even though mutter uses a high priority context via the EGL_IMG_context_priority extension, it's unable to reach a higher frame rate than a GPU-limited client[0]. The "Last preempted" line of /sys/kernel/debug/dri/0/amdgpu_fence_info remains at 0x00000000.
>>
>> Did I miss a step?
>>
>>
>> [0] I used the GpuTest pixmark piano & plot3d benchmarks. With an Intel iGPU, mutter can achieve a higher frame rate than plot3d, though not than pixmark piano (presumably due to limited GPU preemption granularity).


-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and Xwayland developer



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)
  2022-09-28 13:52               ` Michel Dänzer
@ 2022-09-28 14:46                 ` Christian König
  2022-09-28 15:07                   ` Michel Dänzer
  0 siblings, 1 reply; 15+ messages in thread
From: Christian König @ 2022-09-28 14:46 UTC (permalink / raw)
  To: Michel Dänzer, Zhu, Jiadong, Koenig, Christian
  Cc: Grodzovsky, Andrey, Tuikov, Luben, amd-gfx

Am 28.09.22 um 15:52 schrieb Michel Dänzer:
> On 2022-09-28 03:01, Zhu, Jiadong wrote:>
>> Please make sure umd is calling the libdrm function to create context with different priories,
>> amdgpu_cs_ctx_create2(device_handle, AMDGPU_CTX_PRIORITY_HIGH, &context_handle).
> Yes, I double-checked that, and that it returns success.
>
>
>> Here is the behavior we could see:
>> 1. After modprobe amdgpu, two software rings named gfx_high/gfx_low(in previous patch named gfx_sw) is visible in UMR. We could check the wptr/ptr to see if it is being used.
>> 2. MCBP happens while the two different priority ibs are submitted at the same time. We could check fence info as below:
>> Last signaled trailing fence++  when the mcbp triggers by kmd. Last preempted may not increase as the mcbp is not triggered from CP.
>>
>> --- ring 0 (gfx) ---
>> Last signaled fence          0x00000001
>> Last emitted                 0x00000001
>> Last signaled trailing fence 0x0022eb84
>> Last emitted                 0x0022eb84
>> Last preempted               0x00000000
>> Last reset                   0x00000000
> I've now tested on this Picasso (GFX9) laptop as well. The "Last signaled trailing fence" line is changing here (seems to always match the "Last emitted" line).
>
> However, mutter's frame rate still cannot exceed that of GPU-limited clients. BTW, you can test with a GNOME Wayland session, even without my MR referenced below. Preemption will just be less effective without that MR. mutter has used a high priority context when possible for a long time.
>
> I'm also seeing intermittent freezes, where not even the mouse cursor moves for up to around one second, e.g. when interacting with the GNOME top bar. I'm not sure yet if these are related to this patch series, but I never noticed it before. I wonder if the freezes might occur when GPU preemption is attempted.

Keep in mind that this doesn't have the same fine granularity as the 
separate hw ring buffer found on gfx10.

With MCBP we can only preempt on draw command boundary, while the 
separate hw ring solution can preempt as soon as a CU is available.

>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>
>>> This work is solely for gfx9 (e.g. Vega) and older.
>>>
>>> Navi has a completely separate high priority gfx queue we can use for this.
> Right, but 4c7631800e6b ("drm/amd/amdgpu: add pipe1 hardware support") was for Sienna Cichlid only, and turned out to be unstable, so it had to reverted.
>
> It would be nice to make the SW ring solution take effect by default whenever there is no separate high priority HW gfx queue available (and any other requirements are met).

I don't think that this will be a good idea. The hw ring buffer or even 
hw scheduler have much nicer properties and we should focus on getting 
that working when available.

Regards,
Christian.

>
>
>> Am 27.09.22 um 19:49 schrieb Michel Dänzer:
>>> On 2022-09-27 08:06, Christian König wrote:
>>>> Hey Michel,
>>>>
>>>> JIadong is working on exposing high/low priority gfx queues for gfx9 and older hw generations by using mid command buffer preemption.
>>> Yeah, I've been keeping an eye on these patches. I'm looking forward to this working.
>>>
>>>
>>>> I know that you have been working on Gnome Mutter to make use from userspace for this. Do you have time to run some tests with that?
>>> I just tested the v8 series (first without amdgpu.mcbp=1 on the kernel command line, then with it, since I wasn't sure if it's needed) with https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.gnome.org%2FGNOME%2Fmutter%2F-%2Fmerge_requests%2F1880&amp;data=05%7C01%7Cchristian.koenig%40amd.com%7Cc6345d9230004549ba4d08daa0b0abcd%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637998977913548768%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=P1Qo2AwDmfmPrxJe2SxTFsVjdJ9vjabK8s84ZVz%2Beh8%3D&amp;reserved=0 on Navi 14.
>>>
>>> Unfortunately, I'm not seeing any change in behaviour. Even though mutter uses a high priority context via the EGL_IMG_context_priority extension, it's unable to reach a higher frame rate than a GPU-limited client[0]. The "Last preempted" line of /sys/kernel/debug/dri/0/amdgpu_fence_info remains at 0x00000000.
>>>
>>> Did I miss a step?
>>>
>>>
>>> [0] I used the GpuTest pixmark piano & plot3d benchmarks. With an Intel iGPU, mutter can achieve a higher frame rate than plot3d, though not than pixmark piano (presumably due to limited GPU preemption granularity).
>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6)
  2022-09-28 14:46                 ` Christian König
@ 2022-09-28 15:07                   ` Michel Dänzer
  0 siblings, 0 replies; 15+ messages in thread
From: Michel Dänzer @ 2022-09-28 15:07 UTC (permalink / raw)
  To: Christian König, Zhu, Jiadong, Koenig, Christian
  Cc: Grodzovsky, Andrey, Tuikov, Luben, amd-gfx

On 2022-09-28 16:46, Christian König wrote:
> Am 28.09.22 um 15:52 schrieb Michel Dänzer:
>> On 2022-09-28 03:01, Zhu, Jiadong wrote:>
>>> Please make sure umd is calling the libdrm function to create context with different priories,
>>> amdgpu_cs_ctx_create2(device_handle, AMDGPU_CTX_PRIORITY_HIGH, &context_handle).
>> Yes, I double-checked that, and that it returns success.
>>
>>
>>> Here is the behavior we could see:
>>> 1. After modprobe amdgpu, two software rings named gfx_high/gfx_low(in previous patch named gfx_sw) is visible in UMR. We could check the wptr/ptr to see if it is being used.
>>> 2. MCBP happens while the two different priority ibs are submitted at the same time. We could check fence info as below:
>>> Last signaled trailing fence++  when the mcbp triggers by kmd. Last preempted may not increase as the mcbp is not triggered from CP.
>>>
>>> --- ring 0 (gfx) ---
>>> Last signaled fence          0x00000001
>>> Last emitted                 0x00000001
>>> Last signaled trailing fence 0x0022eb84
>>> Last emitted                 0x0022eb84
>>> Last preempted               0x00000000
>>> Last reset                   0x00000000
>> I've now tested on this Picasso (GFX9) laptop as well. The "Last signaled trailing fence" line is changing here (seems to always match the "Last emitted" line).
>>
>> However, mutter's frame rate still cannot exceed that of GPU-limited clients. BTW, you can test with a GNOME Wayland session, even without my MR referenced below. Preemption will just be less effective without that MR. mutter has used a high priority context when possible for a long time.
>>
>> I'm also seeing intermittent freezes, where not even the mouse cursor moves for up to around one second, e.g. when interacting with the GNOME top bar. I'm not sure yet if these are related to this patch series, but I never noticed it before. I wonder if the freezes might occur when GPU preemption is attempted.
> 
> Keep in mind that this doesn't have the same fine granularity as the separate hw ring buffer found on gfx10.
> 
> With MCBP we can only preempt on draw command boundary, while the separate hw ring solution can preempt as soon as a CU is available.

Right, but so far I haven't noticed any positive effect. That and the intermittent freezes indicate the MCBP based preemption isn't actually working as intended yet.


>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>
>>>> This work is solely for gfx9 (e.g. Vega) and older.
>>>>
>>>> Navi has a completely separate high priority gfx queue we can use for this.
>> Right, but 4c7631800e6b ("drm/amd/amdgpu: add pipe1 hardware support") was for Sienna Cichlid only, and turned out to be unstable, so it had to reverted.
>>
>> It would be nice to make the SW ring solution take effect by default whenever there is no separate high priority HW gfx queue available (and any other requirements are met).
> 
> I don't think that this will be a good idea. The hw ring buffer or even hw scheduler have much nicer properties and we should focus on getting that working when available.

Of course, the HW features should have priority. I mean as a fallback when the HW features effectively aren't available (which is currently always the case with amdgpu, even when the GPU has the HW features).


>>> Am 27.09.22 um 19:49 schrieb Michel Dänzer:
>>>> On 2022-09-27 08:06, Christian König wrote:
>>>>> Hey Michel,
>>>>>
>>>>> JIadong is working on exposing high/low priority gfx queues for gfx9 and older hw generations by using mid command buffer preemption.
>>>> Yeah, I've been keeping an eye on these patches. I'm looking forward to this working.
>>>>
>>>>
>>>>> I know that you have been working on Gnome Mutter to make use from userspace for this. Do you have time to run some tests with that?
>>>> I just tested the v8 series (first without amdgpu.mcbp=1 on the kernel command line, then with it, since I wasn't sure if it's needed) with https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.gnome.org%2FGNOME%2Fmutter%2F-%2Fmerge_requests%2F1880&amp;data=05%7C01%7Cchristian.koenig%40amd.com%7Cc6345d9230004549ba4d08daa0b0abcd%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637998977913548768%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=P1Qo2AwDmfmPrxJe2SxTFsVjdJ9vjabK8s84ZVz%2Beh8%3D&amp;reserved=0 on Navi 14.
>>>>
>>>> Unfortunately, I'm not seeing any change in behaviour. Even though mutter uses a high priority context via the EGL_IMG_context_priority extension, it's unable to reach a higher frame rate than a GPU-limited client[0]. The "Last preempted" line of /sys/kernel/debug/dri/0/amdgpu_fence_info remains at 0x00000000.
>>>>
>>>> Did I miss a step?
>>>>
>>>>
>>>> [0] I used the GpuTest pixmark piano & plot3d benchmarks. With an Intel iGPU, mutter can achieve a higher frame rate than plot3d, though not than pixmark piano (presumably due to limited GPU preemption granularity).
>>
> 

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and Xwayland developer


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2022-09-28 15:07 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-23 13:16 [PATCH 1/4] drm/amdgpu: Introduce gfx software ring (v7) jiadong.zhu
2022-09-23 13:16 ` [PATCH 2/4] drm/amdgpu: Add software ring callbacks for gfx9 (v6) jiadong.zhu
2022-09-26  6:43   ` Christian König
2022-09-23 13:16 ` [PATCH 3/4] drm/amdgpu: Modify unmap_queue format for gfx9 (v4) jiadong.zhu
2022-09-23 13:16 ` [PATCH 4/4] drm/amdgpu: MCBP based on DRM scheduler (v6) jiadong.zhu
2022-09-26  6:49   ` Christian König
2022-09-27  3:18     ` Zhu, Jiadong
2022-09-27  6:06       ` Christian König
2022-09-27 17:49         ` Michel Dänzer
2022-09-27 18:20           ` Christian König
2022-09-28  1:01             ` Zhu, Jiadong
2022-09-28 13:52               ` Michel Dänzer
2022-09-28 14:46                 ` Christian König
2022-09-28 15:07                   ` Michel Dänzer
2022-09-26  6:38 ` [PATCH 1/4] drm/amdgpu: Introduce gfx software ring (v7) Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.