All of lore.kernel.org
 help / color / mirror / Atom feed
* [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL.
@ 2023-11-21 10:09 Himal Prasad Ghimiray
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 1/6] drm/xe/xe2: Support flat ccs Himal Prasad Ghimiray
                   ` (6 more replies)
  0 siblings, 7 replies; 28+ messages in thread
From: Himal Prasad Ghimiray @ 2023-11-21 10:09 UTC (permalink / raw)
  To: intel-xe

Series enables flat ccs on xe2 igfx platforms and handles clearing and copy 
of ccs metadata during bo creation and xe_bo_move respectively.

Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>

Himal Prasad Ghimiray (6):
  drm/xe/xe2: Support flat ccs.
  drm/xe/xe2: Determine bios enablement for flat ccs on igfx.
  drm/xe/xe2: Allocate extra pages for ccs during bo create.
  drm/xe/xe2: Updates on XY_CTRL_SURF_COPY_BLT
  drm/xe/xe_migrate.c: Use NULL 1G PTE mapped at 255GiB VA for ccs
    clear.
  drm/xe/xe2: Handle flat ccs move for igfx.

 drivers/gpu/drm/xe/regs/xe_gpu_commands.h |   3 +-
 drivers/gpu/drm/xe/regs/xe_gt_regs.h      |   3 +
 drivers/gpu/drm/xe/xe_bo.c                |  40 ++++---
 drivers/gpu/drm/xe/xe_bo_types.h          |   2 +
 drivers/gpu/drm/xe/xe_device.c            |  32 ++++-
 drivers/gpu/drm/xe/xe_migrate.c           | 139 +++++++++-------------
 drivers/gpu/drm/xe/xe_pci.c               |   2 +-
 7 files changed, 119 insertions(+), 102 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [Intel-xe] [RFC v2 1/6] drm/xe/xe2: Support flat ccs.
  2023-11-21 10:09 [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL Himal Prasad Ghimiray
@ 2023-11-21 10:09 ` Himal Prasad Ghimiray
  2023-11-23 16:28   ` Matthew Auld
  2023-11-24 11:02   ` Thomas Hellström
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 2/6] drm/xe/xe2: Determine bios enablement for flat ccs on igfx Himal Prasad Ghimiray
                   ` (5 subsequent siblings)
  6 siblings, 2 replies; 28+ messages in thread
From: Himal Prasad Ghimiray @ 2023-11-21 10:09 UTC (permalink / raw)
  To: intel-xe

Enable flat ccs for XE2_GFX_FEATURES.

Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
---
 drivers/gpu/drm/xe/xe_pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index 682ba188e456..d9754f3d45f7 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -157,7 +157,7 @@ static const struct xe_graphics_desc graphics_xelpg = {
 #define XE2_GFX_FEATURES \
 	.dma_mask_size = 46, \
 	.has_asid = 1, \
-	.has_flat_ccs = 0 /* FIXME: implementation missing */, \
+	.has_flat_ccs = 1, \
 	.has_range_tlb_invalidation = 1, \
 	.supports_usm = 0 /* FIXME: implementation missing */, \
 	.va_bits = 48, \
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [Intel-xe] [RFC v2 2/6] drm/xe/xe2: Determine bios enablement for flat ccs on igfx.
  2023-11-21 10:09 [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL Himal Prasad Ghimiray
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 1/6] drm/xe/xe2: Support flat ccs Himal Prasad Ghimiray
@ 2023-11-21 10:09 ` Himal Prasad Ghimiray
  2023-11-23 16:37   ` Matthew Auld
  2023-11-24 11:05   ` Thomas Hellström
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 3/6] drm/xe/xe2: Allocate extra pages for ccs during bo create Himal Prasad Ghimiray
                   ` (4 subsequent siblings)
  6 siblings, 2 replies; 28+ messages in thread
From: Himal Prasad Ghimiray @ 2023-11-21 10:09 UTC (permalink / raw)
  To: intel-xe

If bios disables flat ccs on igfx  make has_flat_ccs as 0 and notify
via drm_info.

Bspec:59255

Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
---
 drivers/gpu/drm/xe/regs/xe_gt_regs.h |  3 +++
 drivers/gpu/drm/xe/xe_device.c       | 30 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
index cc27fe8fc363..b642301947f5 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
@@ -142,6 +142,9 @@
 #define XEHP_SLICE_COMMON_ECO_CHICKEN1		XE_REG_MCR(0x731c, XE_REG_OPTION_MASKED)
 #define   MSC_MSAA_REODER_BUF_BYPASS_DISABLE	REG_BIT(14)
 
+#define XE2_FLAT_CCS_BASE_RANGE_LOWER		XE_REG_MCR(0x8800)
+#define   XE2_FLAT_CCS_ENABLE			REG_BIT(0)
+
 #define VF_PREEMPTION				XE_REG(0x83a4, XE_REG_OPTION_MASKED)
 #define   PREEMPTION_VERTEX_COUNT		REG_GENMASK(15, 0)
 
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 8be765adf702..07a3e4cf48d1 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -16,6 +16,7 @@
 #include <drm/xe_drm.h>
 
 #include "regs/xe_regs.h"
+#include "regs/xe_gt_regs.h"
 #include "xe_bo.h"
 #include "xe_debugfs.h"
 #include "xe_display.h"
@@ -25,6 +26,7 @@
 #include "xe_exec_queue.h"
 #include "xe_exec.h"
 #include "xe_gt.h"
+#include "xe_gt_mcr.h"
 #include "xe_irq.h"
 #include "xe_mmio.h"
 #include "xe_module.h"
@@ -342,6 +344,29 @@ static void xe_device_sanitize(struct drm_device *drm, void *arg)
 		xe_gt_sanitize(gt);
 }
 
+static int xe_device_set_has_flat_ccs(struct  xe_device *xe)
+{
+	u32 reg;
+	int err;
+
+	if (IS_DGFX(xe) || GRAPHICS_VER(xe) < 20 || !xe->info.has_flat_ccs)
+		return 0;
+
+	struct xe_gt *gt = xe_root_mmio_gt(xe);
+
+	err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
+	if (err)
+		return err;
+
+	reg = xe_gt_mcr_unicast_read_any(gt, XE2_FLAT_CCS_BASE_RANGE_LOWER);
+	xe->info.has_flat_ccs = (reg & XE2_FLAT_CCS_ENABLE);
+
+	if (!xe->info.has_flat_ccs)
+		drm_info(&xe->drm,
+			 "Flat CCS has been disabled in bios, May lead to performance impact");
+	return 0;
+}
+
 int xe_device_probe(struct xe_device *xe)
 {
 	struct xe_tile *tile;
@@ -352,6 +377,7 @@ int xe_device_probe(struct xe_device *xe)
 	xe_pat_init_early(xe);
 
 	xe->info.mem_region_mask = 1;
+
 	err = xe_display_init_nommio(xe);
 	if (err)
 		return err;
@@ -390,6 +416,10 @@ int xe_device_probe(struct xe_device *xe)
 			goto err_irq_shutdown;
 	}
 
+	err = xe_device_set_has_flat_ccs(xe);
+	if (err)
+		return err;
+
 	err = xe_mmio_probe_vram(xe);
 	if (err)
 		goto err_irq_shutdown;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [Intel-xe] [RFC v2 3/6] drm/xe/xe2: Allocate extra pages for ccs during bo create.
  2023-11-21 10:09 [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL Himal Prasad Ghimiray
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 1/6] drm/xe/xe2: Support flat ccs Himal Prasad Ghimiray
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 2/6] drm/xe/xe2: Determine bios enablement for flat ccs on igfx Himal Prasad Ghimiray
@ 2023-11-21 10:09 ` Himal Prasad Ghimiray
  2023-11-24 11:09   ` Thomas Hellström
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 4/6] drm/xe/xe2: Updates on XY_CTRL_SURF_COPY_BLT Himal Prasad Ghimiray
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: Himal Prasad Ghimiray @ 2023-11-21 10:09 UTC (permalink / raw)
  To: intel-xe

Each byte of CCS data now represents 512 bytes of main memory data.
Allocate extra pages to handle ccs region for igfx too.

Bspec:58796

Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
---
 drivers/gpu/drm/xe/regs/xe_gpu_commands.h |  2 +-
 drivers/gpu/drm/xe/xe_bo.c                | 15 ++++++---------
 drivers/gpu/drm/xe/xe_device.c            |  2 +-
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
index 4402f72481dc..7f74592f99ce 100644
--- a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
+++ b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
@@ -16,7 +16,7 @@
 #define   XY_CTRL_SURF_MOCS_MASK	GENMASK(31, 26)
 #define   XE2_XY_CTRL_SURF_MOCS_INDEX_MASK	GENMASK(31, 28)
 #define   NUM_CCS_BYTES_PER_BLOCK	256
-#define   NUM_BYTES_PER_CCS_BYTE	256
+#define   NUM_BYTES_PER_CCS_BYTE(_xe)	(GRAPHICS_VER(_xe) >= 20 ? 512 : 256)
 #define   NUM_CCS_BLKS_PER_XFER		1024
 
 #define XY_FAST_COLOR_BLT_CMD		(2 << 29 | 0x44 << 22)
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 4305f5cbc2ab..4730ee3c1012 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -2074,19 +2074,16 @@ int xe_bo_evict(struct xe_bo *bo, bool force_alloc)
  * placed in system memory.
  * @bo: The xe_bo
  *
- * If a bo has an allowable placement in XE_PL_TT memory, it can't use
- * flat CCS compression, because the GPU then has no way to access the
- * CCS metadata using relevant commands. For the opposite case, we need to
- * allocate storage for the CCS metadata when the BO is not resident in
- * VRAM memory.
- *
  * Return: true if extra pages need to be allocated, false otherwise.
  */
 bool xe_bo_needs_ccs_pages(struct xe_bo *bo)
 {
-	return bo->ttm.type == ttm_bo_type_device &&
-		!(bo->flags & XE_BO_CREATE_SYSTEM_BIT) &&
-		(bo->flags & XE_BO_CREATE_VRAM_MASK);
+	struct xe_device *xe = xe_bo_device(bo);
+
+	return (xe_device_has_flat_ccs(xe) &&
+		bo->ttm.type == ttm_bo_type_device &&
+		((IS_DGFX(xe) && (bo->flags & XE_BO_CREATE_VRAM_MASK)) ||
+		(!IS_DGFX(xe) && (bo->flags & XE_BO_CREATE_SYSTEM_BIT))));
 }
 
 /**
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 07a3e4cf48d1..265f9ffc5323 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -551,7 +551,7 @@ void xe_device_wmb(struct xe_device *xe)
 u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size)
 {
 	return xe_device_has_flat_ccs(xe) ?
-		DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE) : 0;
+		DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE(xe)) : 0;
 }
 
 bool xe_device_mem_access_ongoing(struct xe_device *xe)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [Intel-xe] [RFC v2 4/6] drm/xe/xe2: Updates on XY_CTRL_SURF_COPY_BLT
  2023-11-21 10:09 [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL Himal Prasad Ghimiray
                   ` (2 preceding siblings ...)
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 3/6] drm/xe/xe2: Allocate extra pages for ccs during bo create Himal Prasad Ghimiray
@ 2023-11-21 10:09 ` Himal Prasad Ghimiray
  2023-11-24 11:11   ` Thomas Hellström
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 5/6] drm/xe/xe_migrate.c: Use NULL 1G PTE mapped at 255GiB VA for ccs clear Himal Prasad Ghimiray
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: Himal Prasad Ghimiray @ 2023-11-21 10:09 UTC (permalink / raw)
  To: intel-xe

- The XY_CTRL_SURF_COPY_BLT instruction operationg on ccs data expects
size in pages of main memory for which CCS data should be copied.
- The bitfield representing copy size in XY_CTRL_SURF_COPY_BLT has
shifted one bit higher in the instruction.

Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
---
 drivers/gpu/drm/xe/regs/xe_gpu_commands.h |  1 +
 drivers/gpu/drm/xe/xe_migrate.c           | 23 +++++++++++++++++------
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
index 7f74592f99ce..47459841aa69 100644
--- a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
+++ b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
@@ -13,6 +13,7 @@
 #define   DST_ACCESS_TYPE_SHIFT		20
 #define   CCS_SIZE_MASK			0x3FF
 #define   CCS_SIZE_SHIFT		8
+#define   XE2_CCS_SIZE_SHIFT		9
 #define   XY_CTRL_SURF_MOCS_MASK	GENMASK(31, 26)
 #define   XE2_XY_CTRL_SURF_MOCS_INDEX_MASK	GENMASK(31, 28)
 #define   NUM_CCS_BYTES_PER_BLOCK	256
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index ed2f3f5109f3..06706fad67aa 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -523,21 +523,31 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
 	struct xe_device *xe = gt_to_xe(gt);
 	u32 *cs = bb->cs + bb->len;
 	u32 num_ccs_blks;
+	u32 num_ccs_pages;
+	u32 ccs_copy_size;
 	u32 mocs;
 
-	num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
-				    NUM_CCS_BYTES_PER_BLOCK);
-	xe_gt_assert(gt, num_ccs_blks <= NUM_CCS_BLKS_PER_XFER);
+	if (GRAPHICS_VERx100(xe) >= 2000) {
+		num_ccs_pages = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
+					     XE_PAGE_SIZE);
+		xe_gt_assert(gt, num_ccs_pages <= 1024);
 
-	if (GRAPHICS_VERx100(xe) >= 2000)
+		ccs_copy_size = ((num_ccs_pages - 1) & CCS_SIZE_MASK) << XE2_CCS_SIZE_SHIFT;
 		mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index);
-	else
+
+	} else {
+		num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
+					    NUM_CCS_BYTES_PER_BLOCK);
+		xe_gt_assert(gt, num_ccs_blks <= NUM_CCS_BLKS_PER_XFER);
+
+		ccs_copy_size = ((num_ccs_blks - 1) & CCS_SIZE_MASK) << CCS_SIZE_SHIFT;
 		mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
+	}
 
 	*cs++ = XY_CTRL_SURF_COPY_BLT |
 		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
 		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
-		((num_ccs_blks - 1) & CCS_SIZE_MASK) << CCS_SIZE_SHIFT;
+		ccs_copy_size;
 	*cs++ = lower_32_bits(src_ofs);
 	*cs++ = upper_32_bits(src_ofs) | mocs;
 	*cs++ = lower_32_bits(dst_ofs);
@@ -992,6 +1002,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 
 		emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE,
 			   clear_vram);
+
 		if (xe_device_has_flat_ccs(xe) && clear_vram) {
 			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
 				      m->cleared_vram_ofs, false, clear_L0);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [Intel-xe] [RFC v2 5/6] drm/xe/xe_migrate.c: Use NULL 1G PTE mapped at 255GiB VA for ccs clear.
  2023-11-21 10:09 [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL Himal Prasad Ghimiray
                   ` (3 preceding siblings ...)
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 4/6] drm/xe/xe2: Updates on XY_CTRL_SURF_COPY_BLT Himal Prasad Ghimiray
@ 2023-11-21 10:09 ` Himal Prasad Ghimiray
  2023-11-24 11:19   ` Thomas Hellström
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 6/6] drm/xe/xe2: Handle flat ccs move for igfx Himal Prasad Ghimiray
  2023-11-28 13:36 ` [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL Thomas Hellström
  6 siblings, 1 reply; 28+ messages in thread
From: Himal Prasad Ghimiray @ 2023-11-21 10:09 UTC (permalink / raw)
  To: intel-xe

Get rid of the cleared bo, instead use null 1G PTE mapped at 255GiB
offset, this can be used for both dgfx and igfx.

Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
---
 drivers/gpu/drm/xe/xe_migrate.c | 56 ++++++---------------------------
 1 file changed, 10 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index 06706fad67aa..bdcb20f23531 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -54,8 +54,8 @@ struct xe_migrate {
 	u64 batch_base_ofs;
 	/** @usm_batch_base_ofs: VM offset of the usm batch buffer */
 	u64 usm_batch_base_ofs;
-	/** @cleared_vram_ofs: VM offset of @cleared_bo. */
-	u64 cleared_vram_ofs;
+	/** @cleared_mem_ofs: VM offset of @cleared_bo. */
+	u64 cleared_mem_ofs;
 	/**
 	 * @fence: dma-fence representing the last migration job batch.
 	 * Protected by @job_mutex.
@@ -125,41 +125,6 @@ static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr)
 	return addr + (256ULL << xe_pt_shift(2));
 }
 
-/*
- * For flat CCS clearing we need a cleared chunk of memory to copy from,
- * since the CCS clearing mode of XY_FAST_COLOR_BLT appears to be buggy
- * (it clears on only 14 bytes in each chunk of 16).
- * If clearing the main surface one can use the part of the main surface
- * already cleared, but for clearing as part of copying non-compressed
- * data out of system memory, we don't readily have a cleared part of
- * VRAM to copy from, so create one to use for that case.
- */
-static int xe_migrate_create_cleared_bo(struct xe_migrate *m, struct xe_vm *vm)
-{
-	struct xe_tile *tile = m->tile;
-	struct xe_device *xe = vm->xe;
-	size_t cleared_size;
-	u64 vram_addr;
-
-	if (!xe_device_has_flat_ccs(xe))
-		return 0;
-
-	cleared_size = xe_device_ccs_bytes(xe, MAX_PREEMPTDISABLE_TRANSFER);
-	cleared_size = PAGE_ALIGN(cleared_size);
-	m->cleared_bo = xe_bo_create_pin_map(xe, tile, vm, cleared_size,
-					     ttm_bo_type_kernel,
-					     XE_BO_CREATE_VRAM_IF_DGFX(tile) |
-					     XE_BO_CREATE_PINNED_BIT);
-	if (IS_ERR(m->cleared_bo))
-		return PTR_ERR(m->cleared_bo);
-
-	xe_map_memset(xe, &m->cleared_bo->vmap, 0, 0x00, cleared_size);
-	vram_addr = xe_bo_addr(m->cleared_bo, 0, XE_PAGE_SIZE);
-	m->cleared_vram_ofs = xe_migrate_vram_ofs(xe, vram_addr);
-
-	return 0;
-}
-
 static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
 				 struct xe_vm *vm)
 {
@@ -170,7 +135,6 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
 	u32 map_ofs, level, i;
 	struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
 	u64 entry;
-	int ret;
 
 	/* Can't bump NUM_PT_SLOTS too high */
 	BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
@@ -190,12 +154,6 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
 	if (IS_ERR(bo))
 		return PTR_ERR(bo);
 
-	ret = xe_migrate_create_cleared_bo(m, vm);
-	if (ret) {
-		xe_bo_put(bo);
-		return ret;
-	}
-
 	entry = vm->pt_ops->pde_encode_bo(bo, bo->size - XE_PAGE_SIZE, pat_index);
 	xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);
 
@@ -262,6 +220,12 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
 			  (i + 1) * 8, u64, entry);
 	}
 
+	level = 2;
+	xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64,
+		  vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0)
+		  | XE_PTE_NULL);
+	m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));
+
 	/* Identity map the entire vram at 256GiB offset */
 	if (IS_DGFX(xe)) {
 		u64 pos, ofs, flags;
@@ -616,7 +580,7 @@ static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
 		 * Otherwise if the bo doesn't have any CCS metadata attached,
 		 * we still need to clear it for security reasons.
 		 */
-		u64 ccs_src_ofs =  src_is_vram ? src_ofs : m->cleared_vram_ofs;
+		u64 ccs_src_ofs =  src_is_vram ? src_ofs : m->cleared_mem_ofs;
 
 		emit_copy_ccs(gt, bb,
 			      dst_ofs, true,
@@ -1005,7 +969,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 
 		if (xe_device_has_flat_ccs(xe) && clear_vram) {
 			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
-				      m->cleared_vram_ofs, false, clear_L0);
+				      m->cleared_mem_ofs, false, clear_L0);
 			flush_flags = MI_FLUSH_DW_CCS;
 		}
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [Intel-xe] [RFC v2 6/6] drm/xe/xe2: Handle flat ccs move for igfx.
  2023-11-21 10:09 [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL Himal Prasad Ghimiray
                   ` (4 preceding siblings ...)
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 5/6] drm/xe/xe_migrate.c: Use NULL 1G PTE mapped at 255GiB VA for ccs clear Himal Prasad Ghimiray
@ 2023-11-21 10:09 ` Himal Prasad Ghimiray
  2023-11-24 15:48   ` Thomas Hellström
  2023-11-28 13:36 ` [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL Thomas Hellström
  6 siblings, 1 reply; 28+ messages in thread
From: Himal Prasad Ghimiray @ 2023-11-21 10:09 UTC (permalink / raw)
  To: intel-xe

- Clear flat ccs during user bo creation.
- copy ccs meta data between flat ccs and bo during eviction and
restore.
- Add a bool field ccs_cleared in bo, true means ccs region of bo is
already cleared.

Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
---
 drivers/gpu/drm/xe/xe_bo.c       | 25 ++++++++-----
 drivers/gpu/drm/xe/xe_bo_types.h |  2 ++
 drivers/gpu/drm/xe/xe_migrate.c  | 62 ++++++++++++++++----------------
 3 files changed, 50 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 4730ee3c1012..a40f17ae21e7 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -630,10 +630,12 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 	bool move_lacks_source;
 	bool tt_has_data;
 	bool needs_clear;
+	bool handle_system_ccs = (!IS_DGFX(xe) && xe_bo_needs_ccs_pages(bo) &&
+				  ttm && ttm_tt_is_populated(ttm)) ? true : false;
 	int ret = 0;
-
-	/* Bo creation path, moving to system or TT. No clearing required. */
-	if (!old_mem && ttm) {
+	/* Bo creation path, moving to system or TT. */
+	if (((old_mem_type == XE_PL_SYSTEM && new_mem->mem_type == XE_PL_TT) ||
+	     (!old_mem && ttm)) && !handle_system_ccs) {
 		ttm_bo_move_null(ttm_bo, new_mem);
 		return 0;
 	}
@@ -648,14 +650,13 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 	tt_has_data = ttm && (ttm_tt_is_populated(ttm) ||
 			      (ttm->page_flags & TTM_TT_FLAG_SWAPPED));
 
-	move_lacks_source = !mem_type_is_vram(old_mem_type) && !tt_has_data;
+	move_lacks_source = handle_system_ccs ? (!bo->ccs_cleared)  :
+						(!mem_type_is_vram(old_mem_type) && !tt_has_data);
 
 	needs_clear = (ttm && ttm->page_flags & TTM_TT_FLAG_ZERO_ALLOC) ||
 		(!ttm && ttm_bo->type == ttm_bo_type_device);
 
-	if ((move_lacks_source && !needs_clear) ||
-	    (old_mem_type == XE_PL_SYSTEM &&
-	     new_mem->mem_type == XE_PL_TT)) {
+	if ((move_lacks_source && !needs_clear)) {
 		ttm_bo_move_null(ttm_bo, new_mem);
 		goto out;
 	}
@@ -686,8 +687,11 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 			ret = timeout;
 			goto out;
 		}
-		ttm_bo_move_null(ttm_bo, new_mem);
-		goto out;
+
+		if (!handle_system_ccs) {
+			ttm_bo_move_null(ttm_bo, new_mem);
+			goto out;
+		}
 	}
 
 	if (!move_lacks_source &&
@@ -708,6 +712,8 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 		migrate = mem_type_to_migrate(xe, new_mem->mem_type);
 	else if (mem_type_is_vram(old_mem_type))
 		migrate = mem_type_to_migrate(xe, old_mem_type);
+	else
+		migrate = xe->tiles[0].migrate;
 
 	xe_assert(xe, migrate);
 
@@ -1229,6 +1235,7 @@ struct xe_bo *__xe_bo_create_locked(struct xe_device *xe, struct xe_bo *bo,
 		alignment = SZ_4K >> PAGE_SHIFT;
 	}
 
+	bo->ccs_cleared = false;
 	bo->tile = tile;
 	bo->size = size;
 	bo->flags = flags;
diff --git a/drivers/gpu/drm/xe/xe_bo_types.h b/drivers/gpu/drm/xe/xe_bo_types.h
index 4bff60996168..508e67c81427 100644
--- a/drivers/gpu/drm/xe/xe_bo_types.h
+++ b/drivers/gpu/drm/xe/xe_bo_types.h
@@ -79,6 +79,8 @@ struct xe_bo {
 	struct llist_node freed;
 	/** @created: Whether the bo has passed initial creation */
 	bool created;
+	/** @ccs_cleared */
+	bool ccs_cleared;
 };
 
 #define intel_bo_to_drm_bo(bo) (&(bo)->ttm.base)
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index bdcb20f23531..bac24768fe2a 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -564,14 +564,14 @@ static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
 
 static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
 			       struct xe_bb *bb,
-			       u64 src_ofs, bool src_is_vram,
-			       u64 dst_ofs, bool dst_is_vram, u32 dst_size,
+			       u64 src_ofs, bool src_is_indirect,
+			       u64 dst_ofs, bool dst_is_indirect, u32 dst_size,
 			       u64 ccs_ofs, bool copy_ccs)
 {
 	struct xe_gt *gt = m->tile->primary_gt;
 	u32 flush_flags = 0;
 
-	if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs && dst_is_vram) {
+	if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs && dst_is_indirect) {
 		/*
 		 * If the src is already in vram, then it should already
 		 * have been cleared by us, or has been populated by the
@@ -580,28 +580,24 @@ static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
 		 * Otherwise if the bo doesn't have any CCS metadata attached,
 		 * we still need to clear it for security reasons.
 		 */
-		u64 ccs_src_ofs =  src_is_vram ? src_ofs : m->cleared_mem_ofs;
+		u64 ccs_src_ofs =  src_is_indirect ? src_ofs : m->cleared_mem_ofs;
 
 		emit_copy_ccs(gt, bb,
 			      dst_ofs, true,
-			      ccs_src_ofs, src_is_vram, dst_size);
+			      ccs_src_ofs, src_is_indirect, dst_size);
 
 		flush_flags = MI_FLUSH_DW_CCS;
 	} else if (copy_ccs) {
-		if (!src_is_vram)
+		if (!src_is_indirect)
 			src_ofs = ccs_ofs;
-		else if (!dst_is_vram)
+		else if (!dst_is_indirect)
 			dst_ofs = ccs_ofs;
 
-		/*
-		 * At the moment, we don't support copying CCS metadata from
-		 * system to system.
-		 */
-		xe_gt_assert(gt, src_is_vram || dst_is_vram);
+		xe_gt_assert(gt, src_is_indirect || dst_is_indirect);
 
-		emit_copy_ccs(gt, bb, dst_ofs, dst_is_vram, src_ofs,
-			      src_is_vram, dst_size);
-		if (dst_is_vram)
+		emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect, src_ofs,
+			      src_is_indirect, dst_size);
+		if (dst_is_indirect)
 			flush_flags = MI_FLUSH_DW_CCS;
 	}
 
@@ -642,6 +638,8 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
 	u64 src_L0, dst_L0;
 	int pass = 0;
 	int err;
+	bool src_is_pltt = src->mem_type == XE_PL_TT;
+	bool dst_is_pltt = dst->mem_type == XE_PL_TT;
 	bool src_is_vram = mem_type_is_vram(src->mem_type);
 	bool dst_is_vram = mem_type_is_vram(dst->mem_type);
 	bool copy_ccs = xe_device_has_flat_ccs(xe) &&
@@ -682,9 +680,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
 		src_L0 = xe_migrate_res_sizes(&src_it);
 		dst_L0 = xe_migrate_res_sizes(&dst_it);
 
-		drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
-			pass++, src_L0, dst_L0);
-
+		drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n", pass++, src_L0, dst_L0);
 		src_L0 = min(src_L0, dst_L0);
 
 		batch_size += pte_update_size(m, src_is_vram, src, &src_it, &src_L0,
@@ -704,8 +700,8 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
 		}
 
 		/* Add copy commands size here */
-		batch_size += EMIT_COPY_DW +
-			(xe_device_has_flat_ccs(xe) ? EMIT_COPY_CCS_DW : 0);
+		batch_size += ((!src_is_vram && !dst_is_vram) ? 0 : EMIT_COPY_DW) +
+			((xe_device_has_flat_ccs(xe) ? EMIT_COPY_CCS_DW : 0));
 
 		bb = xe_bb_new(gt, batch_size, usm);
 		if (IS_ERR(bb)) {
@@ -731,10 +727,13 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
 		update_idx = bb->len;
 
-		emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0,
-			  XE_PAGE_SIZE);
-		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_vram,
-						  dst_L0_ofs, dst_is_vram,
+		if (src_is_vram || dst_is_vram)
+			emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE);
+
+		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
+						  IS_DGFX(xe) ? src_is_vram : src_is_pltt,
+						  dst_L0_ofs,
+						  IS_DGFX(xe) ? dst_is_vram : dst_is_pltt,
 						  src_L0, ccs_ofs, copy_ccs);
 
 		mutex_lock(&m->job_mutex);
@@ -907,6 +906,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 	bool clear_vram = mem_type_is_vram(dst->mem_type);
 	struct xe_gt *gt = m->tile->primary_gt;
 	struct xe_device *xe = gt_to_xe(gt);
+	bool clear_system_ccs = (xe_bo_needs_ccs_pages(bo) && !IS_DGFX(xe)) ? true : false;
 	struct dma_fence *fence = NULL;
 	u64 size = bo->size;
 	struct xe_res_cursor src_it;
@@ -936,9 +936,9 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 		batch_size = 2 +
 			pte_update_size(m, clear_vram, src, &src_it,
 					&clear_L0, &clear_L0_ofs, &clear_L0_pt,
-					emit_clear_cmd_len(gt), 0,
+					clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0,
 					NUM_PT_PER_BLIT);
-		if (xe_device_has_flat_ccs(xe) && clear_vram)
+		if (xe_bo_needs_ccs_pages(bo))
 			batch_size += EMIT_COPY_CCS_DW;
 
 		/* Clear commands */
@@ -953,7 +953,6 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 		}
 
 		size -= clear_L0;
-
 		/* Preemption is enabled again by the ring ops. */
 		if (!clear_vram) {
 			emit_pte(m, bb, clear_L0_pt, clear_vram, &src_it, clear_L0,
@@ -964,10 +963,10 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
 		update_idx = bb->len;
 
-		emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE,
-			   clear_vram);
+		if (!clear_system_ccs)
+			emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);
 
-		if (xe_device_has_flat_ccs(xe) && clear_vram) {
+		if (xe_bo_needs_ccs_pages(bo)) {
 			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
 				      m->cleared_mem_ofs, false, clear_L0);
 			flush_flags = MI_FLUSH_DW_CCS;
@@ -1024,6 +1023,9 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 		return ERR_PTR(err);
 	}
 
+	if (clear_system_ccs)
+		bo->ccs_cleared = true;
+
 	return fence;
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 1/6] drm/xe/xe2: Support flat ccs.
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 1/6] drm/xe/xe2: Support flat ccs Himal Prasad Ghimiray
@ 2023-11-23 16:28   ` Matthew Auld
  2023-11-27  3:04     ` Ghimiray, Himal Prasad
  2023-11-24 11:02   ` Thomas Hellström
  1 sibling, 1 reply; 28+ messages in thread
From: Matthew Auld @ 2023-11-23 16:28 UTC (permalink / raw)
  To: Himal Prasad Ghimiray; +Cc: intel-xe

On Tue, 21 Nov 2023 at 10:01, Himal Prasad Ghimiray
<himal.prasad.ghimiray@intel.com> wrote:
>
> Enable flat ccs for XE2_GFX_FEATURES.
>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>

I guess this should rather be the final patch in the series, once we
have it working?

> ---
>  drivers/gpu/drm/xe/xe_pci.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
> index 682ba188e456..d9754f3d45f7 100644
> --- a/drivers/gpu/drm/xe/xe_pci.c
> +++ b/drivers/gpu/drm/xe/xe_pci.c
> @@ -157,7 +157,7 @@ static const struct xe_graphics_desc graphics_xelpg = {
>  #define XE2_GFX_FEATURES \
>         .dma_mask_size = 46, \
>         .has_asid = 1, \
> -       .has_flat_ccs = 0 /* FIXME: implementation missing */, \
> +       .has_flat_ccs = 1, \
>         .has_range_tlb_invalidation = 1, \
>         .supports_usm = 0 /* FIXME: implementation missing */, \
>         .va_bits = 48, \
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 2/6] drm/xe/xe2: Determine bios enablement for flat ccs on igfx.
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 2/6] drm/xe/xe2: Determine bios enablement for flat ccs on igfx Himal Prasad Ghimiray
@ 2023-11-23 16:37   ` Matthew Auld
  2023-11-23 17:01     ` Matthew Auld
  2023-11-24 11:05   ` Thomas Hellström
  1 sibling, 1 reply; 28+ messages in thread
From: Matthew Auld @ 2023-11-23 16:37 UTC (permalink / raw)
  To: Himal Prasad Ghimiray; +Cc: intel-xe

On Tue, 21 Nov 2023 at 10:01, Himal Prasad Ghimiray
<himal.prasad.ghimiray@intel.com> wrote:
>
> If bios disables flat ccs on igfx  make has_flat_ccs as 0 and notify
> via drm_info.
>
> Bspec:59255
>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>

Huh, I wonder if that explains why on some particular LNL DUTs
compression just didn't seem to "work" for me.

So should this information be passed along to userspace via some query
uAPI? Maybe there is some tradeoff between enabling compression and
other features (like coherency), and so knowing this might be useful?

> ---
>  drivers/gpu/drm/xe/regs/xe_gt_regs.h |  3 +++
>  drivers/gpu/drm/xe/xe_device.c       | 30 ++++++++++++++++++++++++++++
>  2 files changed, 33 insertions(+)
>
> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> index cc27fe8fc363..b642301947f5 100644
> --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> @@ -142,6 +142,9 @@
>  #define XEHP_SLICE_COMMON_ECO_CHICKEN1         XE_REG_MCR(0x731c, XE_REG_OPTION_MASKED)
>  #define   MSC_MSAA_REODER_BUF_BYPASS_DISABLE   REG_BIT(14)
>
> +#define XE2_FLAT_CCS_BASE_RANGE_LOWER          XE_REG_MCR(0x8800)
> +#define   XE2_FLAT_CCS_ENABLE                  REG_BIT(0)
> +
>  #define VF_PREEMPTION                          XE_REG(0x83a4, XE_REG_OPTION_MASKED)
>  #define   PREEMPTION_VERTEX_COUNT              REG_GENMASK(15, 0)
>
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 8be765adf702..07a3e4cf48d1 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -16,6 +16,7 @@
>  #include <drm/xe_drm.h>
>
>  #include "regs/xe_regs.h"
> +#include "regs/xe_gt_regs.h"
>  #include "xe_bo.h"
>  #include "xe_debugfs.h"
>  #include "xe_display.h"
> @@ -25,6 +26,7 @@
>  #include "xe_exec_queue.h"
>  #include "xe_exec.h"
>  #include "xe_gt.h"
> +#include "xe_gt_mcr.h"
>  #include "xe_irq.h"
>  #include "xe_mmio.h"
>  #include "xe_module.h"
> @@ -342,6 +344,29 @@ static void xe_device_sanitize(struct drm_device *drm, void *arg)
>                 xe_gt_sanitize(gt);
>  }
>
> +static int xe_device_set_has_flat_ccs(struct  xe_device *xe)
> +{
> +       u32 reg;
> +       int err;
> +
> +       if (IS_DGFX(xe) || GRAPHICS_VER(xe) < 20 || !xe->info.has_flat_ccs)
> +               return 0;
> +
> +       struct xe_gt *gt = xe_root_mmio_gt(xe);
> +
> +       err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
> +       if (err)
> +               return err;
> +
> +       reg = xe_gt_mcr_unicast_read_any(gt, XE2_FLAT_CCS_BASE_RANGE_LOWER);
> +       xe->info.has_flat_ccs = (reg & XE2_FLAT_CCS_ENABLE);
> +
> +       if (!xe->info.has_flat_ccs)
> +               drm_info(&xe->drm,
> +                        "Flat CCS has been disabled in bios, May lead to performance impact");
> +       return 0;
> +}
> +
>  int xe_device_probe(struct xe_device *xe)
>  {
>         struct xe_tile *tile;
> @@ -352,6 +377,7 @@ int xe_device_probe(struct xe_device *xe)
>         xe_pat_init_early(xe);
>
>         xe->info.mem_region_mask = 1;
> +
>         err = xe_display_init_nommio(xe);
>         if (err)
>                 return err;
> @@ -390,6 +416,10 @@ int xe_device_probe(struct xe_device *xe)
>                         goto err_irq_shutdown;
>         }
>
> +       err = xe_device_set_has_flat_ccs(xe);
> +       if (err)
> +               return err;
> +
>         err = xe_mmio_probe_vram(xe);
>         if (err)
>                 goto err_irq_shutdown;
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 2/6] drm/xe/xe2: Determine bios enablement for flat ccs on igfx.
  2023-11-23 16:37   ` Matthew Auld
@ 2023-11-23 17:01     ` Matthew Auld
  2023-11-27  3:11       ` Ghimiray, Himal Prasad
  0 siblings, 1 reply; 28+ messages in thread
From: Matthew Auld @ 2023-11-23 17:01 UTC (permalink / raw)
  To: Himal Prasad Ghimiray; +Cc: intel-xe

On Thu, 23 Nov 2023 at 16:37, Matthew Auld
<matthew.william.auld@gmail.com> wrote:
>
> On Tue, 21 Nov 2023 at 10:01, Himal Prasad Ghimiray
> <himal.prasad.ghimiray@intel.com> wrote:
> >
> > If bios disables flat ccs on igfx  make has_flat_ccs as 0 and notify
> > via drm_info.
> >
> > Bspec:59255
> >
> > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
>
> Huh, I wonder if that explains why on some particular LNL DUTs
> compression just didn't seem to "work" for me.
>
> So should this information be passed along to userspace via some query
> uAPI? Maybe there is some tradeoff between enabling compression and
> other features (like coherency), and so knowing this might be useful?

Another idea might be to reject any compressed pat_index, if the
platform doesn't support flat_ccs?

>
> > ---
> >  drivers/gpu/drm/xe/regs/xe_gt_regs.h |  3 +++
> >  drivers/gpu/drm/xe/xe_device.c       | 30 ++++++++++++++++++++++++++++
> >  2 files changed, 33 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> > index cc27fe8fc363..b642301947f5 100644
> > --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> > +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> > @@ -142,6 +142,9 @@
> >  #define XEHP_SLICE_COMMON_ECO_CHICKEN1         XE_REG_MCR(0x731c, XE_REG_OPTION_MASKED)
> >  #define   MSC_MSAA_REODER_BUF_BYPASS_DISABLE   REG_BIT(14)
> >
> > +#define XE2_FLAT_CCS_BASE_RANGE_LOWER          XE_REG_MCR(0x8800)
> > +#define   XE2_FLAT_CCS_ENABLE                  REG_BIT(0)
> > +
> >  #define VF_PREEMPTION                          XE_REG(0x83a4, XE_REG_OPTION_MASKED)
> >  #define   PREEMPTION_VERTEX_COUNT              REG_GENMASK(15, 0)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> > index 8be765adf702..07a3e4cf48d1 100644
> > --- a/drivers/gpu/drm/xe/xe_device.c
> > +++ b/drivers/gpu/drm/xe/xe_device.c
> > @@ -16,6 +16,7 @@
> >  #include <drm/xe_drm.h>
> >
> >  #include "regs/xe_regs.h"
> > +#include "regs/xe_gt_regs.h"
> >  #include "xe_bo.h"
> >  #include "xe_debugfs.h"
> >  #include "xe_display.h"
> > @@ -25,6 +26,7 @@
> >  #include "xe_exec_queue.h"
> >  #include "xe_exec.h"
> >  #include "xe_gt.h"
> > +#include "xe_gt_mcr.h"
> >  #include "xe_irq.h"
> >  #include "xe_mmio.h"
> >  #include "xe_module.h"
> > @@ -342,6 +344,29 @@ static void xe_device_sanitize(struct drm_device *drm, void *arg)
> >                 xe_gt_sanitize(gt);
> >  }
> >
> > +static int xe_device_set_has_flat_ccs(struct  xe_device *xe)
> > +{
> > +       u32 reg;
> > +       int err;
> > +
> > +       if (IS_DGFX(xe) || GRAPHICS_VER(xe) < 20 || !xe->info.has_flat_ccs)
> > +               return 0;
> > +
> > +       struct xe_gt *gt = xe_root_mmio_gt(xe);
> > +
> > +       err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
> > +       if (err)
> > +               return err;
> > +
> > +       reg = xe_gt_mcr_unicast_read_any(gt, XE2_FLAT_CCS_BASE_RANGE_LOWER);
> > +       xe->info.has_flat_ccs = (reg & XE2_FLAT_CCS_ENABLE);
> > +
> > +       if (!xe->info.has_flat_ccs)
> > +               drm_info(&xe->drm,
> > +                        "Flat CCS has been disabled in bios, May lead to performance impact");
> > +       return 0;
> > +}
> > +
> >  int xe_device_probe(struct xe_device *xe)
> >  {
> >         struct xe_tile *tile;
> > @@ -352,6 +377,7 @@ int xe_device_probe(struct xe_device *xe)
> >         xe_pat_init_early(xe);
> >
> >         xe->info.mem_region_mask = 1;
> > +
> >         err = xe_display_init_nommio(xe);
> >         if (err)
> >                 return err;
> > @@ -390,6 +416,10 @@ int xe_device_probe(struct xe_device *xe)
> >                         goto err_irq_shutdown;
> >         }
> >
> > +       err = xe_device_set_has_flat_ccs(xe);
> > +       if (err)
> > +               return err;
> > +
> >         err = xe_mmio_probe_vram(xe);
> >         if (err)
> >                 goto err_irq_shutdown;
> > --
> > 2.25.1
> >

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 1/6] drm/xe/xe2: Support flat ccs.
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 1/6] drm/xe/xe2: Support flat ccs Himal Prasad Ghimiray
  2023-11-23 16:28   ` Matthew Auld
@ 2023-11-24 11:02   ` Thomas Hellström
  2023-11-27  3:05     ` Ghimiray, Himal Prasad
  1 sibling, 1 reply; 28+ messages in thread
From: Thomas Hellström @ 2023-11-24 11:02 UTC (permalink / raw)
  To: Himal Prasad Ghimiray, intel-xe

Hi,

On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
> Enable flat ccs for XE2_GFX_FEATURES.
> 
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Himal Prasad Ghimiray
> <himal.prasad.ghimiray@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_pci.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Typically you add the implementation first and then enable it, so this
patch should go last. (Rule is the driver should not misbehave if
bisecting lands in the middle of a patch series).

/Thomas



> 
> diff --git a/drivers/gpu/drm/xe/xe_pci.c
> b/drivers/gpu/drm/xe/xe_pci.c
> index 682ba188e456..d9754f3d45f7 100644
> --- a/drivers/gpu/drm/xe/xe_pci.c
> +++ b/drivers/gpu/drm/xe/xe_pci.c
> @@ -157,7 +157,7 @@ static const struct xe_graphics_desc
> graphics_xelpg = {
>  #define XE2_GFX_FEATURES \
>         .dma_mask_size = 46, \
>         .has_asid = 1, \
> -       .has_flat_ccs = 0 /* FIXME: implementation missing */, \
> +       .has_flat_ccs = 1, \
>         .has_range_tlb_invalidation = 1, \
>         .supports_usm = 0 /* FIXME: implementation missing */, \
>         .va_bits = 48, \


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 2/6] drm/xe/xe2: Determine bios enablement for flat ccs on igfx.
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 2/6] drm/xe/xe2: Determine bios enablement for flat ccs on igfx Himal Prasad Ghimiray
  2023-11-23 16:37   ` Matthew Auld
@ 2023-11-24 11:05   ` Thomas Hellström
  2023-11-27  3:12     ` Ghimiray, Himal Prasad
  1 sibling, 1 reply; 28+ messages in thread
From: Thomas Hellström @ 2023-11-24 11:05 UTC (permalink / raw)
  To: Himal Prasad Ghimiray, intel-xe

On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
> If bios disables flat ccs on igfx  make has_flat_ccs as 0 and notify
double space above                 ^^
> 
> via drm_info.
> 
> Bspec:59255
> 
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Himal Prasad Ghimiray
> <himal.prasad.ghimiray@intel.com>
> ---
>  drivers/gpu/drm/xe/regs/xe_gt_regs.h |  3 +++
>  drivers/gpu/drm/xe/xe_device.c       | 30
> ++++++++++++++++++++++++++++
>  2 files changed, 33 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> index cc27fe8fc363..b642301947f5 100644
> --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> @@ -142,6 +142,9 @@
>  #define XEHP_SLICE_COMMON_ECO_CHICKEN1         XE_REG_MCR(0x731c,
> XE_REG_OPTION_MASKED)
>  #define   MSC_MSAA_REODER_BUF_BYPASS_DISABLE   REG_BIT(14)
>  
> +#define XE2_FLAT_CCS_BASE_RANGE_LOWER          XE_REG_MCR(0x8800)
> +#define   XE2_FLAT_CCS_ENABLE                  REG_BIT(0)
> +
>  #define VF_PREEMPTION                          XE_REG(0x83a4,
> XE_REG_OPTION_MASKED)
>  #define   PREEMPTION_VERTEX_COUNT              REG_GENMASK(15, 0)
>  
> diff --git a/drivers/gpu/drm/xe/xe_device.c
> b/drivers/gpu/drm/xe/xe_device.c
> index 8be765adf702..07a3e4cf48d1 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -16,6 +16,7 @@
>  #include <drm/xe_drm.h>
>  
>  #include "regs/xe_regs.h"
> +#include "regs/xe_gt_regs.h"
>  #include "xe_bo.h"
>  #include "xe_debugfs.h"
>  #include "xe_display.h"
> @@ -25,6 +26,7 @@
>  #include "xe_exec_queue.h"
>  #include "xe_exec.h"
>  #include "xe_gt.h"
> +#include "xe_gt_mcr.h"
>  #include "xe_irq.h"
>  #include "xe_mmio.h"
>  #include "xe_module.h"
> @@ -342,6 +344,29 @@ static void xe_device_sanitize(struct drm_device
> *drm, void *arg)
>                 xe_gt_sanitize(gt);
>  }
>  
> +static int xe_device_set_has_flat_ccs(struct  xe_device *xe)
> +{
> +       u32 reg;
> +       int err;
> +
> +       if (IS_DGFX(xe) || GRAPHICS_VER(xe) < 20 || !xe-
> >info.has_flat_ccs)
> +               return 0;

Depending on the outcome of the iommu discussion we might need to check
for iommu present here as well?

> +
> +       struct xe_gt *gt = xe_root_mmio_gt(xe);
> +
> +       err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
> +       if (err)
> +               return err;
> +
> +       reg = xe_gt_mcr_unicast_read_any(gt,
> XE2_FLAT_CCS_BASE_RANGE_LOWER);
> +       xe->info.has_flat_ccs = (reg & XE2_FLAT_CCS_ENABLE);
> +
> +       if (!xe->info.has_flat_ccs)
> +               drm_info(&xe->drm,
> +                        "Flat CCS has been disabled in bios, May
> lead to performance impact");
> +       return 0;
> +}
> +
>  int xe_device_probe(struct xe_device *xe)
>  {
>         struct xe_tile *tile;
> @@ -352,6 +377,7 @@ int xe_device_probe(struct xe_device *xe)
>         xe_pat_init_early(xe);
>  
>         xe->info.mem_region_mask = 1;
> +

Unrelated change.

>         err = xe_display_init_nommio(xe);
>         if (err)
>                 return err;
> @@ -390,6 +416,10 @@ int xe_device_probe(struct xe_device *xe)
>                         goto err_irq_shutdown;
>         }
>  
> +       err = xe_device_set_has_flat_ccs(xe);
> +       if (err)
> +               return err;
> +
>         err = xe_mmio_probe_vram(xe);
>         if (err)
>                 goto err_irq_shutdown;


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 3/6] drm/xe/xe2: Allocate extra pages for ccs during bo create.
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 3/6] drm/xe/xe2: Allocate extra pages for ccs during bo create Himal Prasad Ghimiray
@ 2023-11-24 11:09   ` Thomas Hellström
  2023-11-27  3:19     ` Ghimiray, Himal Prasad
  0 siblings, 1 reply; 28+ messages in thread
From: Thomas Hellström @ 2023-11-24 11:09 UTC (permalink / raw)
  To: Himal Prasad Ghimiray, intel-xe

On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
> Each byte of CCS data now represents 512 bytes of main memory data.
> Allocate extra pages to handle ccs region for igfx too.
> 
> Bspec:58796
> 
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Himal Prasad Ghimiray
> <himal.prasad.ghimiray@intel.com>
> ---
>  drivers/gpu/drm/xe/regs/xe_gpu_commands.h |  2 +-
>  drivers/gpu/drm/xe/xe_bo.c                | 15 ++++++---------
>  drivers/gpu/drm/xe/xe_device.c            |  2 +-
>  3 files changed, 8 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
> b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
> index 4402f72481dc..7f74592f99ce 100644
> --- a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
> +++ b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
> @@ -16,7 +16,7 @@
>  #define   XY_CTRL_SURF_MOCS_MASK       GENMASK(31, 26)
>  #define   XE2_XY_CTRL_SURF_MOCS_INDEX_MASK     GENMASK(31, 28)
>  #define   NUM_CCS_BYTES_PER_BLOCK      256
> -#define   NUM_BYTES_PER_CCS_BYTE       256
> +#define   NUM_BYTES_PER_CCS_BYTE(_xe)  (GRAPHICS_VER(_xe) >= 20 ?
> 512 : 256)
>  #define   NUM_CCS_BLKS_PER_XFER                1024
>  
>  #define XY_FAST_COLOR_BLT_CMD          (2 << 29 | 0x44 << 22)
> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> index 4305f5cbc2ab..4730ee3c1012 100644
> --- a/drivers/gpu/drm/xe/xe_bo.c
> +++ b/drivers/gpu/drm/xe/xe_bo.c
> @@ -2074,19 +2074,16 @@ int xe_bo_evict(struct xe_bo *bo, bool
> force_alloc)
>   * placed in system memory.
>   * @bo: The xe_bo
>   *
> - * If a bo has an allowable placement in XE_PL_TT memory, it can't
> use
> - * flat CCS compression, because the GPU then has no way to access
> the
> - * CCS metadata using relevant commands. For the opposite case, we
> need to
> - * allocate storage for the CCS metadata when the BO is not resident
> in
> - * VRAM memory.

Please extend modify this comment rather than deleting it


> - *
>   * Return: true if extra pages need to be allocated, false
> otherwise.
>   */
>  bool xe_bo_needs_ccs_pages(struct xe_bo *bo)
>  {
> -       return bo->ttm.type == ttm_bo_type_device &&
> -               !(bo->flags & XE_BO_CREATE_SYSTEM_BIT) &&
> -               (bo->flags & XE_BO_CREATE_VRAM_MASK);
> +       struct xe_device *xe = xe_bo_device(bo);
> +
> +       return (xe_device_has_flat_ccs(xe) &&
> +               bo->ttm.type == ttm_bo_type_device &&
> +               ((IS_DGFX(xe) && (bo->flags &
> XE_BO_CREATE_VRAM_MASK)) ||

It looks like you have removed a restriction for DGFX here: If the BO
has SYSTEM set, then ccs pages are not needed.

> +               (!IS_DGFX(xe) && (bo->flags &
> XE_BO_CREATE_SYSTEM_BIT))));
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/xe/xe_device.c
> b/drivers/gpu/drm/xe/xe_device.c
> index 07a3e4cf48d1..265f9ffc5323 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -551,7 +551,7 @@ void xe_device_wmb(struct xe_device *xe)
>  u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size)
>  {
>         return xe_device_has_flat_ccs(xe) ?
> -               DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE) : 0;
> +               DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE(xe)) : 0;
>  }
>  
>  bool xe_device_mem_access_ongoing(struct xe_device *xe)


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 4/6] drm/xe/xe2: Updates on XY_CTRL_SURF_COPY_BLT
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 4/6] drm/xe/xe2: Updates on XY_CTRL_SURF_COPY_BLT Himal Prasad Ghimiray
@ 2023-11-24 11:11   ` Thomas Hellström
  2023-11-27  3:21     ` Ghimiray, Himal Prasad
  0 siblings, 1 reply; 28+ messages in thread
From: Thomas Hellström @ 2023-11-24 11:11 UTC (permalink / raw)
  To: Himal Prasad Ghimiray, intel-xe

On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
> - The XY_CTRL_SURF_COPY_BLT instruction operationg on ccs data
> expects
> size in pages of main memory for which CCS data should be copied.
> - The bitfield representing copy size in XY_CTRL_SURF_COPY_BLT has
> shifted one bit higher in the instruction.
> 
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Himal Prasad Ghimiray
> <himal.prasad.ghimiray@intel.com>
> ---
>  drivers/gpu/drm/xe/regs/xe_gpu_commands.h |  1 +
>  drivers/gpu/drm/xe/xe_migrate.c           | 23 +++++++++++++++++----
> --
>  2 files changed, 18 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
> b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
> index 7f74592f99ce..47459841aa69 100644
> --- a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
> +++ b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
> @@ -13,6 +13,7 @@
>  #define   DST_ACCESS_TYPE_SHIFT                20
>  #define   CCS_SIZE_MASK                        0x3FF
>  #define   CCS_SIZE_SHIFT               8
> +#define   XE2_CCS_SIZE_SHIFT           9
>  #define   XY_CTRL_SURF_MOCS_MASK       GENMASK(31, 26)
>  #define   XE2_XY_CTRL_SURF_MOCS_INDEX_MASK     GENMASK(31, 28)
>  #define   NUM_CCS_BYTES_PER_BLOCK      256
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c
> b/drivers/gpu/drm/xe/xe_migrate.c
> index ed2f3f5109f3..06706fad67aa 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -523,21 +523,31 @@ static void emit_copy_ccs(struct xe_gt *gt,
> struct xe_bb *bb,
>         struct xe_device *xe = gt_to_xe(gt);
>         u32 *cs = bb->cs + bb->len;
>         u32 num_ccs_blks;
> +       u32 num_ccs_pages;
> +       u32 ccs_copy_size;
>         u32 mocs;
>  
> -       num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt),
> size),
> -                                   NUM_CCS_BYTES_PER_BLOCK);
> -       xe_gt_assert(gt, num_ccs_blks <= NUM_CCS_BLKS_PER_XFER);
> +       if (GRAPHICS_VERx100(xe) >= 2000) {
> +               num_ccs_pages =
> DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
> +                                            XE_PAGE_SIZE);
> +               xe_gt_assert(gt, num_ccs_pages <= 1024);
>  
> -       if (GRAPHICS_VERx100(xe) >= 2000)
> +               ccs_copy_size = ((num_ccs_pages - 1) & CCS_SIZE_MASK)
> << XE2_CCS_SIZE_SHIFT;
>                 mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK,
> gt->mocs.uc_index);
> -       else
> +
> +       } else {
> +               num_ccs_blks =
> DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
> +                                           NUM_CCS_BYTES_PER_BLOCK);
> +               xe_gt_assert(gt, num_ccs_blks <=
> NUM_CCS_BLKS_PER_XFER);
> +
> +               ccs_copy_size = ((num_ccs_blks - 1) & CCS_SIZE_MASK)
> << CCS_SIZE_SHIFT;
>                 mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt-
> >mocs.uc_index);
> +       }
>  
>         *cs++ = XY_CTRL_SURF_COPY_BLT |
>                 (src_is_indirect ? 0x0 : 0x1) <<
> SRC_ACCESS_TYPE_SHIFT |
>                 (dst_is_indirect ? 0x0 : 0x1) <<
> DST_ACCESS_TYPE_SHIFT |
> -               ((num_ccs_blks - 1) & CCS_SIZE_MASK) <<
> CCS_SIZE_SHIFT;
> +               ccs_copy_size;
>         *cs++ = lower_32_bits(src_ofs);
>         *cs++ = upper_32_bits(src_ofs) | mocs;
>         *cs++ = lower_32_bits(dst_ofs);
> @@ -992,6 +1002,7 @@ struct dma_fence *xe_migrate_clear(struct
> xe_migrate *m,
>  
>                 emit_clear(gt, bb, clear_L0_ofs, clear_L0,
> XE_PAGE_SIZE,
>                            clear_vram);
> +

Unrelated change. Please remove, Otherwise LGTM.
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>


>                 if (xe_device_has_flat_ccs(xe) && clear_vram) {
>                         emit_copy_ccs(gt, bb, clear_L0_ofs, true,
>                                       m->cleared_vram_ofs, false,
> clear_L0);


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 5/6] drm/xe/xe_migrate.c: Use NULL 1G PTE mapped at 255GiB VA for ccs clear.
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 5/6] drm/xe/xe_migrate.c: Use NULL 1G PTE mapped at 255GiB VA for ccs clear Himal Prasad Ghimiray
@ 2023-11-24 11:19   ` Thomas Hellström
  2023-11-27  3:23     ` Ghimiray, Himal Prasad
  0 siblings, 1 reply; 28+ messages in thread
From: Thomas Hellström @ 2023-11-24 11:19 UTC (permalink / raw)
  To: Himal Prasad Ghimiray, intel-xe

On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
> Get rid of the cleared bo, instead use null 1G PTE mapped at 255GiB
> offset, this can be used for both dgfx and igfx.
> 
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Himal Prasad Ghimiray
> <himal.prasad.ghimiray@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_migrate.c | 56 ++++++-------------------------
> --
>  1 file changed, 10 insertions(+), 46 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c
> b/drivers/gpu/drm/xe/xe_migrate.c
> index 06706fad67aa..bdcb20f23531 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -54,8 +54,8 @@ struct xe_migrate {
>         u64 batch_base_ofs;
>         /** @usm_batch_base_ofs: VM offset of the usm batch buffer */
>         u64 usm_batch_base_ofs;
> -       /** @cleared_vram_ofs: VM offset of @cleared_bo. */
> -       u64 cleared_vram_ofs;
> +       /** @cleared_mem_ofs: VM offset of @cleared_bo. */
> +       u64 cleared_mem_ofs;

Can we also remove xe_migrate::cleared_bo?

>         /**
>          * @fence: dma-fence representing the last migration job
> batch.
>          * Protected by @job_mutex.
> @@ -125,41 +125,6 @@ static u64 xe_migrate_vram_ofs(struct xe_device
> *xe, u64 addr)
>         return addr + (256ULL << xe_pt_shift(2));
>  }
>  
> -/*
> - * For flat CCS clearing we need a cleared chunk of memory to copy
> from,
> - * since the CCS clearing mode of XY_FAST_COLOR_BLT appears to be
> buggy
> - * (it clears on only 14 bytes in each chunk of 16).
> - * If clearing the main surface one can use the part of the main
> surface
> - * already cleared, but for clearing as part of copying non-
> compressed
> - * data out of system memory, we don't readily have a cleared part
> of
> - * VRAM to copy from, so create one to use for that case.
> - */
> -static int xe_migrate_create_cleared_bo(struct xe_migrate *m, struct
> xe_vm *vm)
> -{
> -       struct xe_tile *tile = m->tile;
> -       struct xe_device *xe = vm->xe;
> -       size_t cleared_size;
> -       u64 vram_addr;
> -
> -       if (!xe_device_has_flat_ccs(xe))
> -               return 0;
> -
> -       cleared_size = xe_device_ccs_bytes(xe,
> MAX_PREEMPTDISABLE_TRANSFER);
> -       cleared_size = PAGE_ALIGN(cleared_size);
> -       m->cleared_bo = xe_bo_create_pin_map(xe, tile, vm,
> cleared_size,
> -                                            ttm_bo_type_kernel,
> -                                           
> XE_BO_CREATE_VRAM_IF_DGFX(tile) |
> -                                           
> XE_BO_CREATE_PINNED_BIT);
> -       if (IS_ERR(m->cleared_bo))
> -               return PTR_ERR(m->cleared_bo);
> -
> -       xe_map_memset(xe, &m->cleared_bo->vmap, 0, 0x00,
> cleared_size);
> -       vram_addr = xe_bo_addr(m->cleared_bo, 0, XE_PAGE_SIZE);
> -       m->cleared_vram_ofs = xe_migrate_vram_ofs(xe, vram_addr);
> -
> -       return 0;
> -}
> -
>  static int xe_migrate_prepare_vm(struct xe_tile *tile, struct
> xe_migrate *m,
>                                  struct xe_vm *vm)
>  {
> @@ -170,7 +135,6 @@ static int xe_migrate_prepare_vm(struct xe_tile
> *tile, struct xe_migrate *m,
>         u32 map_ofs, level, i;
>         struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
>         u64 entry;
> -       int ret;
>  
>         /* Can't bump NUM_PT_SLOTS too high */
>         BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
> @@ -190,12 +154,6 @@ static int xe_migrate_prepare_vm(struct xe_tile
> *tile, struct xe_migrate *m,
>         if (IS_ERR(bo))
>                 return PTR_ERR(bo);
>  
> -       ret = xe_migrate_create_cleared_bo(m, vm);
> -       if (ret) {
> -               xe_bo_put(bo);
> -               return ret;
> -       }
> -
>         entry = vm->pt_ops->pde_encode_bo(bo, bo->size -
> XE_PAGE_SIZE, pat_index);
>         xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);
>  
> @@ -262,6 +220,12 @@ static int xe_migrate_prepare_vm(struct xe_tile
> *tile, struct xe_migrate *m,
>                           (i + 1) * 8, u64, entry);
>         }

A short comment that we set up a 1GiB NULL mapping at 255GiB offset.

>  
> +       level = 2;
> +       xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255
> * 8, u64,
> +                 vm->pt_ops->pte_encode_addr(xe, 0, pat_index,
> level, IS_DGFX(xe), 0)
> +                 | XE_PTE_NULL);
> +       m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));
> +



>         /* Identity map the entire vram at 256GiB offset */
>         if (IS_DGFX(xe)) {
>                 u64 pos, ofs, flags;
> @@ -616,7 +580,7 @@ static u32 xe_migrate_ccs_copy(struct xe_migrate
> *m,
>                  * Otherwise if the bo doesn't have any CCS metadata
> attached,
>                  * we still need to clear it for security reasons.
>                  */
> -               u64 ccs_src_ofs =  src_is_vram ? src_ofs : m-
> >cleared_vram_ofs;
> +               u64 ccs_src_ofs =  src_is_vram ? src_ofs : m-
> >cleared_mem_ofs;
>  
>                 emit_copy_ccs(gt, bb,
>                               dst_ofs, true,
> @@ -1005,7 +969,7 @@ struct dma_fence *xe_migrate_clear(struct
> xe_migrate *m,
>  
>                 if (xe_device_has_flat_ccs(xe) && clear_vram) {
>                         emit_copy_ccs(gt, bb, clear_L0_ofs, true,
> -                                     m->cleared_vram_ofs, false,
> clear_L0);
> +                                     m->cleared_mem_ofs, false,
> clear_L0);
>                         flush_flags = MI_FLUSH_DW_CCS;
>                 }
>  


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 6/6] drm/xe/xe2: Handle flat ccs move for igfx.
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 6/6] drm/xe/xe2: Handle flat ccs move for igfx Himal Prasad Ghimiray
@ 2023-11-24 15:48   ` Thomas Hellström
  2023-11-27  3:25     ` Ghimiray, Himal Prasad
  0 siblings, 1 reply; 28+ messages in thread
From: Thomas Hellström @ 2023-11-24 15:48 UTC (permalink / raw)
  To: Himal Prasad Ghimiray, intel-xe

On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
> - Clear flat ccs during user bo creation.
> - copy ccs meta data between flat ccs and bo during eviction and
> restore.
> - Add a bool field ccs_cleared in bo, true means ccs region of bo is
> already cleared.
> 
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Himal Prasad Ghimiray
> <himal.prasad.ghimiray@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_bo.c       | 25 ++++++++-----
>  drivers/gpu/drm/xe/xe_bo_types.h |  2 ++
>  drivers/gpu/drm/xe/xe_migrate.c  | 62 ++++++++++++++++--------------
> --
>  3 files changed, 50 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> index 4730ee3c1012..a40f17ae21e7 100644
> --- a/drivers/gpu/drm/xe/xe_bo.c
> +++ b/drivers/gpu/drm/xe/xe_bo.c
> @@ -630,10 +630,12 @@ static int xe_bo_move(struct ttm_buffer_object
> *ttm_bo, bool evict,
>         bool move_lacks_source;
>         bool tt_has_data;
>         bool needs_clear;
> +       bool handle_system_ccs = (!IS_DGFX(xe) &&
> xe_bo_needs_ccs_pages(bo) &&
> +                                 ttm && ttm_tt_is_populated(ttm)) ?
> true : false;
>         int ret = 0;
> -
> -       /* Bo creation path, moving to system or TT. No clearing
> required. */
> -       if (!old_mem && ttm) {
> +       /* Bo creation path, moving to system or TT. */
> +       if (((old_mem_type == XE_PL_SYSTEM && new_mem->mem_type ==
> XE_PL_TT) ||

I figure moving from SYSTEM to TT must always trigger a copy or clear
of CCS in the handle_system_ccs case?

/Thomas


> +            (!old_mem && ttm)) && !handle_system_ccs) {
>                 ttm_bo_move_null(ttm_bo, new_mem);
>                 return 0;
>         }
> @@ -648,14 +650,13 @@ static int xe_bo_move(struct ttm_buffer_object
> *ttm_bo, bool evict,
>         tt_has_data = ttm && (ttm_tt_is_populated(ttm) ||
>                               (ttm->page_flags &
> TTM_TT_FLAG_SWAPPED));
>  
> -       move_lacks_source = !mem_type_is_vram(old_mem_type) &&
> !tt_has_data;
> +       move_lacks_source = handle_system_ccs ? (!bo->ccs_cleared)  :
> +                                               (!mem_type_is_vram(ol
> d_mem_type) && !tt_has_data);
>  
>         needs_clear = (ttm && ttm->page_flags &
> TTM_TT_FLAG_ZERO_ALLOC) ||
>                 (!ttm && ttm_bo->type == ttm_bo_type_device);
>  
> -       if ((move_lacks_source && !needs_clear) ||
> -           (old_mem_type == XE_PL_SYSTEM &&
> -            new_mem->mem_type == XE_PL_TT)) {
> +       if ((move_lacks_source && !needs_clear)) {
>                 ttm_bo_move_null(ttm_bo, new_mem);
>                 goto out;
>         }
> @@ -686,8 +687,11 @@ static int xe_bo_move(struct ttm_buffer_object
> *ttm_bo, bool evict,
>                         ret = timeout;
>                         goto out;
>                 }
> -               ttm_bo_move_null(ttm_bo, new_mem);
> -               goto out;
> +
> +               if (!handle_system_ccs) {
> +                       ttm_bo_move_null(ttm_bo, new_mem);
> +                       goto out;
> +               }
>         }
>  
>         if (!move_lacks_source &&
> @@ -708,6 +712,8 @@ static int xe_bo_move(struct ttm_buffer_object
> *ttm_bo, bool evict,
>                 migrate = mem_type_to_migrate(xe, new_mem->mem_type);
>         else if (mem_type_is_vram(old_mem_type))
>                 migrate = mem_type_to_migrate(xe, old_mem_type);
> +       else
> +               migrate = xe->tiles[0].migrate;
>  
>         xe_assert(xe, migrate);
>  
> @@ -1229,6 +1235,7 @@ struct xe_bo *__xe_bo_create_locked(struct
> xe_device *xe, struct xe_bo *bo,
>                 alignment = SZ_4K >> PAGE_SHIFT;
>         }
>  
> +       bo->ccs_cleared = false;
>         bo->tile = tile;
>         bo->size = size;
>         bo->flags = flags;
> diff --git a/drivers/gpu/drm/xe/xe_bo_types.h
> b/drivers/gpu/drm/xe/xe_bo_types.h
> index 4bff60996168..508e67c81427 100644
> --- a/drivers/gpu/drm/xe/xe_bo_types.h
> +++ b/drivers/gpu/drm/xe/xe_bo_types.h
> @@ -79,6 +79,8 @@ struct xe_bo {
>         struct llist_node freed;
>         /** @created: Whether the bo has passed initial creation */
>         bool created;
> +       /** @ccs_cleared */
> +       bool ccs_cleared;
>  };
>  
>  #define intel_bo_to_drm_bo(bo) (&(bo)->ttm.base)
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c
> b/drivers/gpu/drm/xe/xe_migrate.c
> index bdcb20f23531..bac24768fe2a 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -564,14 +564,14 @@ static u64 xe_migrate_batch_base(struct
> xe_migrate *m, bool usm)
>  
>  static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
>                                struct xe_bb *bb,
> -                              u64 src_ofs, bool src_is_vram,
> -                              u64 dst_ofs, bool dst_is_vram, u32
> dst_size,
> +                              u64 src_ofs, bool src_is_indirect,
> +                              u64 dst_ofs, bool dst_is_indirect, u32
> dst_size,
>                                u64 ccs_ofs, bool copy_ccs)
>  {
>         struct xe_gt *gt = m->tile->primary_gt;
>         u32 flush_flags = 0;
>  
> -       if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs &&
> dst_is_vram) {
> +       if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs &&
> dst_is_indirect) {
>                 /*
>                  * If the src is already in vram, then it should
> already
>                  * have been cleared by us, or has been populated by
> the
> @@ -580,28 +580,24 @@ static u32 xe_migrate_ccs_copy(struct
> xe_migrate *m,
>                  * Otherwise if the bo doesn't have any CCS metadata
> attached,
>                  * we still need to clear it for security reasons.
>                  */
> -               u64 ccs_src_ofs =  src_is_vram ? src_ofs : m-
> >cleared_mem_ofs;
> +               u64 ccs_src_ofs =  src_is_indirect ? src_ofs : m-
> >cleared_mem_ofs;
>  
>                 emit_copy_ccs(gt, bb,
>                               dst_ofs, true,
> -                             ccs_src_ofs, src_is_vram, dst_size);
> +                             ccs_src_ofs, src_is_indirect,
> dst_size);
>  
>                 flush_flags = MI_FLUSH_DW_CCS;
>         } else if (copy_ccs) {
> -               if (!src_is_vram)
> +               if (!src_is_indirect)
>                         src_ofs = ccs_ofs;
> -               else if (!dst_is_vram)
> +               else if (!dst_is_indirect)
>                         dst_ofs = ccs_ofs;
>  
> -               /*
> -                * At the moment, we don't support copying CCS
> metadata from
> -                * system to system.
> -                */
> -               xe_gt_assert(gt, src_is_vram || dst_is_vram);
> +               xe_gt_assert(gt, src_is_indirect || dst_is_indirect);
>  
> -               emit_copy_ccs(gt, bb, dst_ofs, dst_is_vram, src_ofs,
> -                             src_is_vram, dst_size);
> -               if (dst_is_vram)
> +               emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect,
> src_ofs,
> +                             src_is_indirect, dst_size);
> +               if (dst_is_indirect)
>                         flush_flags = MI_FLUSH_DW_CCS;
>         }
>  
> @@ -642,6 +638,8 @@ struct dma_fence *xe_migrate_copy(struct
> xe_migrate *m,
>         u64 src_L0, dst_L0;
>         int pass = 0;
>         int err;
> +       bool src_is_pltt = src->mem_type == XE_PL_TT;
> +       bool dst_is_pltt = dst->mem_type == XE_PL_TT;
>         bool src_is_vram = mem_type_is_vram(src->mem_type);
>         bool dst_is_vram = mem_type_is_vram(dst->mem_type);
>         bool copy_ccs = xe_device_has_flat_ccs(xe) &&
> @@ -682,9 +680,7 @@ struct dma_fence *xe_migrate_copy(struct
> xe_migrate *m,
>                 src_L0 = xe_migrate_res_sizes(&src_it);
>                 dst_L0 = xe_migrate_res_sizes(&dst_it);
>  
> -               drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
> -                       pass++, src_L0, dst_L0);
> -
> +               drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
> pass++, src_L0, dst_L0);
>                 src_L0 = min(src_L0, dst_L0);
>  
>                 batch_size += pte_update_size(m, src_is_vram, src,
> &src_it, &src_L0,
> @@ -704,8 +700,8 @@ struct dma_fence *xe_migrate_copy(struct
> xe_migrate *m,
>                 }
>  
>                 /* Add copy commands size here */
> -               batch_size += EMIT_COPY_DW +
> -                       (xe_device_has_flat_ccs(xe) ?
> EMIT_COPY_CCS_DW : 0);
> +               batch_size += ((!src_is_vram && !dst_is_vram) ? 0 :
> EMIT_COPY_DW) +
> +                       ((xe_device_has_flat_ccs(xe) ?
> EMIT_COPY_CCS_DW : 0));
>  
>                 bb = xe_bb_new(gt, batch_size, usm);
>                 if (IS_ERR(bb)) {
> @@ -731,10 +727,13 @@ struct dma_fence *xe_migrate_copy(struct
> xe_migrate *m,
>                 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>                 update_idx = bb->len;
>  
> -               emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0,
> -                         XE_PAGE_SIZE);
> -               flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
> src_is_vram,
> -                                                 dst_L0_ofs,
> dst_is_vram,
> +               if (src_is_vram || dst_is_vram)
> +                       emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs,
> src_L0, XE_PAGE_SIZE);
> +
> +               flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
> +                                                 IS_DGFX(xe) ?
> src_is_vram : src_is_pltt,
> +                                                 dst_L0_ofs,
> +                                                 IS_DGFX(xe) ?
> dst_is_vram : dst_is_pltt,
>                                                   src_L0, ccs_ofs,
> copy_ccs);
>  
>                 mutex_lock(&m->job_mutex);
> @@ -907,6 +906,7 @@ struct dma_fence *xe_migrate_clear(struct
> xe_migrate *m,
>         bool clear_vram = mem_type_is_vram(dst->mem_type);
>         struct xe_gt *gt = m->tile->primary_gt;
>         struct xe_device *xe = gt_to_xe(gt);
> +       bool clear_system_ccs = (xe_bo_needs_ccs_pages(bo) &&
> !IS_DGFX(xe)) ? true : false;
>         struct dma_fence *fence = NULL;
>         u64 size = bo->size;
>         struct xe_res_cursor src_it;
> @@ -936,9 +936,9 @@ struct dma_fence *xe_migrate_clear(struct
> xe_migrate *m,
>                 batch_size = 2 +
>                         pte_update_size(m, clear_vram, src, &src_it,
>                                         &clear_L0, &clear_L0_ofs,
> &clear_L0_pt,
> -                                       emit_clear_cmd_len(gt), 0,
> +                                       clear_system_ccs ? 0 :
> emit_clear_cmd_len(gt), 0,
>                                         NUM_PT_PER_BLIT);
> -               if (xe_device_has_flat_ccs(xe) && clear_vram)
> +               if (xe_bo_needs_ccs_pages(bo))
>                         batch_size += EMIT_COPY_CCS_DW;
>  
>                 /* Clear commands */
> @@ -953,7 +953,6 @@ struct dma_fence *xe_migrate_clear(struct
> xe_migrate *m,
>                 }
>  
>                 size -= clear_L0;
> -
>                 /* Preemption is enabled again by the ring ops. */
>                 if (!clear_vram) {
>                         emit_pte(m, bb, clear_L0_pt, clear_vram,
> &src_it, clear_L0,
> @@ -964,10 +963,10 @@ struct dma_fence *xe_migrate_clear(struct
> xe_migrate *m,
>                 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>                 update_idx = bb->len;
>  
> -               emit_clear(gt, bb, clear_L0_ofs, clear_L0,
> XE_PAGE_SIZE,
> -                          clear_vram);
> +               if (!clear_system_ccs)
> +                       emit_clear(gt, bb, clear_L0_ofs, clear_L0,
> XE_PAGE_SIZE, clear_vram);
>  
> -               if (xe_device_has_flat_ccs(xe) && clear_vram) {
> +               if (xe_bo_needs_ccs_pages(bo)) {
>                         emit_copy_ccs(gt, bb, clear_L0_ofs, true,
>                                       m->cleared_mem_ofs, false,
> clear_L0);
>                         flush_flags = MI_FLUSH_DW_CCS;
> @@ -1024,6 +1023,9 @@ struct dma_fence *xe_migrate_clear(struct
> xe_migrate *m,
>                 return ERR_PTR(err);
>         }
>  
> +       if (clear_system_ccs)
> +               bo->ccs_cleared = true;
> +
>         return fence;
>  }
>  


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 1/6] drm/xe/xe2: Support flat ccs.
  2023-11-23 16:28   ` Matthew Auld
@ 2023-11-27  3:04     ` Ghimiray, Himal Prasad
  0 siblings, 0 replies; 28+ messages in thread
From: Ghimiray, Himal Prasad @ 2023-11-27  3:04 UTC (permalink / raw)
  To: Matthew Auld; +Cc: intel-xe

Hi Matthew,

On 23-11-2023 21:58, Matthew Auld wrote:
> On Tue, 21 Nov 2023 at 10:01, Himal Prasad Ghimiray
> <himal.prasad.ghimiray@intel.com> wrote:
>> Enable flat ccs for XE2_GFX_FEATURES.
>>
>> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> I guess this should rather be the final patch in the series, once we
> have it working?

Thanks for the review. Will move this patch as final patch in next version.

>> ---
>>   drivers/gpu/drm/xe/xe_pci.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
>> index 682ba188e456..d9754f3d45f7 100644
>> --- a/drivers/gpu/drm/xe/xe_pci.c
>> +++ b/drivers/gpu/drm/xe/xe_pci.c
>> @@ -157,7 +157,7 @@ static const struct xe_graphics_desc graphics_xelpg = {
>>   #define XE2_GFX_FEATURES \
>>          .dma_mask_size = 46, \
>>          .has_asid = 1, \
>> -       .has_flat_ccs = 0 /* FIXME: implementation missing */, \
>> +       .has_flat_ccs = 1, \
>>          .has_range_tlb_invalidation = 1, \
>>          .supports_usm = 0 /* FIXME: implementation missing */, \
>>          .va_bits = 48, \
>> --
>> 2.25.1
>>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 1/6] drm/xe/xe2: Support flat ccs.
  2023-11-24 11:02   ` Thomas Hellström
@ 2023-11-27  3:05     ` Ghimiray, Himal Prasad
  0 siblings, 0 replies; 28+ messages in thread
From: Ghimiray, Himal Prasad @ 2023-11-27  3:05 UTC (permalink / raw)
  To: Thomas Hellström, intel-xe

Hi Thomas,

On 24-11-2023 16:32, Thomas Hellström wrote:
> Hi,
>
> On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
>> Enable flat ccs for XE2_GFX_FEATURES.
>>
>> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>> Signed-off-by: Himal Prasad Ghimiray
>> <himal.prasad.ghimiray@intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_pci.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
> Typically you add the implementation first and then enable it, so this
> patch should go last. (Rule is the driver should not misbehave if
> bisecting lands in the middle of a patch series).
>
> /Thomas
Thanks for the comment. Will move this patch as final patch in next 
version.
>
>
>
>> diff --git a/drivers/gpu/drm/xe/xe_pci.c
>> b/drivers/gpu/drm/xe/xe_pci.c
>> index 682ba188e456..d9754f3d45f7 100644
>> --- a/drivers/gpu/drm/xe/xe_pci.c
>> +++ b/drivers/gpu/drm/xe/xe_pci.c
>> @@ -157,7 +157,7 @@ static const struct xe_graphics_desc
>> graphics_xelpg = {
>>   #define XE2_GFX_FEATURES \
>>          .dma_mask_size = 46, \
>>          .has_asid = 1, \
>> -       .has_flat_ccs = 0 /* FIXME: implementation missing */, \
>> +       .has_flat_ccs = 1, \
>>          .has_range_tlb_invalidation = 1, \
>>          .supports_usm = 0 /* FIXME: implementation missing */, \
>>          .va_bits = 48, \

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 2/6] drm/xe/xe2: Determine bios enablement for flat ccs on igfx.
  2023-11-23 17:01     ` Matthew Auld
@ 2023-11-27  3:11       ` Ghimiray, Himal Prasad
  0 siblings, 0 replies; 28+ messages in thread
From: Ghimiray, Himal Prasad @ 2023-11-27  3:11 UTC (permalink / raw)
  To: Matthew Auld; +Cc: intel-xe


On 23-11-2023 22:31, Matthew Auld wrote:
> On Thu, 23 Nov 2023 at 16:37, Matthew Auld
> <matthew.william.auld@gmail.com> wrote:
>> On Tue, 21 Nov 2023 at 10:01, Himal Prasad Ghimiray
>> <himal.prasad.ghimiray@intel.com> wrote:
>>> If bios disables flat ccs on igfx  make has_flat_ccs as 0 and notify
>>> via drm_info.
>>>
>>> Bspec:59255
>>>
>>> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
>> Huh, I wonder if that explains why on some particular LNL DUTs
>> compression just didn't seem to "work" for me.
>>
>> So should this information be passed along to userspace via some query
>> uAPI? Maybe there is some tradeoff between enabling compression and
>> other features (like coherency), and so knowing this might be useful?
> Another idea might be to reject any compressed pat_index, if the
> platform doesn't support flat_ccs?

Thanks for the review.  I will start the discussion regarding exposing 
this information via some uapi or

rejecting the compressed pat_index. Will come back with further info on 
this.

>
>>> ---
>>>   drivers/gpu/drm/xe/regs/xe_gt_regs.h |  3 +++
>>>   drivers/gpu/drm/xe/xe_device.c       | 30 ++++++++++++++++++++++++++++
>>>   2 files changed, 33 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>>> index cc27fe8fc363..b642301947f5 100644
>>> --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>>> +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>>> @@ -142,6 +142,9 @@
>>>   #define XEHP_SLICE_COMMON_ECO_CHICKEN1         XE_REG_MCR(0x731c, XE_REG_OPTION_MASKED)
>>>   #define   MSC_MSAA_REODER_BUF_BYPASS_DISABLE   REG_BIT(14)
>>>
>>> +#define XE2_FLAT_CCS_BASE_RANGE_LOWER          XE_REG_MCR(0x8800)
>>> +#define   XE2_FLAT_CCS_ENABLE                  REG_BIT(0)
>>> +
>>>   #define VF_PREEMPTION                          XE_REG(0x83a4, XE_REG_OPTION_MASKED)
>>>   #define   PREEMPTION_VERTEX_COUNT              REG_GENMASK(15, 0)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>>> index 8be765adf702..07a3e4cf48d1 100644
>>> --- a/drivers/gpu/drm/xe/xe_device.c
>>> +++ b/drivers/gpu/drm/xe/xe_device.c
>>> @@ -16,6 +16,7 @@
>>>   #include <drm/xe_drm.h>
>>>
>>>   #include "regs/xe_regs.h"
>>> +#include "regs/xe_gt_regs.h"
>>>   #include "xe_bo.h"
>>>   #include "xe_debugfs.h"
>>>   #include "xe_display.h"
>>> @@ -25,6 +26,7 @@
>>>   #include "xe_exec_queue.h"
>>>   #include "xe_exec.h"
>>>   #include "xe_gt.h"
>>> +#include "xe_gt_mcr.h"
>>>   #include "xe_irq.h"
>>>   #include "xe_mmio.h"
>>>   #include "xe_module.h"
>>> @@ -342,6 +344,29 @@ static void xe_device_sanitize(struct drm_device *drm, void *arg)
>>>                  xe_gt_sanitize(gt);
>>>   }
>>>
>>> +static int xe_device_set_has_flat_ccs(struct  xe_device *xe)
>>> +{
>>> +       u32 reg;
>>> +       int err;
>>> +
>>> +       if (IS_DGFX(xe) || GRAPHICS_VER(xe) < 20 || !xe->info.has_flat_ccs)
>>> +               return 0;
>>> +
>>> +       struct xe_gt *gt = xe_root_mmio_gt(xe);
>>> +
>>> +       err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
>>> +       if (err)
>>> +               return err;
>>> +
>>> +       reg = xe_gt_mcr_unicast_read_any(gt, XE2_FLAT_CCS_BASE_RANGE_LOWER);
>>> +       xe->info.has_flat_ccs = (reg & XE2_FLAT_CCS_ENABLE);
>>> +
>>> +       if (!xe->info.has_flat_ccs)
>>> +               drm_info(&xe->drm,
>>> +                        "Flat CCS has been disabled in bios, May lead to performance impact");
>>> +       return 0;
>>> +}
>>> +
>>>   int xe_device_probe(struct xe_device *xe)
>>>   {
>>>          struct xe_tile *tile;
>>> @@ -352,6 +377,7 @@ int xe_device_probe(struct xe_device *xe)
>>>          xe_pat_init_early(xe);
>>>
>>>          xe->info.mem_region_mask = 1;
>>> +
>>>          err = xe_display_init_nommio(xe);
>>>          if (err)
>>>                  return err;
>>> @@ -390,6 +416,10 @@ int xe_device_probe(struct xe_device *xe)
>>>                          goto err_irq_shutdown;
>>>          }
>>>
>>> +       err = xe_device_set_has_flat_ccs(xe);
>>> +       if (err)
>>> +               return err;
>>> +
>>>          err = xe_mmio_probe_vram(xe);
>>>          if (err)
>>>                  goto err_irq_shutdown;
>>> --
>>> 2.25.1
>>>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 2/6] drm/xe/xe2: Determine bios enablement for flat ccs on igfx.
  2023-11-24 11:05   ` Thomas Hellström
@ 2023-11-27  3:12     ` Ghimiray, Himal Prasad
  0 siblings, 0 replies; 28+ messages in thread
From: Ghimiray, Himal Prasad @ 2023-11-27  3:12 UTC (permalink / raw)
  To: Thomas Hellström, intel-xe


On 24-11-2023 16:35, Thomas Hellström wrote:
> On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
>> If bios disables flat ccs on igfx  make has_flat_ccs as 0 and notify
> double space above                 ^^
Will address in next patch.
>> via drm_info.
>>
>> Bspec:59255
>>
>> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>> Signed-off-by: Himal Prasad Ghimiray
>> <himal.prasad.ghimiray@intel.com>
>> ---
>>   drivers/gpu/drm/xe/regs/xe_gt_regs.h |  3 +++
>>   drivers/gpu/drm/xe/xe_device.c       | 30
>> ++++++++++++++++++++++++++++
>>   2 files changed, 33 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> index cc27fe8fc363..b642301947f5 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> @@ -142,6 +142,9 @@
>>   #define XEHP_SLICE_COMMON_ECO_CHICKEN1         XE_REG_MCR(0x731c,
>> XE_REG_OPTION_MASKED)
>>   #define   MSC_MSAA_REODER_BUF_BYPASS_DISABLE   REG_BIT(14)
>>   
>> +#define XE2_FLAT_CCS_BASE_RANGE_LOWER          XE_REG_MCR(0x8800)
>> +#define   XE2_FLAT_CCS_ENABLE                  REG_BIT(0)
>> +
>>   #define VF_PREEMPTION                          XE_REG(0x83a4,
>> XE_REG_OPTION_MASKED)
>>   #define   PREEMPTION_VERTEX_COUNT              REG_GENMASK(15, 0)
>>   
>> diff --git a/drivers/gpu/drm/xe/xe_device.c
>> b/drivers/gpu/drm/xe/xe_device.c
>> index 8be765adf702..07a3e4cf48d1 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -16,6 +16,7 @@
>>   #include <drm/xe_drm.h>
>>   
>>   #include "regs/xe_regs.h"
>> +#include "regs/xe_gt_regs.h"
>>   #include "xe_bo.h"
>>   #include "xe_debugfs.h"
>>   #include "xe_display.h"
>> @@ -25,6 +26,7 @@
>>   #include "xe_exec_queue.h"
>>   #include "xe_exec.h"
>>   #include "xe_gt.h"
>> +#include "xe_gt_mcr.h"
>>   #include "xe_irq.h"
>>   #include "xe_mmio.h"
>>   #include "xe_module.h"
>> @@ -342,6 +344,29 @@ static void xe_device_sanitize(struct drm_device
>> *drm, void *arg)
>>                  xe_gt_sanitize(gt);
>>   }
>>   
>> +static int xe_device_set_has_flat_ccs(struct  xe_device *xe)
>> +{
>> +       u32 reg;
>> +       int err;
>> +
>> +       if (IS_DGFX(xe) || GRAPHICS_VER(xe) < 20 || !xe-
>>> info.has_flat_ccs)
>> +               return 0;
> Depending on the outcome of the iommu discussion we might need to check
> for iommu present here as well?
Sure. That makes sense.
>
>> +
>> +       struct xe_gt *gt = xe_root_mmio_gt(xe);
>> +
>> +       err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
>> +       if (err)
>> +               return err;
>> +
>> +       reg = xe_gt_mcr_unicast_read_any(gt,
>> XE2_FLAT_CCS_BASE_RANGE_LOWER);
>> +       xe->info.has_flat_ccs = (reg & XE2_FLAT_CCS_ENABLE);
>> +
>> +       if (!xe->info.has_flat_ccs)
>> +               drm_info(&xe->drm,
>> +                        "Flat CCS has been disabled in bios, May
>> lead to performance impact");
>> +       return 0;
>> +}
>> +
>>   int xe_device_probe(struct xe_device *xe)
>>   {
>>          struct xe_tile *tile;
>> @@ -352,6 +377,7 @@ int xe_device_probe(struct xe_device *xe)
>>          xe_pat_init_early(xe);
>>   
>>          xe->info.mem_region_mask = 1;
>> +
> Unrelated change.
>
>>          err = xe_display_init_nommio(xe);
>>          if (err)
>>                  return err;
>> @@ -390,6 +416,10 @@ int xe_device_probe(struct xe_device *xe)
>>                          goto err_irq_shutdown;
>>          }
>>   
>> +       err = xe_device_set_has_flat_ccs(xe);
>> +       if (err)
>> +               return err;
>> +
>>          err = xe_mmio_probe_vram(xe);
>>          if (err)
>>                  goto err_irq_shutdown;

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 3/6] drm/xe/xe2: Allocate extra pages for ccs during bo create.
  2023-11-24 11:09   ` Thomas Hellström
@ 2023-11-27  3:19     ` Ghimiray, Himal Prasad
  2023-11-27  9:41       ` Thomas Hellström
  0 siblings, 1 reply; 28+ messages in thread
From: Ghimiray, Himal Prasad @ 2023-11-27  3:19 UTC (permalink / raw)
  To: Thomas Hellström, intel-xe

[-- Attachment #1: Type: text/plain, Size: 3885 bytes --]


On 24-11-2023 16:39, Thomas Hellström wrote:
> On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
>> Each byte of CCS data now represents 512 bytes of main memory data.
>> Allocate extra pages to handle ccs region for igfx too.
>>
>> Bspec:58796
>>
>> Cc: Thomas Hellström<thomas.hellstrom@linux.intel.com>
>> Signed-off-by: Himal Prasad Ghimiray
>> <himal.prasad.ghimiray@intel.com>
>> ---
>>   drivers/gpu/drm/xe/regs/xe_gpu_commands.h |  2 +-
>>   drivers/gpu/drm/xe/xe_bo.c                | 15 ++++++---------
>>   drivers/gpu/drm/xe/xe_device.c            |  2 +-
>>   3 files changed, 8 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>> b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>> index 4402f72481dc..7f74592f99ce 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>> @@ -16,7 +16,7 @@
>>   #define   XY_CTRL_SURF_MOCS_MASK       GENMASK(31, 26)
>>   #define   XE2_XY_CTRL_SURF_MOCS_INDEX_MASK     GENMASK(31, 28)
>>   #define   NUM_CCS_BYTES_PER_BLOCK      256
>> -#define   NUM_BYTES_PER_CCS_BYTE       256
>> +#define   NUM_BYTES_PER_CCS_BYTE(_xe)  (GRAPHICS_VER(_xe) >= 20 ?
>> 512 : 256)
>>   #define   NUM_CCS_BLKS_PER_XFER                1024
>>   
>>   #define XY_FAST_COLOR_BLT_CMD          (2 << 29 | 0x44 << 22)
>> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
>> index 4305f5cbc2ab..4730ee3c1012 100644
>> --- a/drivers/gpu/drm/xe/xe_bo.c
>> +++ b/drivers/gpu/drm/xe/xe_bo.c
>> @@ -2074,19 +2074,16 @@ int xe_bo_evict(struct xe_bo *bo, bool
>> force_alloc)
>>    * placed in system memory.
>>    * @bo: The xe_bo
>>    *
>> - * If a bo has an allowable placement in XE_PL_TT memory, it can't
>> use
>> - * flat CCS compression, because the GPU then has no way to access
>> the
>> - * CCS metadata using relevant commands. For the opposite case, we
>> need to
>> - * allocate storage for the CCS metadata when the BO is not resident
>> in
>> - * VRAM memory.
> Please extend modify this comment rather than deleting it
Sure. Will address in next patch.
>
>
>> - *
>>    * Return: true if extra pages need to be allocated, false
>> otherwise.
>>    */
>>   bool xe_bo_needs_ccs_pages(struct xe_bo *bo)
>>   {
>> -       return bo->ttm.type == ttm_bo_type_device &&
>> -               !(bo->flags & XE_BO_CREATE_SYSTEM_BIT) &&
>> -               (bo->flags & XE_BO_CREATE_VRAM_MASK);
>> +       struct xe_device *xe = xe_bo_device(bo);
>> +
>> +       return (xe_device_has_flat_ccs(xe) &&
>> +               bo->ttm.type == ttm_bo_type_device &&
>> +               ((IS_DGFX(xe) && (bo->flags &
>> XE_BO_CREATE_VRAM_MASK)) ||
> It looks like you have removed a restriction for DGFX here: If the BO
> has SYSTEM set, then ccs pages are not needed.

Is ((IS_DGFX(xe) && (bo->flags & XE_BO_CREATE_VRAM_MASK)) not suffice to 
ensure

that for DGFX BO is only in VRAM region ?

>
>> +               (!IS_DGFX(xe) && (bo->flags &
>> XE_BO_CREATE_SYSTEM_BIT))));
>>   }
>>   
>>   /**
>> diff --git a/drivers/gpu/drm/xe/xe_device.c
>> b/drivers/gpu/drm/xe/xe_device.c
>> index 07a3e4cf48d1..265f9ffc5323 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -551,7 +551,7 @@ void xe_device_wmb(struct xe_device *xe)
>>   u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size)
>>   {
>>          return xe_device_has_flat_ccs(xe) ?
>> -               DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE) : 0;
>> +               DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE(xe)) : 0;
>>   }
>>   
>>   bool xe_device_mem_access_ongoing(struct xe_device *xe)

[-- Attachment #2: Type: text/html, Size: 6288 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 4/6] drm/xe/xe2: Updates on XY_CTRL_SURF_COPY_BLT
  2023-11-24 11:11   ` Thomas Hellström
@ 2023-11-27  3:21     ` Ghimiray, Himal Prasad
  0 siblings, 0 replies; 28+ messages in thread
From: Ghimiray, Himal Prasad @ 2023-11-27  3:21 UTC (permalink / raw)
  To: Thomas Hellström, intel-xe


On 24-11-2023 16:41, Thomas Hellström wrote:
> On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
>> - The XY_CTRL_SURF_COPY_BLT instruction operationg on ccs data
>> expects
>> size in pages of main memory for which CCS data should be copied.
>> - The bitfield representing copy size in XY_CTRL_SURF_COPY_BLT has
>> shifted one bit higher in the instruction.
>>
>> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>> Signed-off-by: Himal Prasad Ghimiray
>> <himal.prasad.ghimiray@intel.com>
>> ---
>>   drivers/gpu/drm/xe/regs/xe_gpu_commands.h |  1 +
>>   drivers/gpu/drm/xe/xe_migrate.c           | 23 +++++++++++++++++----
>> --
>>   2 files changed, 18 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>> b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>> index 7f74592f99ce..47459841aa69 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>> @@ -13,6 +13,7 @@
>>   #define   DST_ACCESS_TYPE_SHIFT                20
>>   #define   CCS_SIZE_MASK                        0x3FF
>>   #define   CCS_SIZE_SHIFT               8
>> +#define   XE2_CCS_SIZE_SHIFT           9
>>   #define   XY_CTRL_SURF_MOCS_MASK       GENMASK(31, 26)
>>   #define   XE2_XY_CTRL_SURF_MOCS_INDEX_MASK     GENMASK(31, 28)
>>   #define   NUM_CCS_BYTES_PER_BLOCK      256
>> diff --git a/drivers/gpu/drm/xe/xe_migrate.c
>> b/drivers/gpu/drm/xe/xe_migrate.c
>> index ed2f3f5109f3..06706fad67aa 100644
>> --- a/drivers/gpu/drm/xe/xe_migrate.c
>> +++ b/drivers/gpu/drm/xe/xe_migrate.c
>> @@ -523,21 +523,31 @@ static void emit_copy_ccs(struct xe_gt *gt,
>> struct xe_bb *bb,
>>          struct xe_device *xe = gt_to_xe(gt);
>>          u32 *cs = bb->cs + bb->len;
>>          u32 num_ccs_blks;
>> +       u32 num_ccs_pages;
>> +       u32 ccs_copy_size;
>>          u32 mocs;
>>   
>> -       num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt),
>> size),
>> -                                   NUM_CCS_BYTES_PER_BLOCK);
>> -       xe_gt_assert(gt, num_ccs_blks <= NUM_CCS_BLKS_PER_XFER);
>> +       if (GRAPHICS_VERx100(xe) >= 2000) {
>> +               num_ccs_pages =
>> DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
>> +                                            XE_PAGE_SIZE);
>> +               xe_gt_assert(gt, num_ccs_pages <= 1024);
>>   
>> -       if (GRAPHICS_VERx100(xe) >= 2000)
>> +               ccs_copy_size = ((num_ccs_pages - 1) & CCS_SIZE_MASK)
>> << XE2_CCS_SIZE_SHIFT;
>>                  mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK,
>> gt->mocs.uc_index);
>> -       else
>> +
>> +       } else {
>> +               num_ccs_blks =
>> DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
>> +                                           NUM_CCS_BYTES_PER_BLOCK);
>> +               xe_gt_assert(gt, num_ccs_blks <=
>> NUM_CCS_BLKS_PER_XFER);
>> +
>> +               ccs_copy_size = ((num_ccs_blks - 1) & CCS_SIZE_MASK)
>> << CCS_SIZE_SHIFT;
>>                  mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt-
>>> mocs.uc_index);
>> +       }
>>   
>>          *cs++ = XY_CTRL_SURF_COPY_BLT |
>>                  (src_is_indirect ? 0x0 : 0x1) <<
>> SRC_ACCESS_TYPE_SHIFT |
>>                  (dst_is_indirect ? 0x0 : 0x1) <<
>> DST_ACCESS_TYPE_SHIFT |
>> -               ((num_ccs_blks - 1) & CCS_SIZE_MASK) <<
>> CCS_SIZE_SHIFT;
>> +               ccs_copy_size;
>>          *cs++ = lower_32_bits(src_ofs);
>>          *cs++ = upper_32_bits(src_ofs) | mocs;
>>          *cs++ = lower_32_bits(dst_ofs);
>> @@ -992,6 +1002,7 @@ struct dma_fence *xe_migrate_clear(struct
>> xe_migrate *m,
>>   
>>                  emit_clear(gt, bb, clear_L0_ofs, clear_L0,
>> XE_PAGE_SIZE,
>>                             clear_vram);
>> +
> Unrelated change. Please remove, Otherwise LGTM.
> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Thanks for the review. Will remove unrelated change in next patch.
>
>
>>                  if (xe_device_has_flat_ccs(xe) && clear_vram) {
>>                          emit_copy_ccs(gt, bb, clear_L0_ofs, true,
>>                                        m->cleared_vram_ofs, false,
>> clear_L0);

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 5/6] drm/xe/xe_migrate.c: Use NULL 1G PTE mapped at 255GiB VA for ccs clear.
  2023-11-24 11:19   ` Thomas Hellström
@ 2023-11-27  3:23     ` Ghimiray, Himal Prasad
  0 siblings, 0 replies; 28+ messages in thread
From: Ghimiray, Himal Prasad @ 2023-11-27  3:23 UTC (permalink / raw)
  To: Thomas Hellström, intel-xe


On 24-11-2023 16:49, Thomas Hellström wrote:
> On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
>> Get rid of the cleared bo, instead use null 1G PTE mapped at 255GiB
>> offset, this can be used for both dgfx and igfx.
>>
>> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>> Signed-off-by: Himal Prasad Ghimiray
>> <himal.prasad.ghimiray@intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_migrate.c | 56 ++++++-------------------------
>> --
>>   1 file changed, 10 insertions(+), 46 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_migrate.c
>> b/drivers/gpu/drm/xe/xe_migrate.c
>> index 06706fad67aa..bdcb20f23531 100644
>> --- a/drivers/gpu/drm/xe/xe_migrate.c
>> +++ b/drivers/gpu/drm/xe/xe_migrate.c
>> @@ -54,8 +54,8 @@ struct xe_migrate {
>>          u64 batch_base_ofs;
>>          /** @usm_batch_base_ofs: VM offset of the usm batch buffer */
>>          u64 usm_batch_base_ofs;
>> -       /** @cleared_vram_ofs: VM offset of @cleared_bo. */
>> -       u64 cleared_vram_ofs;
>> +       /** @cleared_mem_ofs: VM offset of @cleared_bo. */
>> +       u64 cleared_mem_ofs;
> Can we also remove xe_migrate::cleared_bo?
Sure. Will clean it up in next version.
>
>>          /**
>>           * @fence: dma-fence representing the last migration job
>> batch.
>>           * Protected by @job_mutex.
>> @@ -125,41 +125,6 @@ static u64 xe_migrate_vram_ofs(struct xe_device
>> *xe, u64 addr)
>>          return addr + (256ULL << xe_pt_shift(2));
>>   }
>>   
>> -/*
>> - * For flat CCS clearing we need a cleared chunk of memory to copy
>> from,
>> - * since the CCS clearing mode of XY_FAST_COLOR_BLT appears to be
>> buggy
>> - * (it clears on only 14 bytes in each chunk of 16).
>> - * If clearing the main surface one can use the part of the main
>> surface
>> - * already cleared, but for clearing as part of copying non-
>> compressed
>> - * data out of system memory, we don't readily have a cleared part
>> of
>> - * VRAM to copy from, so create one to use for that case.
>> - */
>> -static int xe_migrate_create_cleared_bo(struct xe_migrate *m, struct
>> xe_vm *vm)
>> -{
>> -       struct xe_tile *tile = m->tile;
>> -       struct xe_device *xe = vm->xe;
>> -       size_t cleared_size;
>> -       u64 vram_addr;
>> -
>> -       if (!xe_device_has_flat_ccs(xe))
>> -               return 0;
>> -
>> -       cleared_size = xe_device_ccs_bytes(xe,
>> MAX_PREEMPTDISABLE_TRANSFER);
>> -       cleared_size = PAGE_ALIGN(cleared_size);
>> -       m->cleared_bo = xe_bo_create_pin_map(xe, tile, vm,
>> cleared_size,
>> -                                            ttm_bo_type_kernel,
>> -
>> XE_BO_CREATE_VRAM_IF_DGFX(tile) |
>> -
>> XE_BO_CREATE_PINNED_BIT);
>> -       if (IS_ERR(m->cleared_bo))
>> -               return PTR_ERR(m->cleared_bo);
>> -
>> -       xe_map_memset(xe, &m->cleared_bo->vmap, 0, 0x00,
>> cleared_size);
>> -       vram_addr = xe_bo_addr(m->cleared_bo, 0, XE_PAGE_SIZE);
>> -       m->cleared_vram_ofs = xe_migrate_vram_ofs(xe, vram_addr);
>> -
>> -       return 0;
>> -}
>> -
>>   static int xe_migrate_prepare_vm(struct xe_tile *tile, struct
>> xe_migrate *m,
>>                                   struct xe_vm *vm)
>>   {
>> @@ -170,7 +135,6 @@ static int xe_migrate_prepare_vm(struct xe_tile
>> *tile, struct xe_migrate *m,
>>          u32 map_ofs, level, i;
>>          struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
>>          u64 entry;
>> -       int ret;
>>   
>>          /* Can't bump NUM_PT_SLOTS too high */
>>          BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
>> @@ -190,12 +154,6 @@ static int xe_migrate_prepare_vm(struct xe_tile
>> *tile, struct xe_migrate *m,
>>          if (IS_ERR(bo))
>>                  return PTR_ERR(bo);
>>   
>> -       ret = xe_migrate_create_cleared_bo(m, vm);
>> -       if (ret) {
>> -               xe_bo_put(bo);
>> -               return ret;
>> -       }
>> -
>>          entry = vm->pt_ops->pde_encode_bo(bo, bo->size -
>> XE_PAGE_SIZE, pat_index);
>>          xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);
>>   
>> @@ -262,6 +220,12 @@ static int xe_migrate_prepare_vm(struct xe_tile
>> *tile, struct xe_migrate *m,
>>                            (i + 1) * 8, u64, entry);
>>          }
> A short comment that we set up a 1GiB NULL mapping at 255GiB offset.
Sure.
>
>>   
>> +       level = 2;
>> +       xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255
>> * 8, u64,
>> +                 vm->pt_ops->pte_encode_addr(xe, 0, pat_index,
>> level, IS_DGFX(xe), 0)
>> +                 | XE_PTE_NULL);
>> +       m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));
>> +
>
>
>>          /* Identity map the entire vram at 256GiB offset */
>>          if (IS_DGFX(xe)) {
>>                  u64 pos, ofs, flags;
>> @@ -616,7 +580,7 @@ static u32 xe_migrate_ccs_copy(struct xe_migrate
>> *m,
>>                   * Otherwise if the bo doesn't have any CCS metadata
>> attached,
>>                   * we still need to clear it for security reasons.
>>                   */
>> -               u64 ccs_src_ofs =  src_is_vram ? src_ofs : m-
>>> cleared_vram_ofs;
>> +               u64 ccs_src_ofs =  src_is_vram ? src_ofs : m-
>>> cleared_mem_ofs;
>>   
>>                  emit_copy_ccs(gt, bb,
>>                                dst_ofs, true,
>> @@ -1005,7 +969,7 @@ struct dma_fence *xe_migrate_clear(struct
>> xe_migrate *m,
>>   
>>                  if (xe_device_has_flat_ccs(xe) && clear_vram) {
>>                          emit_copy_ccs(gt, bb, clear_L0_ofs, true,
>> -                                     m->cleared_vram_ofs, false,
>> clear_L0);
>> +                                     m->cleared_mem_ofs, false,
>> clear_L0);
>>                          flush_flags = MI_FLUSH_DW_CCS;
>>                  }
>>   

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 6/6] drm/xe/xe2: Handle flat ccs move for igfx.
  2023-11-24 15:48   ` Thomas Hellström
@ 2023-11-27  3:25     ` Ghimiray, Himal Prasad
  0 siblings, 0 replies; 28+ messages in thread
From: Ghimiray, Himal Prasad @ 2023-11-27  3:25 UTC (permalink / raw)
  To: Thomas Hellström, intel-xe


On 24-11-2023 21:18, Thomas Hellström wrote:
> On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
>> - Clear flat ccs during user bo creation.
>> - copy ccs meta data between flat ccs and bo during eviction and
>> restore.
>> - Add a bool field ccs_cleared in bo, true means ccs region of bo is
>> already cleared.
>>
>> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>> Signed-off-by: Himal Prasad Ghimiray
>> <himal.prasad.ghimiray@intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_bo.c       | 25 ++++++++-----
>>   drivers/gpu/drm/xe/xe_bo_types.h |  2 ++
>>   drivers/gpu/drm/xe/xe_migrate.c  | 62 ++++++++++++++++--------------
>> --
>>   3 files changed, 50 insertions(+), 39 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
>> index 4730ee3c1012..a40f17ae21e7 100644
>> --- a/drivers/gpu/drm/xe/xe_bo.c
>> +++ b/drivers/gpu/drm/xe/xe_bo.c
>> @@ -630,10 +630,12 @@ static int xe_bo_move(struct ttm_buffer_object
>> *ttm_bo, bool evict,
>>          bool move_lacks_source;
>>          bool tt_has_data;
>>          bool needs_clear;
>> +       bool handle_system_ccs = (!IS_DGFX(xe) &&
>> xe_bo_needs_ccs_pages(bo) &&
>> +                                 ttm && ttm_tt_is_populated(ttm)) ?
>> true : false;
>>          int ret = 0;
>> -
>> -       /* Bo creation path, moving to system or TT. No clearing
>> required. */
>> -       if (!old_mem && ttm) {
>> +       /* Bo creation path, moving to system or TT. */
>> +       if (((old_mem_type == XE_PL_SYSTEM && new_mem->mem_type ==
>> XE_PL_TT) ||
> I figure moving from SYSTEM to TT must always trigger a copy or clear
> of CCS in the handle_system_ccs case?
>
> /Thomas
AFAIU that is correct.
>
>> +            (!old_mem && ttm)) && !handle_system_ccs) {
>>                  ttm_bo_move_null(ttm_bo, new_mem);
>>                  return 0;
>>          }
>> @@ -648,14 +650,13 @@ static int xe_bo_move(struct ttm_buffer_object
>> *ttm_bo, bool evict,
>>          tt_has_data = ttm && (ttm_tt_is_populated(ttm) ||
>>                                (ttm->page_flags &
>> TTM_TT_FLAG_SWAPPED));
>>   
>> -       move_lacks_source = !mem_type_is_vram(old_mem_type) &&
>> !tt_has_data;
>> +       move_lacks_source = handle_system_ccs ? (!bo->ccs_cleared)  :
>> +                                               (!mem_type_is_vram(ol
>> d_mem_type) && !tt_has_data);
>>   
>>          needs_clear = (ttm && ttm->page_flags &
>> TTM_TT_FLAG_ZERO_ALLOC) ||
>>                  (!ttm && ttm_bo->type == ttm_bo_type_device);
>>   
>> -       if ((move_lacks_source && !needs_clear) ||
>> -           (old_mem_type == XE_PL_SYSTEM &&
>> -            new_mem->mem_type == XE_PL_TT)) {
>> +       if ((move_lacks_source && !needs_clear)) {
>>                  ttm_bo_move_null(ttm_bo, new_mem);
>>                  goto out;
>>          }
>> @@ -686,8 +687,11 @@ static int xe_bo_move(struct ttm_buffer_object
>> *ttm_bo, bool evict,
>>                          ret = timeout;
>>                          goto out;
>>                  }
>> -               ttm_bo_move_null(ttm_bo, new_mem);
>> -               goto out;
>> +
>> +               if (!handle_system_ccs) {
>> +                       ttm_bo_move_null(ttm_bo, new_mem);
>> +                       goto out;
>> +               }
>>          }
>>   
>>          if (!move_lacks_source &&
>> @@ -708,6 +712,8 @@ static int xe_bo_move(struct ttm_buffer_object
>> *ttm_bo, bool evict,
>>                  migrate = mem_type_to_migrate(xe, new_mem->mem_type);
>>          else if (mem_type_is_vram(old_mem_type))
>>                  migrate = mem_type_to_migrate(xe, old_mem_type);
>> +       else
>> +               migrate = xe->tiles[0].migrate;
>>   
>>          xe_assert(xe, migrate);
>>   
>> @@ -1229,6 +1235,7 @@ struct xe_bo *__xe_bo_create_locked(struct
>> xe_device *xe, struct xe_bo *bo,
>>                  alignment = SZ_4K >> PAGE_SHIFT;
>>          }
>>   
>> +       bo->ccs_cleared = false;
>>          bo->tile = tile;
>>          bo->size = size;
>>          bo->flags = flags;
>> diff --git a/drivers/gpu/drm/xe/xe_bo_types.h
>> b/drivers/gpu/drm/xe/xe_bo_types.h
>> index 4bff60996168..508e67c81427 100644
>> --- a/drivers/gpu/drm/xe/xe_bo_types.h
>> +++ b/drivers/gpu/drm/xe/xe_bo_types.h
>> @@ -79,6 +79,8 @@ struct xe_bo {
>>          struct llist_node freed;
>>          /** @created: Whether the bo has passed initial creation */
>>          bool created;
>> +       /** @ccs_cleared */
>> +       bool ccs_cleared;
>>   };
>>   
>>   #define intel_bo_to_drm_bo(bo) (&(bo)->ttm.base)
>> diff --git a/drivers/gpu/drm/xe/xe_migrate.c
>> b/drivers/gpu/drm/xe/xe_migrate.c
>> index bdcb20f23531..bac24768fe2a 100644
>> --- a/drivers/gpu/drm/xe/xe_migrate.c
>> +++ b/drivers/gpu/drm/xe/xe_migrate.c
>> @@ -564,14 +564,14 @@ static u64 xe_migrate_batch_base(struct
>> xe_migrate *m, bool usm)
>>   
>>   static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
>>                                 struct xe_bb *bb,
>> -                              u64 src_ofs, bool src_is_vram,
>> -                              u64 dst_ofs, bool dst_is_vram, u32
>> dst_size,
>> +                              u64 src_ofs, bool src_is_indirect,
>> +                              u64 dst_ofs, bool dst_is_indirect, u32
>> dst_size,
>>                                 u64 ccs_ofs, bool copy_ccs)
>>   {
>>          struct xe_gt *gt = m->tile->primary_gt;
>>          u32 flush_flags = 0;
>>   
>> -       if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs &&
>> dst_is_vram) {
>> +       if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs &&
>> dst_is_indirect) {
>>                  /*
>>                   * If the src is already in vram, then it should
>> already
>>                   * have been cleared by us, or has been populated by
>> the
>> @@ -580,28 +580,24 @@ static u32 xe_migrate_ccs_copy(struct
>> xe_migrate *m,
>>                   * Otherwise if the bo doesn't have any CCS metadata
>> attached,
>>                   * we still need to clear it for security reasons.
>>                   */
>> -               u64 ccs_src_ofs =  src_is_vram ? src_ofs : m-
>>> cleared_mem_ofs;
>> +               u64 ccs_src_ofs =  src_is_indirect ? src_ofs : m-
>>> cleared_mem_ofs;
>>   
>>                  emit_copy_ccs(gt, bb,
>>                                dst_ofs, true,
>> -                             ccs_src_ofs, src_is_vram, dst_size);
>> +                             ccs_src_ofs, src_is_indirect,
>> dst_size);
>>   
>>                  flush_flags = MI_FLUSH_DW_CCS;
>>          } else if (copy_ccs) {
>> -               if (!src_is_vram)
>> +               if (!src_is_indirect)
>>                          src_ofs = ccs_ofs;
>> -               else if (!dst_is_vram)
>> +               else if (!dst_is_indirect)
>>                          dst_ofs = ccs_ofs;
>>   
>> -               /*
>> -                * At the moment, we don't support copying CCS
>> metadata from
>> -                * system to system.
>> -                */
>> -               xe_gt_assert(gt, src_is_vram || dst_is_vram);
>> +               xe_gt_assert(gt, src_is_indirect || dst_is_indirect);
>>   
>> -               emit_copy_ccs(gt, bb, dst_ofs, dst_is_vram, src_ofs,
>> -                             src_is_vram, dst_size);
>> -               if (dst_is_vram)
>> +               emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect,
>> src_ofs,
>> +                             src_is_indirect, dst_size);
>> +               if (dst_is_indirect)
>>                          flush_flags = MI_FLUSH_DW_CCS;
>>          }
>>   
>> @@ -642,6 +638,8 @@ struct dma_fence *xe_migrate_copy(struct
>> xe_migrate *m,
>>          u64 src_L0, dst_L0;
>>          int pass = 0;
>>          int err;
>> +       bool src_is_pltt = src->mem_type == XE_PL_TT;
>> +       bool dst_is_pltt = dst->mem_type == XE_PL_TT;
>>          bool src_is_vram = mem_type_is_vram(src->mem_type);
>>          bool dst_is_vram = mem_type_is_vram(dst->mem_type);
>>          bool copy_ccs = xe_device_has_flat_ccs(xe) &&
>> @@ -682,9 +680,7 @@ struct dma_fence *xe_migrate_copy(struct
>> xe_migrate *m,
>>                  src_L0 = xe_migrate_res_sizes(&src_it);
>>                  dst_L0 = xe_migrate_res_sizes(&dst_it);
>>   
>> -               drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
>> -                       pass++, src_L0, dst_L0);
>> -
>> +               drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
>> pass++, src_L0, dst_L0);
>>                  src_L0 = min(src_L0, dst_L0);
>>   
>>                  batch_size += pte_update_size(m, src_is_vram, src,
>> &src_it, &src_L0,
>> @@ -704,8 +700,8 @@ struct dma_fence *xe_migrate_copy(struct
>> xe_migrate *m,
>>                  }
>>   
>>                  /* Add copy commands size here */
>> -               batch_size += EMIT_COPY_DW +
>> -                       (xe_device_has_flat_ccs(xe) ?
>> EMIT_COPY_CCS_DW : 0);
>> +               batch_size += ((!src_is_vram && !dst_is_vram) ? 0 :
>> EMIT_COPY_DW) +
>> +                       ((xe_device_has_flat_ccs(xe) ?
>> EMIT_COPY_CCS_DW : 0));
>>   
>>                  bb = xe_bb_new(gt, batch_size, usm);
>>                  if (IS_ERR(bb)) {
>> @@ -731,10 +727,13 @@ struct dma_fence *xe_migrate_copy(struct
>> xe_migrate *m,
>>                  bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>>                  update_idx = bb->len;
>>   
>> -               emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0,
>> -                         XE_PAGE_SIZE);
>> -               flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
>> src_is_vram,
>> -                                                 dst_L0_ofs,
>> dst_is_vram,
>> +               if (src_is_vram || dst_is_vram)
>> +                       emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs,
>> src_L0, XE_PAGE_SIZE);
>> +
>> +               flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
>> +                                                 IS_DGFX(xe) ?
>> src_is_vram : src_is_pltt,
>> +                                                 dst_L0_ofs,
>> +                                                 IS_DGFX(xe) ?
>> dst_is_vram : dst_is_pltt,
>>                                                    src_L0, ccs_ofs,
>> copy_ccs);
>>   
>>                  mutex_lock(&m->job_mutex);
>> @@ -907,6 +906,7 @@ struct dma_fence *xe_migrate_clear(struct
>> xe_migrate *m,
>>          bool clear_vram = mem_type_is_vram(dst->mem_type);
>>          struct xe_gt *gt = m->tile->primary_gt;
>>          struct xe_device *xe = gt_to_xe(gt);
>> +       bool clear_system_ccs = (xe_bo_needs_ccs_pages(bo) &&
>> !IS_DGFX(xe)) ? true : false;
>>          struct dma_fence *fence = NULL;
>>          u64 size = bo->size;
>>          struct xe_res_cursor src_it;
>> @@ -936,9 +936,9 @@ struct dma_fence *xe_migrate_clear(struct
>> xe_migrate *m,
>>                  batch_size = 2 +
>>                          pte_update_size(m, clear_vram, src, &src_it,
>>                                          &clear_L0, &clear_L0_ofs,
>> &clear_L0_pt,
>> -                                       emit_clear_cmd_len(gt), 0,
>> +                                       clear_system_ccs ? 0 :
>> emit_clear_cmd_len(gt), 0,
>>                                          NUM_PT_PER_BLIT);
>> -               if (xe_device_has_flat_ccs(xe) && clear_vram)
>> +               if (xe_bo_needs_ccs_pages(bo))
>>                          batch_size += EMIT_COPY_CCS_DW;
>>   
>>                  /* Clear commands */
>> @@ -953,7 +953,6 @@ struct dma_fence *xe_migrate_clear(struct
>> xe_migrate *m,
>>                  }
>>   
>>                  size -= clear_L0;
>> -
>>                  /* Preemption is enabled again by the ring ops. */
>>                  if (!clear_vram) {
>>                          emit_pte(m, bb, clear_L0_pt, clear_vram,
>> &src_it, clear_L0,
>> @@ -964,10 +963,10 @@ struct dma_fence *xe_migrate_clear(struct
>> xe_migrate *m,
>>                  bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>>                  update_idx = bb->len;
>>   
>> -               emit_clear(gt, bb, clear_L0_ofs, clear_L0,
>> XE_PAGE_SIZE,
>> -                          clear_vram);
>> +               if (!clear_system_ccs)
>> +                       emit_clear(gt, bb, clear_L0_ofs, clear_L0,
>> XE_PAGE_SIZE, clear_vram);
>>   
>> -               if (xe_device_has_flat_ccs(xe) && clear_vram) {
>> +               if (xe_bo_needs_ccs_pages(bo)) {
>>                          emit_copy_ccs(gt, bb, clear_L0_ofs, true,
>>                                        m->cleared_mem_ofs, false,
>> clear_L0);
>>                          flush_flags = MI_FLUSH_DW_CCS;
>> @@ -1024,6 +1023,9 @@ struct dma_fence *xe_migrate_clear(struct
>> xe_migrate *m,
>>                  return ERR_PTR(err);
>>          }
>>   
>> +       if (clear_system_ccs)
>> +               bo->ccs_cleared = true;
>> +
>>          return fence;
>>   }
>>   

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 3/6] drm/xe/xe2: Allocate extra pages for ccs during bo create.
  2023-11-27  3:19     ` Ghimiray, Himal Prasad
@ 2023-11-27  9:41       ` Thomas Hellström
  2023-11-29  4:53         ` Ghimiray, Himal Prasad
  0 siblings, 1 reply; 28+ messages in thread
From: Thomas Hellström @ 2023-11-27  9:41 UTC (permalink / raw)
  To: Ghimiray, Himal Prasad, intel-xe

[-- Attachment #1: Type: text/plain, Size: 4136 bytes --]


On 11/27/23 04:19, Ghimiray, Himal Prasad wrote:
>
>
> On 24-11-2023 16:39, Thomas Hellström wrote:
>> On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
>>> Each byte of CCS data now represents 512 bytes of main memory data.
>>> Allocate extra pages to handle ccs region for igfx too.
>>>
>>> Bspec:58796
>>>
>>> Cc: Thomas Hellström<thomas.hellstrom@linux.intel.com>
>>> Signed-off-by: Himal Prasad Ghimiray
>>> <himal.prasad.ghimiray@intel.com>
>>> ---
>>>   drivers/gpu/drm/xe/regs/xe_gpu_commands.h |  2 +-
>>>   drivers/gpu/drm/xe/xe_bo.c                | 15 ++++++---------
>>>   drivers/gpu/drm/xe/xe_device.c            |  2 +-
>>>   3 files changed, 8 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>>> b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>>> index 4402f72481dc..7f74592f99ce 100644
>>> --- a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>>> +++ b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>>> @@ -16,7 +16,7 @@
>>>   #define   XY_CTRL_SURF_MOCS_MASK       GENMASK(31, 26)
>>>   #define   XE2_XY_CTRL_SURF_MOCS_INDEX_MASK     GENMASK(31, 28)
>>>   #define   NUM_CCS_BYTES_PER_BLOCK      256
>>> -#define   NUM_BYTES_PER_CCS_BYTE       256
>>> +#define   NUM_BYTES_PER_CCS_BYTE(_xe)  (GRAPHICS_VER(_xe) >= 20 ?
>>> 512 : 256)
>>>   #define   NUM_CCS_BLKS_PER_XFER                1024
>>>   
>>>   #define XY_FAST_COLOR_BLT_CMD          (2 << 29 | 0x44 << 22)
>>> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
>>> index 4305f5cbc2ab..4730ee3c1012 100644
>>> --- a/drivers/gpu/drm/xe/xe_bo.c
>>> +++ b/drivers/gpu/drm/xe/xe_bo.c
>>> @@ -2074,19 +2074,16 @@ int xe_bo_evict(struct xe_bo *bo, bool
>>> force_alloc)
>>>    * placed in system memory.
>>>    * @bo: The xe_bo
>>>    *
>>> - * If a bo has an allowable placement in XE_PL_TT memory, it can't
>>> use
>>> - * flat CCS compression, because the GPU then has no way to access
>>> the
>>> - * CCS metadata using relevant commands. For the opposite case, we
>>> need to
>>> - * allocate storage for the CCS metadata when the BO is not resident
>>> in
>>> - * VRAM memory.
>> Please extend modify this comment rather than deleting it
> Sure. Will address in next patch.
>>> - *
>>>    * Return: true if extra pages need to be allocated, false
>>> otherwise.
>>>    */
>>>   bool xe_bo_needs_ccs_pages(struct xe_bo *bo)
>>>   {
>>> -       return bo->ttm.type == ttm_bo_type_device &&
>>> -               !(bo->flags & XE_BO_CREATE_SYSTEM_BIT) &&
>>> -               (bo->flags & XE_BO_CREATE_VRAM_MASK);
>>> +       struct xe_device *xe = xe_bo_device(bo);
>>> +
>>> +       return (xe_device_has_flat_ccs(xe) &&
>>> +               bo->ttm.type == ttm_bo_type_device &&
>>> +               ((IS_DGFX(xe) && (bo->flags &
>>> XE_BO_CREATE_VRAM_MASK)) ||
>> It looks like you have removed a restriction for DGFX here: If the BO
>> has SYSTEM set, then ccs pages are not needed.
>
> Is ((IS_DGFX(xe) && (bo->flags & XE_BO_CREATE_VRAM_MASK)) not suffice 
> to ensure
>
> that for DGFX BO is only in VRAM region ?
>
No. It can also has the the CREATE_SYSTEM_BIT set, and then the content 
can't be compressed.

/Thomas



>>> +               (!IS_DGFX(xe) && (bo->flags &
>>> XE_BO_CREATE_SYSTEM_BIT))));
>>>   }
>>>   
>>>   /**
>>> diff --git a/drivers/gpu/drm/xe/xe_device.c
>>> b/drivers/gpu/drm/xe/xe_device.c
>>> index 07a3e4cf48d1..265f9ffc5323 100644
>>> --- a/drivers/gpu/drm/xe/xe_device.c
>>> +++ b/drivers/gpu/drm/xe/xe_device.c
>>> @@ -551,7 +551,7 @@ void xe_device_wmb(struct xe_device *xe)
>>>   u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size)
>>>   {
>>>          return xe_device_has_flat_ccs(xe) ?
>>> -               DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE) : 0;
>>> +               DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE(xe)) : 0;
>>>   }
>>>   
>>>   bool xe_device_mem_access_ongoing(struct xe_device *xe)

[-- Attachment #2: Type: text/html, Size: 5811 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL.
  2023-11-21 10:09 [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL Himal Prasad Ghimiray
                   ` (5 preceding siblings ...)
  2023-11-21 10:09 ` [Intel-xe] [RFC v2 6/6] drm/xe/xe2: Handle flat ccs move for igfx Himal Prasad Ghimiray
@ 2023-11-28 13:36 ` Thomas Hellström
  2023-11-29  4:49   ` Ghimiray, Himal Prasad
  6 siblings, 1 reply; 28+ messages in thread
From: Thomas Hellström @ 2023-11-28 13:36 UTC (permalink / raw)
  To: Himal Prasad Ghimiray, intel-xe

Hi!

On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
> Series enables flat ccs on xe2 igfx platforms and handles clearing
> and copy 
> of ccs metadata during bo creation and xe_bo_move respectively.
> 
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Himal Prasad Ghimiray
> <himal.prasad.ghimiray@intel.com>
> 
> Himal Prasad Ghimiray (6):
>   drm/xe/xe2: Support flat ccs.
>   drm/xe/xe2: Determine bios enablement for flat ccs on igfx.
>   drm/xe/xe2: Allocate extra pages for ccs during bo create.
>   drm/xe/xe2: Updates on XY_CTRL_SURF_COPY_BLT
>   drm/xe/xe_migrate.c: Use NULL 1G PTE mapped at 255GiB VA for ccs
>     clear.
>   drm/xe/xe2: Handle flat ccs move for igfx.
> 
>  drivers/gpu/drm/xe/regs/xe_gpu_commands.h |   3 +-
>  drivers/gpu/drm/xe/regs/xe_gt_regs.h      |   3 +
>  drivers/gpu/drm/xe/xe_bo.c                |  40 ++++---
>  drivers/gpu/drm/xe/xe_bo_types.h          |   2 +
>  drivers/gpu/drm/xe/xe_device.c            |  32 ++++-
>  drivers/gpu/drm/xe/xe_migrate.c           | 139 +++++++++-----------
> --
>  drivers/gpu/drm/xe/xe_pci.c               |   2 +-
>  7 files changed, 119 insertions(+), 102 deletions(-)
> 

I forgot to mention, in xe/tests/xe_bo.c there is a kunit test that
exercises basic clearing and migration with a CCS-metadata-enabled bo.
IMO we should update that to make sure it runs with the new code as
well.

Thanks,
Thomas


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL.
  2023-11-28 13:36 ` [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL Thomas Hellström
@ 2023-11-29  4:49   ` Ghimiray, Himal Prasad
  0 siblings, 0 replies; 28+ messages in thread
From: Ghimiray, Himal Prasad @ 2023-11-29  4:49 UTC (permalink / raw)
  To: Thomas Hellström, intel-xe



> -----Original Message-----
> From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Sent: 28 November 2023 19:07
> To: Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; intel-
> xe@lists.freedesktop.org
> Subject: Re: [RFC v2 0/6] Enable compression handling on LNL.
> 
> Hi!
> 
> On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
> > Series enables flat ccs on xe2 igfx platforms and handles clearing and
> > copy of ccs metadata during bo creation and xe_bo_move respectively.
> >
> > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Signed-off-by: Himal Prasad Ghimiray
> > <himal.prasad.ghimiray@intel.com>
> >
> > Himal Prasad Ghimiray (6):
> >   drm/xe/xe2: Support flat ccs.
> >   drm/xe/xe2: Determine bios enablement for flat ccs on igfx.
> >   drm/xe/xe2: Allocate extra pages for ccs during bo create.
> >   drm/xe/xe2: Updates on XY_CTRL_SURF_COPY_BLT
> >   drm/xe/xe_migrate.c: Use NULL 1G PTE mapped at 255GiB VA for ccs
> >     clear.
> >   drm/xe/xe2: Handle flat ccs move for igfx.
> >
> >  drivers/gpu/drm/xe/regs/xe_gpu_commands.h |   3 +-
> >  drivers/gpu/drm/xe/regs/xe_gt_regs.h      |   3 +
> >  drivers/gpu/drm/xe/xe_bo.c                |  40 ++++---
> >  drivers/gpu/drm/xe/xe_bo_types.h          |   2 +
> >  drivers/gpu/drm/xe/xe_device.c            |  32 ++++-
> >  drivers/gpu/drm/xe/xe_migrate.c           | 139 +++++++++-----------
> > --
> >  drivers/gpu/drm/xe/xe_pci.c               |   2 +-
> >  7 files changed, 119 insertions(+), 102 deletions(-)
> >
> 
> I forgot to mention, in xe/tests/xe_bo.c there is a kunit test that exercises
> basic clearing and migration with a CCS-metadata-enabled bo.
> IMO we should update that to make sure it runs with the new code as well.


Sure.
> 
> Thanks,
> Thomas


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Intel-xe] [RFC v2 3/6] drm/xe/xe2: Allocate extra pages for ccs during bo create.
  2023-11-27  9:41       ` Thomas Hellström
@ 2023-11-29  4:53         ` Ghimiray, Himal Prasad
  0 siblings, 0 replies; 28+ messages in thread
From: Ghimiray, Himal Prasad @ 2023-11-29  4:53 UTC (permalink / raw)
  To: Thomas Hellström, intel-xe

[-- Attachment #1: Type: text/plain, Size: 4370 bytes --]


On 27-11-2023 15:11, Thomas Hellström wrote:
>
>
> On 11/27/23 04:19, Ghimiray, Himal Prasad wrote:
>>
>>
>> On 24-11-2023 16:39, Thomas Hellström wrote:
>>> On Tue, 2023-11-21 at 15:39 +0530, Himal Prasad Ghimiray wrote:
>>>> Each byte of CCS data now represents 512 bytes of main memory data.
>>>> Allocate extra pages to handle ccs region for igfx too.
>>>>
>>>> Bspec:58796
>>>>
>>>> Cc: Thomas Hellström<thomas.hellstrom@linux.intel.com>
>>>> Signed-off-by: Himal Prasad Ghimiray
>>>> <himal.prasad.ghimiray@intel.com>
>>>> ---
>>>>   drivers/gpu/drm/xe/regs/xe_gpu_commands.h |  2 +-
>>>>   drivers/gpu/drm/xe/xe_bo.c                | 15 ++++++---------
>>>>   drivers/gpu/drm/xe/xe_device.c            |  2 +-
>>>>   3 files changed, 8 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>>>> b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>>>> index 4402f72481dc..7f74592f99ce 100644
>>>> --- a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>>>> +++ b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
>>>> @@ -16,7 +16,7 @@
>>>>   #define   XY_CTRL_SURF_MOCS_MASK       GENMASK(31, 26)
>>>>   #define   XE2_XY_CTRL_SURF_MOCS_INDEX_MASK     GENMASK(31, 28)
>>>>   #define   NUM_CCS_BYTES_PER_BLOCK      256
>>>> -#define   NUM_BYTES_PER_CCS_BYTE       256
>>>> +#define   NUM_BYTES_PER_CCS_BYTE(_xe)  (GRAPHICS_VER(_xe) >= 20 ?
>>>> 512 : 256)
>>>>   #define   NUM_CCS_BLKS_PER_XFER                1024
>>>>   
>>>>   #define XY_FAST_COLOR_BLT_CMD          (2 << 29 | 0x44 << 22)
>>>> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
>>>> index 4305f5cbc2ab..4730ee3c1012 100644
>>>> --- a/drivers/gpu/drm/xe/xe_bo.c
>>>> +++ b/drivers/gpu/drm/xe/xe_bo.c
>>>> @@ -2074,19 +2074,16 @@ int xe_bo_evict(struct xe_bo *bo, bool
>>>> force_alloc)
>>>>    * placed in system memory.
>>>>    * @bo: The xe_bo
>>>>    *
>>>> - * If a bo has an allowable placement in XE_PL_TT memory, it can't
>>>> use
>>>> - * flat CCS compression, because the GPU then has no way to access
>>>> the
>>>> - * CCS metadata using relevant commands. For the opposite case, we
>>>> need to
>>>> - * allocate storage for the CCS metadata when the BO is not resident
>>>> in
>>>> - * VRAM memory.
>>> Please extend modify this comment rather than deleting it
>> Sure. Will address in next patch.
>>>> - *
>>>>    * Return: true if extra pages need to be allocated, false
>>>> otherwise.
>>>>    */
>>>>   bool xe_bo_needs_ccs_pages(struct xe_bo *bo)
>>>>   {
>>>> -       return bo->ttm.type == ttm_bo_type_device &&
>>>> -               !(bo->flags & XE_BO_CREATE_SYSTEM_BIT) &&
>>>> -               (bo->flags & XE_BO_CREATE_VRAM_MASK);
>>>> +       struct xe_device *xe = xe_bo_device(bo);
>>>> +
>>>> +       return (xe_device_has_flat_ccs(xe) &&
>>>> +               bo->ttm.type == ttm_bo_type_device &&
>>>> +               ((IS_DGFX(xe) && (bo->flags &
>>>> XE_BO_CREATE_VRAM_MASK)) ||
>>> It looks like you have removed a restriction for DGFX here: If the BO
>>> has SYSTEM set, then ccs pages are not needed.
>>
>> Is ((IS_DGFX(xe) && (bo->flags & XE_BO_CREATE_VRAM_MASK)) not suffice 
>> to ensure
>>
>> that for DGFX BO is only in VRAM region ?
>>
> No. It can also has the the CREATE_SYSTEM_BIT set, and then the 
> content can't be compressed.
>
> /Thomas
>
Thanks for the clarification. Will address this is next version.

BR

Himal

>
>
>>>> +               (!IS_DGFX(xe) && (bo->flags &
>>>> XE_BO_CREATE_SYSTEM_BIT))));
>>>>   }
>>>>   
>>>>   /**
>>>> diff --git a/drivers/gpu/drm/xe/xe_device.c
>>>> b/drivers/gpu/drm/xe/xe_device.c
>>>> index 07a3e4cf48d1..265f9ffc5323 100644
>>>> --- a/drivers/gpu/drm/xe/xe_device.c
>>>> +++ b/drivers/gpu/drm/xe/xe_device.c
>>>> @@ -551,7 +551,7 @@ void xe_device_wmb(struct xe_device *xe)
>>>>   u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size)
>>>>   {
>>>>          return xe_device_has_flat_ccs(xe) ?
>>>> -               DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE) : 0;
>>>> +               DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE(xe)) : 0;
>>>>   }
>>>>   
>>>>   bool xe_device_mem_access_ongoing(struct xe_device *xe)

[-- Attachment #2: Type: text/html, Size: 7293 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2023-11-29  4:54 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-21 10:09 [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL Himal Prasad Ghimiray
2023-11-21 10:09 ` [Intel-xe] [RFC v2 1/6] drm/xe/xe2: Support flat ccs Himal Prasad Ghimiray
2023-11-23 16:28   ` Matthew Auld
2023-11-27  3:04     ` Ghimiray, Himal Prasad
2023-11-24 11:02   ` Thomas Hellström
2023-11-27  3:05     ` Ghimiray, Himal Prasad
2023-11-21 10:09 ` [Intel-xe] [RFC v2 2/6] drm/xe/xe2: Determine bios enablement for flat ccs on igfx Himal Prasad Ghimiray
2023-11-23 16:37   ` Matthew Auld
2023-11-23 17:01     ` Matthew Auld
2023-11-27  3:11       ` Ghimiray, Himal Prasad
2023-11-24 11:05   ` Thomas Hellström
2023-11-27  3:12     ` Ghimiray, Himal Prasad
2023-11-21 10:09 ` [Intel-xe] [RFC v2 3/6] drm/xe/xe2: Allocate extra pages for ccs during bo create Himal Prasad Ghimiray
2023-11-24 11:09   ` Thomas Hellström
2023-11-27  3:19     ` Ghimiray, Himal Prasad
2023-11-27  9:41       ` Thomas Hellström
2023-11-29  4:53         ` Ghimiray, Himal Prasad
2023-11-21 10:09 ` [Intel-xe] [RFC v2 4/6] drm/xe/xe2: Updates on XY_CTRL_SURF_COPY_BLT Himal Prasad Ghimiray
2023-11-24 11:11   ` Thomas Hellström
2023-11-27  3:21     ` Ghimiray, Himal Prasad
2023-11-21 10:09 ` [Intel-xe] [RFC v2 5/6] drm/xe/xe_migrate.c: Use NULL 1G PTE mapped at 255GiB VA for ccs clear Himal Prasad Ghimiray
2023-11-24 11:19   ` Thomas Hellström
2023-11-27  3:23     ` Ghimiray, Himal Prasad
2023-11-21 10:09 ` [Intel-xe] [RFC v2 6/6] drm/xe/xe2: Handle flat ccs move for igfx Himal Prasad Ghimiray
2023-11-24 15:48   ` Thomas Hellström
2023-11-27  3:25     ` Ghimiray, Himal Prasad
2023-11-28 13:36 ` [Intel-xe] [RFC v2 0/6] Enable compression handling on LNL Thomas Hellström
2023-11-29  4:49   ` Ghimiray, Himal Prasad

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.