Re: [PATCH v3] igt/gem_trtt: Exercise the TRTT hardware

From: "Goel, Akash" <akash.goel@intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Cc: akash.goel@intel.com
Subject: Re: [PATCH v3] igt/gem_trtt: Exercise the TRTT hardware
Date: Thu, 3 Mar 2016 21:08:25 +0530	[thread overview]
Message-ID: <56D85A71.3000409@intel.com> (raw)
In-Reply-To: <20160303100403.GQ30782@nuc-i3427.alporthouse.com>

Thanks for the review.

On 3/3/2016 3:34 PM, Chris Wilson wrote:
> On Thu, Mar 03, 2016 at 10:25:59AM +0530, akash.goel@intel.com wrote:
>> +static bool uses_full_ppgtt(int fd, int min)
> gem_gtt_type()
fine will change like this,
	gem_gtt_type(fd) > 2

>
>> +static bool has_softpin_support(int fd)
>
> gem_has_softpin()

fine will use gem_has_softpin()
>
>> +static int emit_store_dword(int fd, uint32_t *cmd_buf, uint32_t dw_offset,
>> +			    uint64_t vaddr, uint32_t data)
>> +{
>> +	/* Check that softpin addresses are in the correct form */
>> +	assert(vaddr == gen8_canonical_addr(vaddr));
>> +
>> +	/* SDI cannot write to unaligned addresses */
>> +	assert((vaddr & 3) == 0);
>> +
>> +	cmd_buf[dw_offset++] = MI_STORE_DWORD_IMM;
>> +	cmd_buf[dw_offset++] = vaddr & 0xFFFFFFFC;
>> +	cmd_buf[dw_offset++] = (vaddr >> 32) & 0xFFFF; /* bits 32:47 */
>> +	cmd_buf[dw_offset++] = data;
>> +
>> +	return dw_offset;
>> +}
>> +
>> +/* emit_store_qword
>> + * populate batch buffer with MI_STORE_DWORD_IMM command
>> + * @fd: drm file descriptor
>> + * @cmd_buf: batch buffer
>> + * @dw_offset: write offset in batch buffer
>> + * @vaddr: destination Virtual address
>> + * @data: u64 data to be stored at destination
>> + */
>> +static int emit_store_qword(int fd, uint32_t *cmd_buf, uint32_t dw_offset,
>> +			    uint64_t vaddr, uint64_t data)
>> +{
>> +	/* Check that softpin addresses are in the correct form */
>> +	assert(vaddr == gen8_canonical_addr(vaddr));
>> +
>> +	/* SDI cannot write to unaligned addresses */
>> +	assert((vaddr & 3) == 0);
>> +
>> +	cmd_buf[dw_offset++] = MI_STORE_DWORD_IMM | 0x3;
>> +	cmd_buf[dw_offset++] = vaddr & 0xFFFFFFFC;
>
> You just asserted above that the low bits aren't set.
>
>> +	cmd_buf[dw_offset++] = (vaddr >> 32) & 0xFFFF; /* bits 32:47 */
>
> This conflicts with the value that would be set by the kernel (if it had
> to the relocation). Who is correct? If not required, please don't raise
> the spectre of devastating bugs.
>
Sorry will modify like this,
	cmd_buf[dw_offset++] = (uint32_t)vaddr;
	cmd_buf[dw_offset++] = (uint32_t)(vaddr >> 32);

>> +	cmd_buf[dw_offset++] = data;
>> +	cmd_buf[dw_offset++] = data >> 32;
>> +
>> +	return dw_offset;
>> +}
>> +
>> +/* emit_bb_end
>> + * populate batch buffer with MI_BATCH_BUFFER_END command
>> + * @fd: drm file descriptor
>> + * @cmd_buf: batch buffer
>> + * @dw_offset: write offset in batch buffer
>> + */
>> +static int emit_bb_end(int fd, uint32_t *cmd_buf, uint32_t dw_offset)
>> +{
>> +	cmd_buf[dw_offset++] = MI_BATCH_BUFFER_END;
>> +	cmd_buf[dw_offset++] = 0;
>
> Why? execbuf.batch_len must be aligned, but the CS parser doesn't care,
> and there is no guarantee that you are aligned coming into
> emit_bb_end().
>
Sorry, will change like this,
	dw_offset = ALIGN(dw_offset, 2);
	cmd_buf[dw_offset++] = MI_BATCH_BUFFER_END;
	dw_offset++;

>> +
>> +	return dw_offset;
>> +}
>> +
>> +/* setup_execbuffer
>> + * helper for buffer execution
>> + * @execbuf - pointer to execbuffer
>> + * @exec_object - pointer to exec object2 struct
>> + * @ring - ring to be used
>> + * @buffer_count - how manu buffers to submit
>> + * @batch_length - length of batch buffer
>> + */
>> +static void setup_execbuffer(struct drm_i915_gem_execbuffer2 *execbuf,
>> +			     struct drm_i915_gem_exec_object2 *exec_object,
>> +			     uint32_t ctx_id, int ring, int buffer_count, int batch_length)
>> +{
>> +	memset(execbuf, 0, sizeof(*execbuf));
>> +
>> +	execbuf->buffers_ptr = (unsigned long)exec_object;
>> +	execbuf->buffer_count = buffer_count;
>> +	execbuf->batch_len = batch_length;
>> +	execbuf->flags = ring;
>> +	i915_execbuffer2_set_context_id(*execbuf, ctx_id);
>> +}
>> +
>> +#define TABLE_SIZE 0x1000
>> +#define TILE_SIZE 0x10000
>> +
>> +#define TRTT_SEGMENT_SIZE (1ULL << 44)
>> +#define PPGTT_SIZE (1ULL << 48)
>> +
>> +#define NULL_TILE_PATTERN    0xFFFFFFFF
>> +#define INVALID_TILE_PATTERN 0xFFFFFFFE
>> +
>> +struct local_i915_gem_context_trtt_param {
>> +	uint64_t segment_base_addr;
>> +	uint64_t l3_table_address;
>> +	uint32_t invd_tile_val;
>> +	uint32_t null_tile_val;
>> +};
>> +
>> +/* setup_trtt
>> + * Helper function to request KMD to enable TRTT
>> + * @fd - drm fd
>> + * @ctx_id - id of the context for which TRTT is to be enabled
>> + * @l3_table_address - GFX address of the L3 table
>> + * @segment_base_addr - offset of the TRTT segment in PPGTT space
>> + */
>> +static void
>> +setup_trtt(int fd, uint32_t ctx_id, uint64_t l3_table_address,
>> +	   uint64_t segment_base_addr)
>> +{
>> +	struct local_i915_gem_context_param ctx_param;
>> +	struct local_i915_gem_context_trtt_param trtt_param;
>> +
>> +	memset(&ctx_param, 0, sizeof(ctx_param));
>> +
>> +	trtt_param.null_tile_val = NULL_TILE_PATTERN;
>> +	trtt_param.invd_tile_val = INVALID_TILE_PATTERN;
>> +	trtt_param.l3_table_address = l3_table_address;
>> +	trtt_param.segment_base_addr = segment_base_addr;
>> +
>> +	ctx_param.context = ctx_id;
>> +	ctx_param.size = sizeof(trtt_param);
>> +	ctx_param.param = LOCAL_CONTEXT_PARAM_TRTT;
>> +	ctx_param.value = (uint64_t)&trtt_param;
>> +
>> +	gem_context_set_param(fd, &ctx_param);
>> +}
>> +
>> +/* bo_alloc_setup
>> + * allocate bo and populate exec object
>> + * @exec_object2 - pointer to exec object
>> + * @bo_sizee - buffer size
>> + * @flags - exec flags
>> + * @bo_offset - pointer to the current PPGTT offset
>> + */
>> +static void bo_alloc_setup(int fd, struct drm_i915_gem_exec_object2 *exec_object2,
>> +			   uint64_t bo_size, uint64_t flags, uint64_t *bo_offset)
>> +{
>> +	memset(exec_object2, 0, sizeof(*exec_object2));
>> +	exec_object2->handle = gem_create(fd, bo_size);
>> +	exec_object2->flags = flags;
>> +
>> +	if (bo_offset)
>> +	{
>> +		exec_object2->offset = *bo_offset;
>> +		*bo_offset += bo_size;
>> +	}
>> +}
>> +
>> +/* submit_trtt_context
>> + * This helper function will create a new context if the TR-TT segment
>> + * base address is not zero, allocate a L3 table page, 2 pages apiece
>> + * for L2/L1 tables and couple of data buffers of 64KB in size, matching the
>> + * Tile size. The 2 data buffers will be mapped to the 2 ends of TRTT virtual
>> + * space. Series of MI_STORE_DWORD_IMM commands will be added in the batch
>> + * buffer to first update the TR-TT table entries and then to update the data
>> + * buffers using their TR-TT VA, exercising the table programming done
>> + * previously.
>> + * Invoke CONTEXT_SETPARAM ioctl to request KMD to enable TRTT.
>> + * Invoke execbuffer to submit the batch buffer.
>> + * Verify value of first DWORD in the 2 data buffer matches the data asked
>> + * to be written by the GPU.
>> + */
>> +static void submit_trtt_context(int fd, uint64_t segment_base_addr)
>> +{
>> +	enum {
>> +		L3_TBL,
>> +		L2_TBL1,
>> +		L2_TBL2,
>> +		L1_TBL1,
>> +		L1_TBL2,
>> +		DATA1,
>> +		DATA2,
>> +		BATCH,
>> +		NUM_BUFFERS,
>> +	};
>> +
>> +	int ring, len = 0;
>> +	uint32_t *ptr;
>> +	struct drm_i915_gem_execbuffer2 execbuf;
>> +	struct drm_i915_gem_exec_object2 exec_object2[NUM_BUFFERS];
>> +	uint32_t batch_buffer[BO_SIZE];
>> +	uint32_t ctx_id, data, last_entry_offset;
>> +	uint64_t cur_ppgtt_off, exec_flags;
>> +	uint64_t first_tile_addr, last_tile_addr;
>> +
>> +	first_tile_addr = segment_base_addr;
>> +	last_tile_addr  = first_tile_addr + TRTT_SEGMENT_SIZE - TILE_SIZE;
>> +
>> +	if (segment_base_addr == 0) {
>> +		/* Use the default context for first iteration */
>> +		ctx_id = 0;
>
> Seems like a variable the callee wants to control. (Otherwise you are
> missing a significant test for non-default contexts). It also implies
> that we don't test what happens when we call set-trtt-context twice on a
> context.
>

As per the current logic, the callee will use non-default context for 
the non zero TR-TT segment start location.

Should a new subtest be added to check the case when set-trtt-context is 
called twice, which is expected to fail.

>> +		/* To avoid conflict with the TR-TT segment */
>> +		cur_ppgtt_off = TRTT_SEGMENT_SIZE;
>> +	} else {
>> +		/* Create a new context to have different TRTT settings
>> +		 * on every iteration.
>> +		 */
>> +		ctx_id = gem_context_create(fd);
>> +		cur_ppgtt_off = 0;
>> +	}
>> +
>> +	exec_flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
>> +
>> +	/* first allocate Batch buffer BO */
>> +	bo_alloc_setup(fd, &exec_object2[BATCH], BO_SIZE, exec_flags, NULL);
>> +
>> +	/* table BOs and data buffer BOs are written by GPU and are soft pinned */
>> +	exec_flags |= (EXEC_OBJECT_WRITE | EXEC_OBJECT_PINNED);
>> +
>> +	/* Allocate a L3 table BO */
>> +	bo_alloc_setup(fd, &exec_object2[L3_TBL], TABLE_SIZE, exec_flags, &cur_ppgtt_off);
>> +
>> +	/* Allocate two L2 table BOs */
>> +	bo_alloc_setup(fd, &exec_object2[L2_TBL1], TABLE_SIZE, exec_flags, &cur_ppgtt_off);
>> +	bo_alloc_setup(fd, &exec_object2[L2_TBL2], TABLE_SIZE, exec_flags, &cur_ppgtt_off);
>> +
>> +	/* Allocate two L1 table BOs */
>> +	bo_alloc_setup(fd, &exec_object2[L1_TBL1], TABLE_SIZE, exec_flags, &cur_ppgtt_off);
>> +	bo_alloc_setup(fd, &exec_object2[L1_TBL2], TABLE_SIZE, exec_flags, &cur_ppgtt_off);
>> +
>> +	/* Align the PPGTT offsets for the 2 data buffers to next 64 KB boundary */
>> +	cur_ppgtt_off = ALIGN(cur_ppgtt_off, TILE_SIZE);
>> +
>> +	/* Allocate two Data buffer BOs */
>> +	bo_alloc_setup(fd, &exec_object2[DATA1], TILE_SIZE, exec_flags, &cur_ppgtt_off);
>> +	bo_alloc_setup(fd, &exec_object2[DATA2], TILE_SIZE, exec_flags, &cur_ppgtt_off);
>> +
>> +	/* Add commands to update the two L3 table entries to point them to the L2 tables*/
>> +	last_entry_offset = 511*sizeof(uint64_t);
>> +
>> +	len = emit_store_qword(fd, batch_buffer, len,
>> +			       exec_object2[L3_TBL].offset,
>> +			       exec_object2[L2_TBL1].offset);
>> +
>> +	len = emit_store_qword(fd, batch_buffer, len,
>> +			       exec_object2[L3_TBL].offset + last_entry_offset,
>> +			       exec_object2[L2_TBL2].offset);
>> +
>> +	/* Add commands to update an entry of 2 L2 tables to point them to the L1 tables*/
>> +	len = emit_store_qword(fd, batch_buffer, len,
>> +			       exec_object2[L2_TBL1].offset,
>> +			       exec_object2[L1_TBL1].offset);
>> +
>> +	len = emit_store_qword(fd, batch_buffer, len,
>> +			       exec_object2[L2_TBL2].offset + last_entry_offset,
>> +			       exec_object2[L1_TBL2].offset);
>> +
>> +	/* Add commands to update an entry of 2 L1 tables to point them to the data buffers*/
>> +	last_entry_offset = 1023*sizeof(uint32_t);
>> +
>> +	len = emit_store_dword(fd, batch_buffer, len,
>> +			       exec_object2[L1_TBL1].offset,
>> +			       exec_object2[DATA1].offset >> 16);
>> +
>> +	len = emit_store_dword(fd, batch_buffer, len,
>> +			       exec_object2[L1_TBL2].offset + last_entry_offset,
>> +			       exec_object2[DATA2].offset >> 16);
>> +
>> +	/* Add commands to update the 2 data buffers, using their TRTT VA */
>> +	data = 0x12345678;
>> +	len = emit_store_dword(fd, batch_buffer, len, first_tile_addr, data);
>> +	len = emit_store_dword(fd, batch_buffer, len, last_tile_addr, data);
>> +
>> +	len = emit_bb_end(fd, batch_buffer, len);
>> +	gem_write(fd, exec_object2[BATCH].handle, 0, batch_buffer, len*4);
>> +
>> +	/* Request KMD to setup the TR-TT */
>> +	setup_trtt(fd, ctx_id, exec_object2[L3_TBL].offset, first_tile_addr);
>> +
>> +	ring = I915_EXEC_RENDER;
>> +	setup_execbuffer(&execbuf, exec_object2, ctx_id, ring, NUM_BUFFERS, len*4);
>> +
>> +	/* submit command buffer */
>> +	gem_execbuf(fd, &execbuf);
>> +
>> +	/* read the 2 data buffers to check for the value written by the GPU */
>> +	ptr = mmap_bo(fd, exec_object2[DATA1].handle, TILE_SIZE);
>> +	igt_fail_on_f(ptr[0] != data,
>> +		"\nCPU read does not match GPU write,\
>> +		expected: 0x%x, got: 0x%x\n",
>> +		data, ptr[0]);
>
> Just igt_assert_eq_u32(ptr[0], expected);

Fine will change.
>
>> +
>> +	ptr = mmap_bo(fd, exec_object2[DATA2].handle, TILE_SIZE);
>> +	igt_fail_on_f(ptr[0] != data,
>> +		"\nCPU read does not match GPU write,\
>> +		expected: 0x%x, got: 0x%x\n",
>> +		data, ptr[0]);
>> +
>> +	gem_close(fd, exec_object2[L3_TBL].handle);
>> +	gem_close(fd, exec_object2[L2_TBL1].handle);
>> +	gem_close(fd, exec_object2[L2_TBL2].handle);
>> +	gem_close(fd, exec_object2[L1_TBL1].handle);
>> +	gem_close(fd, exec_object2[L1_TBL2].handle);
>> +	gem_close(fd, exec_object2[DATA1].handle);
>> +	gem_close(fd, exec_object2[DATA2].handle);
>> +	gem_close(fd, exec_object2[BATCH].handle);
>
> Before we destroy the context (or exit), how about a query_trtt().
> We should also query after create to ensure that the defaults are set.
> Just thinking that is better doing the query after several steps (i.e.
> the execbuf) rather than immediately after the set in order to give time
> for something to go wrong. We should also ensure that everything remains
> set after a GPU hang and suspend/resume.

Nice suggestion, currently get-trtt-context will just retrieve the TR-TT 
params stored with a Driver for a given context.
Should the Context image itself be read to ensure that settings are 
intact or not ?

Best regards
Akash
> -Chris
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx