All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: simplify padding calculations
@ 2019-10-24 23:44 ` Tuikov, Luben
  0 siblings, 0 replies; 16+ messages in thread
From: Tuikov, Luben @ 2019-10-24 23:44 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Tuikov, Luben, Koenig, Christian

Simplify padding calculations.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 14 +++++++++-----
 5 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
index c45304f1047c..1ea9e18d6f08 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
@@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
 	u32 extra_bits = vmid & 0xf;
 
 	/* IB packet must end on a 8 DW boundary */
-	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
 	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
@@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
index a10175838013..d340f179401a 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
@@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 5f4e2c616241..5c3c310188b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 45bd538ba97e..7c71c88e38a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 0c41b4fdc58b..67ede9e4df01 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -382,8 +382,12 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
 
-	/* IB packet must end on a 8 DW boundary */
-	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	/* An IB packet must end on a 8 DW boundary--the next dword must
+	 * be on a 8-dword boundary. Our IB below is 6 dwords long,
+	 * thus add x number of NOPs, such that
+	 * wptr + 6 + x = 8k, k >= 0.
+	 */
+	sdma_v5_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1086,10 +1090,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
 }
 
 /**
- * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
- *
+ * sdma_v5_0_ring_pad_ib - pad the IB
  * @ib: indirect buffer to fill with padding
  *
+ * Pad the IB with NOPs to a boundary multiple of 8.
  */
 static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 {
@@ -1097,7 +1101,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 0x7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
-- 
2.23.0.385.gbc12974a89

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH] drm/amdgpu: simplify padding calculations
@ 2019-10-24 23:44 ` Tuikov, Luben
  0 siblings, 0 replies; 16+ messages in thread
From: Tuikov, Luben @ 2019-10-24 23:44 UTC (permalink / raw)
  To: amd-gfx; +Cc: Deucher, Alexander, Tuikov, Luben, Koenig, Christian

Simplify padding calculations.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 14 +++++++++-----
 5 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
index c45304f1047c..1ea9e18d6f08 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
@@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
 	u32 extra_bits = vmid & 0xf;
 
 	/* IB packet must end on a 8 DW boundary */
-	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
 	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
@@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
index a10175838013..d340f179401a 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
@@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 5f4e2c616241..5c3c310188b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 45bd538ba97e..7c71c88e38a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 0c41b4fdc58b..67ede9e4df01 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -382,8 +382,12 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
 
-	/* IB packet must end on a 8 DW boundary */
-	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	/* An IB packet must end on a 8 DW boundary--the next dword must
+	 * be on a 8-dword boundary. Our IB below is 6 dwords long,
+	 * thus add x number of NOPs, such that
+	 * wptr + 6 + x = 8k, k >= 0.
+	 */
+	sdma_v5_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1086,10 +1090,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
 }
 
 /**
- * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
- *
+ * sdma_v5_0_ring_pad_ib - pad the IB
  * @ib: indirect buffer to fill with padding
  *
+ * Pad the IB with NOPs to a boundary multiple of 8.
  */
 static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 {
@@ -1097,7 +1101,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 0x7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
-- 
2.23.0.385.gbc12974a89

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: simplify padding calculations
@ 2019-10-25  6:54     ` Koenig, Christian
  0 siblings, 0 replies; 16+ messages in thread
From: Koenig, Christian @ 2019-10-25  6:54 UTC (permalink / raw)
  To: Tuikov, Luben, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander

Am 25.10.19 um 01:44 schrieb Tuikov, Luben:
> Simplify padding calculations.
>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 14 +++++++++-----
>   5 files changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> index c45304f1047c..1ea9e18d6f08 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
>   	u32 extra_bits = vmid & 0xf;
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
>   	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
> @@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> index a10175838013..d340f179401a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> index 5f4e2c616241..5c3c310188b6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index 45bd538ba97e..7c71c88e38a4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index 0c41b4fdc58b..67ede9e4df01 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -382,8 +382,12 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
>   
> -	/* IB packet must end on a 8 DW boundary */
> -	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	/* An IB packet must end on a 8 DW boundary--the next dword must
> +	 * be on a 8-dword boundary. Our IB below is 6 dwords long,

That should probably read "our IB command" or "our IB packet".

> +	 * thus add x number of NOPs, such that
> +	 * wptr + 6 + x = 8k, k >= 0.

Maybe write (wrptr + 6 + x) % 8 = 0 to make that more obvious.

> +	 */
> +	sdma_v5_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);

Spaces between operators please.

Apart form that looks good to me,
Christian.

>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -1086,10 +1090,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
>   }
>   
>   /**
> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
> - *
> + * sdma_v5_0_ring_pad_ib - pad the IB
>    * @ib: indirect buffer to fill with padding
>    *
> + * Pad the IB with NOPs to a boundary multiple of 8.
>    */
>   static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>   {
> @@ -1097,7 +1101,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 0x7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: simplify padding calculations
@ 2019-10-25  6:54     ` Koenig, Christian
  0 siblings, 0 replies; 16+ messages in thread
From: Koenig, Christian @ 2019-10-25  6:54 UTC (permalink / raw)
  To: Tuikov, Luben, amd-gfx; +Cc: Deucher, Alexander

Am 25.10.19 um 01:44 schrieb Tuikov, Luben:
> Simplify padding calculations.
>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 14 +++++++++-----
>   5 files changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> index c45304f1047c..1ea9e18d6f08 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
>   	u32 extra_bits = vmid & 0xf;
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
>   	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
> @@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> index a10175838013..d340f179401a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> index 5f4e2c616241..5c3c310188b6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index 45bd538ba97e..7c71c88e38a4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index 0c41b4fdc58b..67ede9e4df01 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -382,8 +382,12 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
>   
> -	/* IB packet must end on a 8 DW boundary */
> -	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	/* An IB packet must end on a 8 DW boundary--the next dword must
> +	 * be on a 8-dword boundary. Our IB below is 6 dwords long,

That should probably read "our IB command" or "our IB packet".

> +	 * thus add x number of NOPs, such that
> +	 * wptr + 6 + x = 8k, k >= 0.

Maybe write (wrptr + 6 + x) % 8 = 0 to make that more obvious.

> +	 */
> +	sdma_v5_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);

Spaces between operators please.

Apart form that looks good to me,
Christian.

>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -1086,10 +1090,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
>   }
>   
>   /**
> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
> - *
> + * sdma_v5_0_ring_pad_ib - pad the IB
>    * @ib: indirect buffer to fill with padding
>    *
> + * Pad the IB with NOPs to a boundary multiple of 8.
>    */
>   static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>   {
> @@ -1097,7 +1101,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 0x7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: simplify padding calculations
@ 2019-10-25 21:51         ` Tuikov, Luben
  0 siblings, 0 replies; 16+ messages in thread
From: Tuikov, Luben @ 2019-10-25 21:51 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander

On 2019-10-25 2:54 a.m., Koenig, Christian wrote:
> Am 25.10.19 um 01:44 schrieb Tuikov, Luben:
>> Simplify padding calculations.
>>
>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
>>   drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
>>   drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
>>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
>>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 14 +++++++++-----
>>   5 files changed, 17 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> index c45304f1047c..1ea9e18d6f08 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
>>   	u32 extra_bits = vmid & 0xf;
>>   
>>   	/* IB packet must end on a 8 DW boundary */
>> -	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
>> +	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
>>   
>>   	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
>>   	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
>> @@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>   	u32 pad_count;
>>   	int i;
>>   
>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>> +	pad_count = (-ib->length_dw) & 7;
>>   	for (i = 0; i < pad_count; i++)
>>   		if (sdma && sdma->burst_nop && (i == 0))
>>   			ib->ptr[ib->length_dw++] =
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> index a10175838013..d340f179401a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
>>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>   
>>   	/* IB packet must end on a 8 DW boundary */
>> -	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>> +	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>   
>>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>> @@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>   	u32 pad_count;
>>   	int i;
>>   
>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>> +	pad_count = (-ib->length_dw) & 7;
>>   	for (i = 0; i < pad_count; i++)
>>   		if (sdma && sdma->burst_nop && (i == 0))
>>   			ib->ptr[ib->length_dw++] =
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> index 5f4e2c616241..5c3c310188b6 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
>>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>   
>>   	/* IB packet must end on a 8 DW boundary */
>> -	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>> +	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>   
>>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>> @@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>   	u32 pad_count;
>>   	int i;
>>   
>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>> +	pad_count = (-ib->length_dw) & 7;
>>   	for (i = 0; i < pad_count; i++)
>>   		if (sdma && sdma->burst_nop && (i == 0))
>>   			ib->ptr[ib->length_dw++] =
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> index 45bd538ba97e..7c71c88e38a4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
>>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>   
>>   	/* IB packet must end on a 8 DW boundary */
>> -	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>> +	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>   
>>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>> @@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>   	u32 pad_count;
>>   	int i;
>>   
>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>> +	pad_count = (-ib->length_dw) & 7;
>>   	for (i = 0; i < pad_count; i++)
>>   		if (sdma && sdma->burst_nop && (i == 0))
>>   			ib->ptr[ib->length_dw++] =
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> index 0c41b4fdc58b..67ede9e4df01 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> @@ -382,8 +382,12 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
>>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>   	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
>>   
>> -	/* IB packet must end on a 8 DW boundary */
>> -	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>> +	/* An IB packet must end on a 8 DW boundary--the next dword must
>> +	 * be on a 8-dword boundary. Our IB below is 6 dwords long,
> 
> That should probably read "our IB command" or "our IB packet".

Done.

> 
>> +	 * thus add x number of NOPs, such that
>> +	 * wptr + 6 + x = 8k, k >= 0.
> 
> Maybe write (wrptr + 6 + x) % 8 = 0 to make that more obvious.

That's less obvious. In my mind I always translate it to
a congruence expression. It's what I used to derive the algebraic
simplifications of this patch.

I'll add the highly cryptic "(wrptr + 6 + x) % 8 = 0" to make you happy,
below the algebraic expression already there.

> 
>> +	 */
>> +	sdma_v5_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
> 
> Spaces between operators please.

Yeah, I wanted to put an emphasis on the AND-op, but I'll add them.

Regards,
Luben


> 
> Apart form that looks good to me,
> Christian.
> 
>>   
>>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>> @@ -1086,10 +1090,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
>>   }
>>   
>>   /**
>> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
>> - *
>> + * sdma_v5_0_ring_pad_ib - pad the IB
>>    * @ib: indirect buffer to fill with padding
>>    *
>> + * Pad the IB with NOPs to a boundary multiple of 8.
>>    */
>>   static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>   {
>> @@ -1097,7 +1101,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>   	u32 pad_count;
>>   	int i;
>>   
>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>> +	pad_count = (-ib->length_dw) & 0x7;
>>   	for (i = 0; i < pad_count; i++)
>>   		if (sdma && sdma->burst_nop && (i == 0))
>>   			ib->ptr[ib->length_dw++] =
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: simplify padding calculations
@ 2019-10-25 21:51         ` Tuikov, Luben
  0 siblings, 0 replies; 16+ messages in thread
From: Tuikov, Luben @ 2019-10-25 21:51 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx; +Cc: Deucher, Alexander

On 2019-10-25 2:54 a.m., Koenig, Christian wrote:
> Am 25.10.19 um 01:44 schrieb Tuikov, Luben:
>> Simplify padding calculations.
>>
>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
>>   drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
>>   drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
>>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
>>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 14 +++++++++-----
>>   5 files changed, 17 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> index c45304f1047c..1ea9e18d6f08 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
>>   	u32 extra_bits = vmid & 0xf;
>>   
>>   	/* IB packet must end on a 8 DW boundary */
>> -	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
>> +	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
>>   
>>   	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
>>   	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
>> @@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>   	u32 pad_count;
>>   	int i;
>>   
>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>> +	pad_count = (-ib->length_dw) & 7;
>>   	for (i = 0; i < pad_count; i++)
>>   		if (sdma && sdma->burst_nop && (i == 0))
>>   			ib->ptr[ib->length_dw++] =
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> index a10175838013..d340f179401a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
>>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>   
>>   	/* IB packet must end on a 8 DW boundary */
>> -	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>> +	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>   
>>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>> @@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>   	u32 pad_count;
>>   	int i;
>>   
>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>> +	pad_count = (-ib->length_dw) & 7;
>>   	for (i = 0; i < pad_count; i++)
>>   		if (sdma && sdma->burst_nop && (i == 0))
>>   			ib->ptr[ib->length_dw++] =
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> index 5f4e2c616241..5c3c310188b6 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
>>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>   
>>   	/* IB packet must end on a 8 DW boundary */
>> -	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>> +	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>   
>>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>> @@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>   	u32 pad_count;
>>   	int i;
>>   
>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>> +	pad_count = (-ib->length_dw) & 7;
>>   	for (i = 0; i < pad_count; i++)
>>   		if (sdma && sdma->burst_nop && (i == 0))
>>   			ib->ptr[ib->length_dw++] =
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> index 45bd538ba97e..7c71c88e38a4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
>>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>   
>>   	/* IB packet must end on a 8 DW boundary */
>> -	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>> +	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>   
>>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>> @@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>   	u32 pad_count;
>>   	int i;
>>   
>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>> +	pad_count = (-ib->length_dw) & 7;
>>   	for (i = 0; i < pad_count; i++)
>>   		if (sdma && sdma->burst_nop && (i == 0))
>>   			ib->ptr[ib->length_dw++] =
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> index 0c41b4fdc58b..67ede9e4df01 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> @@ -382,8 +382,12 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
>>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>   	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
>>   
>> -	/* IB packet must end on a 8 DW boundary */
>> -	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>> +	/* An IB packet must end on a 8 DW boundary--the next dword must
>> +	 * be on a 8-dword boundary. Our IB below is 6 dwords long,
> 
> That should probably read "our IB command" or "our IB packet".

Done.

> 
>> +	 * thus add x number of NOPs, such that
>> +	 * wptr + 6 + x = 8k, k >= 0.
> 
> Maybe write (wrptr + 6 + x) % 8 = 0 to make that more obvious.

That's less obvious. In my mind I always translate it to
a congruence expression. It's what I used to derive the algebraic
simplifications of this patch.

I'll add the highly cryptic "(wrptr + 6 + x) % 8 = 0" to make you happy,
below the algebraic expression already there.

> 
>> +	 */
>> +	sdma_v5_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
> 
> Spaces between operators please.

Yeah, I wanted to put an emphasis on the AND-op, but I'll add them.

Regards,
Luben


> 
> Apart form that looks good to me,
> Christian.
> 
>>   
>>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>> @@ -1086,10 +1090,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
>>   }
>>   
>>   /**
>> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
>> - *
>> + * sdma_v5_0_ring_pad_ib - pad the IB
>>    * @ib: indirect buffer to fill with padding
>>    *
>> + * Pad the IB with NOPs to a boundary multiple of 8.
>>    */
>>   static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>   {
>> @@ -1097,7 +1101,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>   	u32 pad_count;
>>   	int i;
>>   
>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>> +	pad_count = (-ib->length_dw) & 0x7;
>>   	for (i = 0; i < pad_count; i++)
>>   		if (sdma && sdma->burst_nop && (i == 0))
>>   			ib->ptr[ib->length_dw++] =
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH] drm/amdgpu: simplify padding calculations
@ 2019-10-25 22:30             ` Tuikov, Luben
  0 siblings, 0 replies; 16+ messages in thread
From: Tuikov, Luben @ 2019-10-25 22:30 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Tuikov, Luben, Koenig, Christian

Simplify padding calculations.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 17 ++++++++++++-----
 5 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
index c45304f1047c..1ea9e18d6f08 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
@@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
 	u32 extra_bits = vmid & 0xf;
 
 	/* IB packet must end on a 8 DW boundary */
-	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
 	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
@@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
index a10175838013..d340f179401a 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
@@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 5f4e2c616241..5c3c310188b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 45bd538ba97e..7c71c88e38a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 0c41b4fdc58b..d117bde3f29a 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -382,8 +382,15 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
 
-	/* IB packet must end on a 8 DW boundary */
-	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	/* An IB packet must end on a 8 DW boundary--the next dword
+	 * must be on a 8-dword boundary. Our IB packet below is 6
+	 * dwords long, thus add x number of NOPs, such that, in
+	 * modular arithmetic,
+	 * wptr + 6 + x = 8k, k >= 0, which in C is,
+	 * (wptr + 6 + x) % 8 = 0.
+	 * The expression below, is a solution of x.
+	 */
+	sdma_v5_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1086,10 +1093,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
 }
 
 /**
- * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
- *
+ * sdma_v5_0_ring_pad_ib - pad the IB
  * @ib: indirect buffer to fill with padding
  *
+ * Pad the IB with NOPs to a boundary multiple of 8.
  */
 static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 {
@@ -1097,7 +1104,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 0x7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
-- 
2.23.0.385.gbc12974a89

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH] drm/amdgpu: simplify padding calculations
@ 2019-10-25 22:30             ` Tuikov, Luben
  0 siblings, 0 replies; 16+ messages in thread
From: Tuikov, Luben @ 2019-10-25 22:30 UTC (permalink / raw)
  To: amd-gfx; +Cc: Deucher, Alexander, Tuikov, Luben, Koenig, Christian

Simplify padding calculations.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 17 ++++++++++++-----
 5 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
index c45304f1047c..1ea9e18d6f08 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
@@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
 	u32 extra_bits = vmid & 0xf;
 
 	/* IB packet must end on a 8 DW boundary */
-	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
 	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
@@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
index a10175838013..d340f179401a 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
@@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 5f4e2c616241..5c3c310188b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 45bd538ba97e..7c71c88e38a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 0c41b4fdc58b..d117bde3f29a 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -382,8 +382,15 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
 
-	/* IB packet must end on a 8 DW boundary */
-	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	/* An IB packet must end on a 8 DW boundary--the next dword
+	 * must be on a 8-dword boundary. Our IB packet below is 6
+	 * dwords long, thus add x number of NOPs, such that, in
+	 * modular arithmetic,
+	 * wptr + 6 + x = 8k, k >= 0, which in C is,
+	 * (wptr + 6 + x) % 8 = 0.
+	 * The expression below, is a solution of x.
+	 */
+	sdma_v5_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1086,10 +1093,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
 }
 
 /**
- * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
- *
+ * sdma_v5_0_ring_pad_ib - pad the IB
  * @ib: indirect buffer to fill with padding
  *
+ * Pad the IB with NOPs to a boundary multiple of 8.
  */
 static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 {
@@ -1097,7 +1104,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 0x7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
-- 
2.23.0.385.gbc12974a89

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH] drm/amdgpu: simplify padding calculations (v2)
@ 2019-10-25 22:41         ` Tuikov, Luben
  0 siblings, 0 replies; 16+ messages in thread
From: Tuikov, Luben @ 2019-10-25 22:41 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Tuikov, Luben, Koenig, Christian

Simplify padding calculations.

v2: Comment update and spacing.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 17 ++++++++++++-----
 5 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
index c45304f1047c..4af9acc2dc4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
@@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
 	u32 extra_bits = vmid & 0xf;
 
 	/* IB packet must end on a 8 DW boundary */
-	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	cik_sdma_ring_insert_nop(ring, (4 - lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
 	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
@@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
index a10175838013..b6af67f6f214 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
@@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v2_4_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 5f4e2c616241..cd3ebed46d05 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v3_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 45bd538ba97e..8ce15056ee4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v4_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 0c41b4fdc58b..d117bde3f29a 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -382,8 +382,15 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
 
-	/* IB packet must end on a 8 DW boundary */
-	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	/* An IB packet must end on a 8 DW boundary--the next dword
+	 * must be on a 8-dword boundary. Our IB packet below is 6
+	 * dwords long, thus add x number of NOPs, such that, in
+	 * modular arithmetic,
+	 * wptr + 6 + x = 8k, k >= 0, which in C is,
+	 * (wptr + 6 + x) % 8 = 0.
+	 * The expression below, is a solution of x.
+	 */
+	sdma_v5_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1086,10 +1093,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
 }
 
 /**
- * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
- *
+ * sdma_v5_0_ring_pad_ib - pad the IB
  * @ib: indirect buffer to fill with padding
  *
+ * Pad the IB with NOPs to a boundary multiple of 8.
  */
 static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 {
@@ -1097,7 +1104,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 0x7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
-- 
2.23.0.385.gbc12974a89

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH] drm/amdgpu: simplify padding calculations (v2)
@ 2019-10-25 22:41         ` Tuikov, Luben
  0 siblings, 0 replies; 16+ messages in thread
From: Tuikov, Luben @ 2019-10-25 22:41 UTC (permalink / raw)
  To: amd-gfx; +Cc: Deucher, Alexander, Tuikov, Luben, Koenig, Christian

Simplify padding calculations.

v2: Comment update and spacing.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 17 ++++++++++++-----
 5 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
index c45304f1047c..4af9acc2dc4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
@@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
 	u32 extra_bits = vmid & 0xf;
 
 	/* IB packet must end on a 8 DW boundary */
-	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	cik_sdma_ring_insert_nop(ring, (4 - lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
 	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
@@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
index a10175838013..b6af67f6f214 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
@@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v2_4_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 5f4e2c616241..cd3ebed46d05 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v3_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 45bd538ba97e..8ce15056ee4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 
 	/* IB packet must end on a 8 DW boundary */
-	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v4_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 0c41b4fdc58b..d117bde3f29a 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -382,8 +382,15 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
 
-	/* IB packet must end on a 8 DW boundary */
-	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	/* An IB packet must end on a 8 DW boundary--the next dword
+	 * must be on a 8-dword boundary. Our IB packet below is 6
+	 * dwords long, thus add x number of NOPs, such that, in
+	 * modular arithmetic,
+	 * wptr + 6 + x = 8k, k >= 0, which in C is,
+	 * (wptr + 6 + x) % 8 = 0.
+	 * The expression below, is a solution of x.
+	 */
+	sdma_v5_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
 
 	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
 			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
@@ -1086,10 +1093,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
 }
 
 /**
- * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
- *
+ * sdma_v5_0_ring_pad_ib - pad the IB
  * @ib: indirect buffer to fill with padding
  *
+ * Pad the IB with NOPs to a boundary multiple of 8.
  */
 static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 {
@@ -1097,7 +1104,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
 	u32 pad_count;
 	int i;
 
-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 0x7;
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
 			ib->ptr[ib->length_dw++] =
-- 
2.23.0.385.gbc12974a89

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: simplify padding calculations
@ 2019-10-26 12:05             ` Koenig, Christian
  0 siblings, 0 replies; 16+ messages in thread
From: Koenig, Christian @ 2019-10-26 12:05 UTC (permalink / raw)
  To: Tuikov, Luben, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander

Am 25.10.19 um 23:51 schrieb Tuikov, Luben:
> On 2019-10-25 2:54 a.m., Koenig, Christian wrote:
>> Am 25.10.19 um 01:44 schrieb Tuikov, Luben:
>>> Simplify padding calculations.
>>>
>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 14 +++++++++-----
>>>    5 files changed, 17 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>> index c45304f1047c..1ea9e18d6f08 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	u32 extra_bits = vmid & 0xf;
>>>    
>>>    	/* IB packet must end on a 8 DW boundary */
>>> -	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
>>>    	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
>>> @@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>> index a10175838013..d340f179401a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>    
>>>    	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>> @@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>> index 5f4e2c616241..5c3c310188b6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>    
>>>    	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>> @@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> index 45bd538ba97e..7c71c88e38a4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>    
>>>    	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>> @@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> index 0c41b4fdc58b..67ede9e4df01 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> @@ -382,8 +382,12 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>    	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
>>>    
>>> -	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	/* An IB packet must end on a 8 DW boundary--the next dword must
>>> +	 * be on a 8-dword boundary. Our IB below is 6 dwords long,
>> That should probably read "our IB command" or "our IB packet".
> Done.
>
>>> +	 * thus add x number of NOPs, such that
>>> +	 * wptr + 6 + x = 8k, k >= 0.
>> Maybe write (wrptr + 6 + x) % 8 = 0 to make that more obvious.
> That's less obvious. In my mind I always translate it to
> a congruence expression. It's what I used to derive the algebraic
> simplifications of this patch.
>
> I'll add the highly cryptic "(wrptr + 6 + x) % 8 = 0" to make you happy,
> below the algebraic expression already there.

The problem I see here is that people who are used to GPUs will think 
that k is a float and not immediately get what's meant here.

>>> +	 */
>>> +	sdma_v5_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>> Spaces between operators please.
> Yeah, I wanted to put an emphasis on the AND-op, but I'll add them.

What would be really nice to have is to keep the 10 as number of DW in 
the IB packet around.

Maybe add an "static inline amdgpu_ring_unalign(ring, 10, 8)" helper or 
something like that, the insert_nop is a ring callback anyway IIRC.

Regards,
Christian.

>
> Regards,
> Luben
>
>
>> Apart form that looks good to me,
>> Christian.
>>
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>> @@ -1086,10 +1090,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
>>>    }
>>>    
>>>    /**
>>> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
>>> - *
>>> + * sdma_v5_0_ring_pad_ib - pad the IB
>>>     * @ib: indirect buffer to fill with padding
>>>     *
>>> + * Pad the IB with NOPs to a boundary multiple of 8.
>>>     */
>>>    static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>>    {
>>> @@ -1097,7 +1101,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 0x7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: simplify padding calculations
@ 2019-10-26 12:05             ` Koenig, Christian
  0 siblings, 0 replies; 16+ messages in thread
From: Koenig, Christian @ 2019-10-26 12:05 UTC (permalink / raw)
  To: Tuikov, Luben, amd-gfx; +Cc: Deucher, Alexander

Am 25.10.19 um 23:51 schrieb Tuikov, Luben:
> On 2019-10-25 2:54 a.m., Koenig, Christian wrote:
>> Am 25.10.19 um 01:44 schrieb Tuikov, Luben:
>>> Simplify padding calculations.
>>>
>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 14 +++++++++-----
>>>    5 files changed, 17 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>> index c45304f1047c..1ea9e18d6f08 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	u32 extra_bits = vmid & 0xf;
>>>    
>>>    	/* IB packet must end on a 8 DW boundary */
>>> -	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
>>>    	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
>>> @@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>> index a10175838013..d340f179401a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>    
>>>    	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>> @@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>> index 5f4e2c616241..5c3c310188b6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>    
>>>    	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>> @@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> index 45bd538ba97e..7c71c88e38a4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>    
>>>    	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>> @@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> index 0c41b4fdc58b..67ede9e4df01 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> @@ -382,8 +382,12 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>    	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
>>>    
>>> -	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	/* An IB packet must end on a 8 DW boundary--the next dword must
>>> +	 * be on a 8-dword boundary. Our IB below is 6 dwords long,
>> That should probably read "our IB command" or "our IB packet".
> Done.
>
>>> +	 * thus add x number of NOPs, such that
>>> +	 * wptr + 6 + x = 8k, k >= 0.
>> Maybe write (wrptr + 6 + x) % 8 = 0 to make that more obvious.
> That's less obvious. In my mind I always translate it to
> a congruence expression. It's what I used to derive the algebraic
> simplifications of this patch.
>
> I'll add the highly cryptic "(wrptr + 6 + x) % 8 = 0" to make you happy,
> below the algebraic expression already there.

The problem I see here is that people who are used to GPUs will think 
that k is a float and not immediately get what's meant here.

>>> +	 */
>>> +	sdma_v5_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>> Spaces between operators please.
> Yeah, I wanted to put an emphasis on the AND-op, but I'll add them.

What would be really nice to have is to keep the 10 as number of DW in 
the IB packet around.

Maybe add an "static inline amdgpu_ring_unalign(ring, 10, 8)" helper or 
something like that, the insert_nop is a ring callback anyway IIRC.

Regards,
Christian.

>
> Regards,
> Luben
>
>
>> Apart form that looks good to me,
>> Christian.
>>
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>> @@ -1086,10 +1090,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
>>>    }
>>>    
>>>    /**
>>> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
>>> - *
>>> + * sdma_v5_0_ring_pad_ib - pad the IB
>>>     * @ib: indirect buffer to fill with padding
>>>     *
>>> + * Pad the IB with NOPs to a boundary multiple of 8.
>>>     */
>>>    static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>>    {
>>> @@ -1097,7 +1101,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 0x7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: simplify padding calculations
@ 2019-10-27 21:46                 ` Tuikov, Luben
  0 siblings, 0 replies; 16+ messages in thread
From: Tuikov, Luben @ 2019-10-27 21:46 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander

On 2019-10-26 08:05, Koenig, Christian wrote:
> Am 25.10.19 um 23:51 schrieb Tuikov, Luben:
>> On 2019-10-25 2:54 a.m., Koenig, Christian wrote:
>>> Am 25.10.19 um 01:44 schrieb Tuikov, Luben:
>>>> Simplify padding calculations.
>>>>
>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 14 +++++++++-----
>>>>    5 files changed, 17 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>>> index c45304f1047c..1ea9e18d6f08 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>>> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	u32 extra_bits = vmid & 0xf;
>>>>    
>>>>    	/* IB packet must end on a 8 DW boundary */
>>>> -	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
>>>>    	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
>>>> @@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>>> index a10175838013..d340f179401a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>>> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>>    
>>>>    	/* IB packet must end on a 8 DW boundary */
>>>> -	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>>> @@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>>> index 5f4e2c616241..5c3c310188b6 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>>> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>>    
>>>>    	/* IB packet must end on a 8 DW boundary */
>>>> -	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>>> @@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>>> index 45bd538ba97e..7c71c88e38a4 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>>> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>>    
>>>>    	/* IB packet must end on a 8 DW boundary */
>>>> -	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>>> @@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> index 0c41b4fdc58b..67ede9e4df01 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> @@ -382,8 +382,12 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>>    	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
>>>>    
>>>> -	/* IB packet must end on a 8 DW boundary */
>>>> -	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	/* An IB packet must end on a 8 DW boundary--the next dword must
>>>> +	 * be on a 8-dword boundary. Our IB below is 6 dwords long,
>>> That should probably read "our IB command" or "our IB packet".
>> Done.
>>
>>>> +	 * thus add x number of NOPs, such that
>>>> +	 * wptr + 6 + x = 8k, k >= 0.
>>> Maybe write (wrptr + 6 + x) % 8 = 0 to make that more obvious.
>> That's less obvious. In my mind I always translate it to
>> a congruence expression. It's what I used to derive the algebraic
>> simplifications of this patch.
>>
>> I'll add the highly cryptic "(wrptr + 6 + x) % 8 = 0" to make you happy,
>> below the algebraic expression already there.
> 
> The problem I see here is that people who are used to GPUs will think 
> that k is a float and not immediately get what's meant here.
> 
>>>> +	 */
>>>> +	sdma_v5_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>> Spaces between operators please.
>> Yeah, I wanted to put an emphasis on the AND-op, but I'll add them.
> 
> What would be really nice to have is to keep the 10 as number of DW in 
> the IB packet around.

Why? Why 10?

As far as I can see, THE IB packet is 6 dwords, we want to pad with some
number of dwords, in order to make sure that the wptr + 6, ends
up on a boundary multiple of 8. I.e. we want to solve exactly this,
for x:

wptr + 6 + x = 8k, k >= 0.

x = -(wtpr + 6) = -6 - wptr = 2 - wptr (mod 8)

Also note that, 10 = 2 = -6 (mod 8).

I believe the patch is good as is.

Did you see v2 of the patch?

Regards,
Luben


> 
> Maybe add an "static inline amdgpu_ring_unalign(ring, 10, 8)" helper or 
> something like that, the insert_nop is a ring callback anyway IIRC.
> 
> Regards,
> Christian.
> 
>>
>> Regards,
>> Luben
>>
>>
>>> Apart form that looks good to me,
>>> Christian.
>>>
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>>> @@ -1086,10 +1090,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
>>>>    }
>>>>    
>>>>    /**
>>>> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
>>>> - *
>>>> + * sdma_v5_0_ring_pad_ib - pad the IB
>>>>     * @ib: indirect buffer to fill with padding
>>>>     *
>>>> + * Pad the IB with NOPs to a boundary multiple of 8.
>>>>     */
>>>>    static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>>>    {
>>>> @@ -1097,7 +1101,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 0x7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: simplify padding calculations
@ 2019-10-27 21:46                 ` Tuikov, Luben
  0 siblings, 0 replies; 16+ messages in thread
From: Tuikov, Luben @ 2019-10-27 21:46 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx; +Cc: Deucher, Alexander

On 2019-10-26 08:05, Koenig, Christian wrote:
> Am 25.10.19 um 23:51 schrieb Tuikov, Luben:
>> On 2019-10-25 2:54 a.m., Koenig, Christian wrote:
>>> Am 25.10.19 um 01:44 schrieb Tuikov, Luben:
>>>> Simplify padding calculations.
>>>>
>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 14 +++++++++-----
>>>>    5 files changed, 17 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>>> index c45304f1047c..1ea9e18d6f08 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>>> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	u32 extra_bits = vmid & 0xf;
>>>>    
>>>>    	/* IB packet must end on a 8 DW boundary */
>>>> -	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
>>>>    	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
>>>> @@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>>> index a10175838013..d340f179401a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>>> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>>    
>>>>    	/* IB packet must end on a 8 DW boundary */
>>>> -	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>>> @@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>>> index 5f4e2c616241..5c3c310188b6 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>>> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>>    
>>>>    	/* IB packet must end on a 8 DW boundary */
>>>> -	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>>> @@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>>> index 45bd538ba97e..7c71c88e38a4 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>>> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>>    
>>>>    	/* IB packet must end on a 8 DW boundary */
>>>> -	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>>> @@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> index 0c41b4fdc58b..67ede9e4df01 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> @@ -382,8 +382,12 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>>    	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
>>>>    
>>>> -	/* IB packet must end on a 8 DW boundary */
>>>> -	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	/* An IB packet must end on a 8 DW boundary--the next dword must
>>>> +	 * be on a 8-dword boundary. Our IB below is 6 dwords long,
>>> That should probably read "our IB command" or "our IB packet".
>> Done.
>>
>>>> +	 * thus add x number of NOPs, such that
>>>> +	 * wptr + 6 + x = 8k, k >= 0.
>>> Maybe write (wrptr + 6 + x) % 8 = 0 to make that more obvious.
>> That's less obvious. In my mind I always translate it to
>> a congruence expression. It's what I used to derive the algebraic
>> simplifications of this patch.
>>
>> I'll add the highly cryptic "(wrptr + 6 + x) % 8 = 0" to make you happy,
>> below the algebraic expression already there.
> 
> The problem I see here is that people who are used to GPUs will think 
> that k is a float and not immediately get what's meant here.
> 
>>>> +	 */
>>>> +	sdma_v5_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>> Spaces between operators please.
>> Yeah, I wanted to put an emphasis on the AND-op, but I'll add them.
> 
> What would be really nice to have is to keep the 10 as number of DW in 
> the IB packet around.

Why? Why 10?

As far as I can see, THE IB packet is 6 dwords, we want to pad with some
number of dwords, in order to make sure that the wptr + 6, ends
up on a boundary multiple of 8. I.e. we want to solve exactly this,
for x:

wptr + 6 + x = 8k, k >= 0.

x = -(wtpr + 6) = -6 - wptr = 2 - wptr (mod 8)

Also note that, 10 = 2 = -6 (mod 8).

I believe the patch is good as is.

Did you see v2 of the patch?

Regards,
Luben


> 
> Maybe add an "static inline amdgpu_ring_unalign(ring, 10, 8)" helper or 
> something like that, the insert_nop is a ring callback anyway IIRC.
> 
> Regards,
> Christian.
> 
>>
>> Regards,
>> Luben
>>
>>
>>> Apart form that looks good to me,
>>> Christian.
>>>
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>>> @@ -1086,10 +1090,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
>>>>    }
>>>>    
>>>>    /**
>>>> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
>>>> - *
>>>> + * sdma_v5_0_ring_pad_ib - pad the IB
>>>>     * @ib: indirect buffer to fill with padding
>>>>     *
>>>> + * Pad the IB with NOPs to a boundary multiple of 8.
>>>>     */
>>>>    static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>>>    {
>>>> @@ -1097,7 +1101,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 0x7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: simplify padding calculations (v2)
@ 2019-10-28 11:01             ` Koenig, Christian
  0 siblings, 0 replies; 16+ messages in thread
From: Koenig, Christian @ 2019-10-28 11:01 UTC (permalink / raw)
  To: Tuikov, Luben, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander

Am 26.10.19 um 00:41 schrieb Tuikov, Luben:
> Simplify padding calculations.
>
> v2: Comment update and spacing.
>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 17 ++++++++++++-----
>   5 files changed, 20 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> index c45304f1047c..4af9acc2dc4f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
>   	u32 extra_bits = vmid & 0xf;
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	cik_sdma_ring_insert_nop(ring, (4 - lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
>   	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
> @@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> index a10175838013..b6af67f6f214 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	sdma_v2_4_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> index 5f4e2c616241..cd3ebed46d05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	sdma_v3_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index 45bd538ba97e..8ce15056ee4f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	sdma_v4_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index 0c41b4fdc58b..d117bde3f29a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -382,8 +382,15 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
>   
> -	/* IB packet must end on a 8 DW boundary */
> -	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	/* An IB packet must end on a 8 DW boundary--the next dword
> +	 * must be on a 8-dword boundary. Our IB packet below is 6
> +	 * dwords long, thus add x number of NOPs, such that, in
> +	 * modular arithmetic,
> +	 * wptr + 6 + x = 8k, k >= 0, which in C is,
> +	 * (wptr + 6 + x) % 8 = 0.
> +	 * The expression below, is a solution of x.
> +	 */
> +	sdma_v5_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -1086,10 +1093,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
>   }
>   
>   /**
> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
> - *
> + * sdma_v5_0_ring_pad_ib - pad the IB
>    * @ib: indirect buffer to fill with padding
>    *
> + * Pad the IB with NOPs to a boundary multiple of 8.
>    */
>   static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>   {
> @@ -1097,7 +1104,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 0x7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] drm/amdgpu: simplify padding calculations (v2)
@ 2019-10-28 11:01             ` Koenig, Christian
  0 siblings, 0 replies; 16+ messages in thread
From: Koenig, Christian @ 2019-10-28 11:01 UTC (permalink / raw)
  To: Tuikov, Luben, amd-gfx; +Cc: Deucher, Alexander

Am 26.10.19 um 00:41 schrieb Tuikov, Luben:
> Simplify padding calculations.
>
> v2: Comment update and spacing.
>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 17 ++++++++++++-----
>   5 files changed, 20 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> index c45304f1047c..4af9acc2dc4f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
>   	u32 extra_bits = vmid & 0xf;
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	cik_sdma_ring_insert_nop(ring, (4 - lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
>   	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
> @@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> index a10175838013..b6af67f6f214 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	sdma_v2_4_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> index 5f4e2c616241..cd3ebed46d05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	sdma_v3_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index 45bd538ba97e..8ce15056ee4f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   
>   	/* IB packet must end on a 8 DW boundary */
> -	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	sdma_v4_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index 0c41b4fdc58b..d117bde3f29a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -382,8 +382,15 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
>   
> -	/* IB packet must end on a 8 DW boundary */
> -	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
> +	/* An IB packet must end on a 8 DW boundary--the next dword
> +	 * must be on a 8-dword boundary. Our IB packet below is 6
> +	 * dwords long, thus add x number of NOPs, such that, in
> +	 * modular arithmetic,
> +	 * wptr + 6 + x = 8k, k >= 0, which in C is,
> +	 * (wptr + 6 + x) % 8 = 0.
> +	 * The expression below, is a solution of x.
> +	 */
> +	sdma_v5_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);
>   
>   	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>   			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
> @@ -1086,10 +1093,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
>   }
>   
>   /**
> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
> - *
> + * sdma_v5_0_ring_pad_ib - pad the IB
>    * @ib: indirect buffer to fill with padding
>    *
> + * Pad the IB with NOPs to a boundary multiple of 8.
>    */
>   static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>   {
> @@ -1097,7 +1104,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>   	u32 pad_count;
>   	int i;
>   
> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
> +	pad_count = (-ib->length_dw) & 0x7;
>   	for (i = 0; i < pad_count; i++)
>   		if (sdma && sdma->burst_nop && (i == 0))
>   			ib->ptr[ib->length_dw++] =

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2019-10-28 11:01 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-24 23:44 [PATCH] drm/amdgpu: simplify padding calculations Tuikov, Luben
2019-10-24 23:44 ` Tuikov, Luben
     [not found] ` <20191024234444.4375-1-luben.tuikov-5C7GfCeVMHo@public.gmane.org>
2019-10-25  6:54   ` Koenig, Christian
2019-10-25  6:54     ` Koenig, Christian
     [not found]     ` <45e07677-717a-2c01-369e-1dbd19ce8e74-5C7GfCeVMHo@public.gmane.org>
2019-10-25 21:51       ` Tuikov, Luben
2019-10-25 21:51         ` Tuikov, Luben
     [not found]         ` <ed713bfe-2032-b56a-3e29-f61f48aeeb3b-5C7GfCeVMHo@public.gmane.org>
2019-10-25 22:30           ` Tuikov, Luben
2019-10-25 22:30             ` Tuikov, Luben
2019-10-26 12:05           ` Koenig, Christian
2019-10-26 12:05             ` Koenig, Christian
     [not found]             ` <db4d6ece-2cec-a465-0639-d75142621dda-5C7GfCeVMHo@public.gmane.org>
2019-10-27 21:46               ` Tuikov, Luben
2019-10-27 21:46                 ` Tuikov, Luben
2019-10-25 22:41       ` [PATCH] drm/amdgpu: simplify padding calculations (v2) Tuikov, Luben
2019-10-25 22:41         ` Tuikov, Luben
     [not found]         ` <20191025224116.12470-1-luben.tuikov-5C7GfCeVMHo@public.gmane.org>
2019-10-28 11:01           ` Koenig, Christian
2019-10-28 11:01             ` Koenig, Christian

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.