amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/amdgpu/mes11: print MES opcodes rather than numbers
@ 2024-04-17 11:30 Horace Chen
  2024-04-17 11:30 ` [PATCH 2/3] Revert "drm/amdgpu: increase mes submission timeout" Horace Chen
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Horace Chen @ 2024-04-17 11:30 UTC (permalink / raw)
  To: amd-gfx
  Cc: Andrey Grodzovsky, Felix Kuehling, horace.chen,
	Christian König, Deucher Alexander, Jack Xiao,
	Hawking Zhang, Monk Liu, Feifei Xu, Haijun Chang, Leo Liu,
	Jenny Liu, Alex Deucher

From: Alex Deucher <alexander.deucher@amd.com>

Makes it easier to review the logs when there are MES
errors.

v2: use dbg for emitted, add helpers for fetching strings

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 78 ++++++++++++++++++++++++--
 1 file changed, 74 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 81833395324a..784343fb7470 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -100,18 +100,72 @@ static const struct amdgpu_ring_funcs mes_v11_0_ring_funcs = {
 	.insert_nop = amdgpu_ring_insert_nop,
 };
 
+static const char *mes_v11_0_opcodes[] = {
+	"MES_SCH_API_SET_HW_RSRC",
+	"MES_SCH_API_SET_SCHEDULING_CONFIG",
+	"MES_SCH_API_ADD_QUEUE"
+	"MES_SCH_API_REMOVE_QUEUE"
+	"MES_SCH_API_PERFORM_YIELD"
+	"MES_SCH_API_SET_GANG_PRIORITY_LEVEL"
+	"MES_SCH_API_SUSPEND"
+	"MES_SCH_API_RESUME"
+	"MES_SCH_API_RESET"
+	"MES_SCH_API_SET_LOG_BUFFER"
+	"MES_SCH_API_CHANGE_GANG_PRORITY"
+	"MES_SCH_API_QUERY_SCHEDULER_STATUS"
+	"MES_SCH_API_PROGRAM_GDS"
+	"MES_SCH_API_SET_DEBUG_VMID"
+	"MES_SCH_API_MISC"
+	"MES_SCH_API_UPDATE_ROOT_PAGE_TABLE"
+	"MES_SCH_API_AMD_LOG"
+};
+
+static const char *mes_v11_0_misc_opcodes[] = {
+	"MESAPI_MISC__WRITE_REG",
+	"MESAPI_MISC__INV_GART",
+	"MESAPI_MISC__QUERY_STATUS",
+	"MESAPI_MISC__READ_REG",
+	"MESAPI_MISC__WAIT_REG_MEM",
+	"MESAPI_MISC__SET_SHADER_DEBUGGER",
+};
+
+static const char *mes_v11_0_get_op_string(union MESAPI__MISC *x_pkt)
+{
+	const char *op_str = NULL;
+
+	if (x_pkt->header.opcode < ARRAY_SIZE(mes_v11_0_opcodes))
+		op_str = mes_v11_0_opcodes[x_pkt->header.opcode];
+
+	return op_str;
+}
+
+static const char *mes_v11_0_get_misc_op_string(union MESAPI__MISC *x_pkt)
+{
+	const char *op_str = NULL;
+
+	if ((x_pkt->header.opcode == MES_SCH_API_MISC) &&
+	    (x_pkt->opcode <= ARRAY_SIZE(mes_v11_0_misc_opcodes)))
+		op_str = mes_v11_0_misc_opcodes[x_pkt->opcode];
+
+	return op_str;
+}
+
 static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 						    void *pkt, int size,
 						    int api_status_off)
 {
 	int ndw = size / 4;
 	signed long r;
-	union MESAPI__ADD_QUEUE *x_pkt = pkt;
+	union MESAPI__MISC *x_pkt = pkt;
 	struct MES_API_STATUS *api_status;
 	struct amdgpu_device *adev = mes->adev;
 	struct amdgpu_ring *ring = &mes->ring;
 	unsigned long flags;
 	signed long timeout = 3000000; /* 3000 ms */
+	const char *op_str, *misc_op_str;
+
+	if (x_pkt->header.opcode >= MES_SCH_API_MAX)
+		return -EINVAL;
 
 	if (amdgpu_emu_mode) {
 		timeout *= 100;
@@ -135,13 +189,29 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	amdgpu_ring_commit(ring);
 	spin_unlock_irqrestore(&mes->ring_lock, flags);
 
-	DRM_DEBUG("MES msg=%d was emitted\n", x_pkt->header.opcode);
+	op_str = mes_v11_0_get_op_string(x_pkt);
+	misc_op_str = mes_v11_0_get_misc_op_string(x_pkt);
+
+	if (misc_op_str)
+		dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, misc_op_str);
+	else if (op_str)
+		dev_dbg(adev->dev, "MES msg=%s was emitted\n", op_str);
+	else
+		dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode);
 
 	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
 		      timeout);
 	if (r < 1) {
-		DRM_ERROR("MES failed to response msg=%d\n",
-			  x_pkt->header.opcode);
+
+		if (misc_op_str)
+			dev_err(adev->dev, "MES failed to respond to msg=%s (%s)\n",
+				op_str, misc_op_str);
+		else if (op_str)
+			dev_err(adev->dev, "MES failed to respond to msg=%s\n",
+				op_str);
+		else
+			dev_err(adev->dev, "MES failed to respond to msg=%d\n",
+				x_pkt->header.opcode);
 
 		while (halt_if_hws_hang)
 			schedule();
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/3] Revert "drm/amdgpu: increase mes submission timeout"
  2024-04-17 11:30 [PATCH 1/3] drm/amdgpu/mes11: print MES opcodes rather than numbers Horace Chen
@ 2024-04-17 11:30 ` Horace Chen
  2024-04-17 11:30 ` [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous Horace Chen
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 10+ messages in thread
From: Horace Chen @ 2024-04-17 11:30 UTC (permalink / raw)
  To: amd-gfx
  Cc: Andrey Grodzovsky, Felix Kuehling, horace.chen,
	Christian König, Deucher Alexander, Jack Xiao,
	Hawking Zhang, Monk Liu, Feifei Xu, Haijun Chang, Leo Liu,
	Jenny Liu, Alex Deucher

From: Alex Deucher <alexander.deucher@amd.com>

This reverts commit a518c746510e03d8a78db432a659770182546b9e.

Reduce the time we wait since we are waiting for the
fence with the spinlock held.

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 784343fb7470..e40d00afd4f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -161,7 +161,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	struct amdgpu_device *adev = mes->adev;
 	struct amdgpu_ring *ring = &mes->ring;
 	unsigned long flags;
-	signed long timeout = 3000000; /* 3000 ms */
+	signed long timeout = adev->usec_timeout;
 	const char *op_str, *misc_op_str;
 
 	if (x_pkt->header.opcode >= MES_SCH_API_MAX)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous
  2024-04-17 11:30 [PATCH 1/3] drm/amdgpu/mes11: print MES opcodes rather than numbers Horace Chen
  2024-04-17 11:30 ` [PATCH 2/3] Revert "drm/amdgpu: increase mes submission timeout" Horace Chen
@ 2024-04-17 11:30 ` Horace Chen
  2024-04-17 12:49   ` Christian König
  2024-04-17 14:16 ` [PATCH 1/3] drm/amdgpu/mes11: print MES opcodes rather than numbers Kasiviswanathan, Harish
  2024-04-17 18:53 ` Liu, Shaoyun
  3 siblings, 1 reply; 10+ messages in thread
From: Horace Chen @ 2024-04-17 11:30 UTC (permalink / raw)
  To: amd-gfx
  Cc: Andrey Grodzovsky, Felix Kuehling, horace.chen,
	Christian König, Deucher Alexander, Jack Xiao,
	Hawking Zhang, Monk Liu, Feifei Xu, Haijun Chang, Leo Liu,
	Jenny Liu

The MES firmware expects synchronous operation with the
driver.  For this to work asynchronously, each caller
would need to provide its own fence location and sequence
number.

For now, add a mutex lock to serialize the MES submission.
For the SR-IOV long-wait case, break the long wait into separate
parts to prevent this wait from impacting the reset sequence.

Signed-off-by: Horace Chen <horace.chen@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c |  3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  1 +
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  | 18 ++++++++++++++----
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 78e4f88f5134..8896be95b2c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -137,6 +137,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
 	spin_lock_init(&adev->mes.queue_id_lock);
 	spin_lock_init(&adev->mes.ring_lock);
 	mutex_init(&adev->mes.mutex_hidden);
+	mutex_init(&adev->mes.submission_lock);
 
 	adev->mes.total_max_queue = AMDGPU_FENCE_MES_QUEUE_ID_MASK;
 	adev->mes.vmid_mask_mmhub = 0xffffff00;
@@ -221,6 +222,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
 	idr_destroy(&adev->mes.queue_id_idr);
 	ida_destroy(&adev->mes.doorbell_ida);
 	mutex_destroy(&adev->mes.mutex_hidden);
+	mutex_destroy(&adev->mes.submission_lock);
 	return r;
 }
 
@@ -240,6 +242,7 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
 	idr_destroy(&adev->mes.queue_id_idr);
 	ida_destroy(&adev->mes.doorbell_ida);
 	mutex_destroy(&adev->mes.mutex_hidden);
+	mutex_destroy(&adev->mes.submission_lock);
 }
 
 static void amdgpu_mes_queue_free_mqd(struct amdgpu_mes_queue *q)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 6b3e1844eac5..90af935cc889 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -85,6 +85,7 @@ struct amdgpu_mes {
 
 	struct amdgpu_ring              ring;
 	spinlock_t                      ring_lock;
+	struct mutex                    submission_lock;
 
 	const struct firmware           *fw[AMDGPU_MAX_MES_PIPES];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index e40d00afd4f5..0a609a5b8835 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -162,6 +162,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	struct amdgpu_ring *ring = &mes->ring;
 	unsigned long flags;
 	signed long timeout = adev->usec_timeout;
+	signed long retry_count = 1;
 	const char *op_str, *misc_op_str;
 
 	if (x_pkt->header.opcode >= MES_SCH_API_MAX)
@@ -169,15 +170,19 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 
 	if (amdgpu_emu_mode) {
 		timeout *= 100;
-	} else if (amdgpu_sriov_vf(adev)) {
+	}
+
+	if (amdgpu_sriov_vf(adev) && timeout > 0) {
 		/* Worst case in sriov where all other 15 VF timeout, each VF needs about 600ms */
-		timeout = 15 * 600 * 1000;
+		retry_count = (15 * 600 * 1000) / timeout;
 	}
 	BUG_ON(size % 4 != 0);
 
+	mutex_lock(&mes->submission_lock);
 	spin_lock_irqsave(&mes->ring_lock, flags);
 	if (amdgpu_ring_alloc(ring, ndw)) {
 		spin_unlock_irqrestore(&mes->ring_lock, flags);
+		mutex_unlock(&mes->submission_lock);
 		return -ENOMEM;
 	}
 
@@ -199,8 +204,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	else
 		dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode);
 
-	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
-		      timeout);
+	do {
+		r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
+				timeout);
+		retry_count--;
+	} while (retry_count > 0 && !amdgpu_in_reset(adev));
+
+	mutex_unlock(&mes->submission_lock);
 	if (r < 1) {
 
 		if (misc_op_str)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous
  2024-04-17 11:30 ` [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous Horace Chen
@ 2024-04-17 12:49   ` Christian König
  2024-04-17 18:52     ` Liu, Shaoyun
  0 siblings, 1 reply; 10+ messages in thread
From: Christian König @ 2024-04-17 12:49 UTC (permalink / raw)
  To: Horace Chen, amd-gfx
  Cc: Andrey Grodzovsky, Felix Kuehling, Deucher Alexander, Jack Xiao,
	Hawking Zhang, Monk Liu, Feifei Xu, Haijun Chang, Leo Liu,
	Jenny Liu

Am 17.04.24 um 13:30 schrieb Horace Chen:
> The MES firmware expects synchronous operation with the
> driver.  For this to work asynchronously, each caller
> would need to provide its own fence location and sequence
> number.

Well that's certainly not correct. The seqno takes care that we can wait 
async for the submission to complete.

So clear NAK for that patch here.

Regards,
Christian.

>
> For now, add a mutex lock to serialize the MES submission.
> For SR-IOV long-wait case, break the long-wait to separated
> part to prevent this wait from impacting reset sequence.
>
> Signed-off-by: Horace Chen <horace.chen@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c |  3 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  | 18 ++++++++++++++----
>   3 files changed, 18 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 78e4f88f5134..8896be95b2c8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -137,6 +137,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>   	spin_lock_init(&adev->mes.queue_id_lock);
>   	spin_lock_init(&adev->mes.ring_lock);
>   	mutex_init(&adev->mes.mutex_hidden);
> +	mutex_init(&adev->mes.submission_lock);
>   
>   	adev->mes.total_max_queue = AMDGPU_FENCE_MES_QUEUE_ID_MASK;
>   	adev->mes.vmid_mask_mmhub = 0xffffff00;
> @@ -221,6 +222,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>   	idr_destroy(&adev->mes.queue_id_idr);
>   	ida_destroy(&adev->mes.doorbell_ida);
>   	mutex_destroy(&adev->mes.mutex_hidden);
> +	mutex_destroy(&adev->mes.submission_lock);
>   	return r;
>   }
>   
> @@ -240,6 +242,7 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
>   	idr_destroy(&adev->mes.queue_id_idr);
>   	ida_destroy(&adev->mes.doorbell_ida);
>   	mutex_destroy(&adev->mes.mutex_hidden);
> +	mutex_destroy(&adev->mes.submission_lock);
>   }
>   
>   static void amdgpu_mes_queue_free_mqd(struct amdgpu_mes_queue *q)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 6b3e1844eac5..90af935cc889 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -85,6 +85,7 @@ struct amdgpu_mes {
>   
>   	struct amdgpu_ring              ring;
>   	spinlock_t                      ring_lock;
> +	struct mutex                    submission_lock;
>   
>   	const struct firmware           *fw[AMDGPU_MAX_MES_PIPES];
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index e40d00afd4f5..0a609a5b8835 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -162,6 +162,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   	struct amdgpu_ring *ring = &mes->ring;
>   	unsigned long flags;
>   	signed long timeout = adev->usec_timeout;
> +	signed long retry_count = 1;
>   	const char *op_str, *misc_op_str;
>   
>   	if (x_pkt->header.opcode >= MES_SCH_API_MAX)
> @@ -169,15 +170,19 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   
>   	if (amdgpu_emu_mode) {
>   		timeout *= 100;
> -	} else if (amdgpu_sriov_vf(adev)) {
> +	}
> +
> +	if (amdgpu_sriov_vf(adev) && timeout > 0) {
>   		/* Worst case in sriov where all other 15 VF timeout, each VF needs about 600ms */
> -		timeout = 15 * 600 * 1000;
> +		retry_count = (15 * 600 * 1000) / timeout;
>   	}
>   	BUG_ON(size % 4 != 0);
>   
> +	mutex_lock(&mes->submission_lock);
>   	spin_lock_irqsave(&mes->ring_lock, flags);
>   	if (amdgpu_ring_alloc(ring, ndw)) {
>   		spin_unlock_irqrestore(&mes->ring_lock, flags);
> +		mutex_unlock(&mes->submission_lock);
>   		return -ENOMEM;
>   	}
>   
> @@ -199,8 +204,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   	else
>   		dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode);
>   
> -	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
> -		      timeout);
> +	do {
> +		r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
> +				timeout);
> +		retry_count--;
> +	} while (retry_count > 0 && !amdgpu_in_reset(adev));
> +
> +	mutex_unlock(&mes->submission_lock);
>   	if (r < 1) {
>   
>   		if (misc_op_str)


^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH 1/3] drm/amdgpu/mes11: print MES opcodes rather than numbers
  2024-04-17 11:30 [PATCH 1/3] drm/amdgpu/mes11: print MES opcodes rather than numbers Horace Chen
  2024-04-17 11:30 ` [PATCH 2/3] Revert "drm/amdgpu: increase mes submission timeout" Horace Chen
  2024-04-17 11:30 ` [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous Horace Chen
@ 2024-04-17 14:16 ` Kasiviswanathan, Harish
  2024-04-17 18:53 ` Liu, Shaoyun
  3 siblings, 0 replies; 10+ messages in thread
From: Kasiviswanathan, Harish @ 2024-04-17 14:16 UTC (permalink / raw)
  To: Chen, Horace, amd-gfx
  Cc: Andrey Grodzovsky, Kuehling, Felix, Chen, Horace, Koenig,
	Christian, Deucher, Alexander, Xiao, Jack, Zhang, Hawking, Liu,
	Monk, Xu, Feifei, Chang, HaiJun, Leo Liu, Liu, Jenny (Jing),
	Deucher, Alexander

[AMD Official Use Only - General]

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Horace Chen
Sent: Wednesday, April 17, 2024 7:30 AM
To: amd-gfx@lists.freedesktop.org
Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Chen, Horace <Horace.Chen@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Xu, Feifei <Feifei.Xu@amd.com>; Chang, HaiJun <HaiJun.Chang@amd.com>; Leo Liu <leo.liiu@amd.com>; Liu, Jenny (Jing) <Jenny-Jing.Liu@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: [PATCH 1/3] drm/amdgpu/mes11: print MES opcodes rather than numbers

From: Alex Deucher <alexander.deucher@amd.com>

Makes it easier to review the logs when there are MES
errors.

v2: use dbg for emitted, add helpers for fetching strings

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 78 ++++++++++++++++++++++++--
 1 file changed, 74 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 81833395324a..784343fb7470 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -100,18 +100,72 @@ static const struct amdgpu_ring_funcs mes_v11_0_ring_funcs = {
        .insert_nop = amdgpu_ring_insert_nop,
 };

+static const char *mes_v11_0_opcodes[] = {
+       "MES_SCH_API_SET_HW_RSRC",
+       "MES_SCH_API_SET_SCHEDULING_CONFIG",
+       "MES_SCH_API_ADD_QUEUE"

"commas are missing for bunch of defines"

+       "MES_SCH_API_REMOVE_QUEUE"
+       "MES_SCH_API_PERFORM_YIELD"
+       "MES_SCH_API_SET_GANG_PRIORITY_LEVEL"
+       "MES_SCH_API_SUSPEND"
+       "MES_SCH_API_RESUME"
+       "MES_SCH_API_RESET"
+       "MES_SCH_API_SET_LOG_BUFFER"
+       "MES_SCH_API_CHANGE_GANG_PRORITY"
+       "MES_SCH_API_QUERY_SCHEDULER_STATUS"
+       "MES_SCH_API_PROGRAM_GDS"
+       "MES_SCH_API_SET_DEBUG_VMID"
+       "MES_SCH_API_MISC"
+       "MES_SCH_API_UPDATE_ROOT_PAGE_TABLE"
+       "MES_SCH_API_AMD_LOG"
+};
+
+static const char *mes_v11_0_misc_opcodes[] = {
+       "MESAPI_MISC__WRITE_REG",
+       "MESAPI_MISC__INV_GART",
+       "MESAPI_MISC__QUERY_STATUS",
+       "MESAPI_MISC__READ_REG",
+       "MESAPI_MISC__WAIT_REG_MEM",
+       "MESAPI_MISC__SET_SHADER_DEBUGGER",
+};
+
+static const char *mes_v11_0_get_op_string(union MESAPI__MISC *x_pkt)
+{
+       const char *op_str = NULL;
+
+       if (x_pkt->header.opcode < ARRAY_SIZE(mes_v11_0_opcodes))
+               op_str = mes_v11_0_opcodes[x_pkt->header.opcode];
+
+       return op_str;
+}
+
+static const char *mes_v11_0_get_misc_op_string(union MESAPI__MISC *x_pkt)
+{
+       const char *op_str = NULL;
+
+       if ((x_pkt->header.opcode == MES_SCH_API_MISC) &&
+           (x_pkt->opcode <= ARRAY_SIZE(mes_v11_0_misc_opcodes)))
+               op_str = mes_v11_0_misc_opcodes[x_pkt->opcode];
+
+       return op_str;
+}
+
 static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
                                                    void *pkt, int size,
                                                    int api_status_off)
 {
        int ndw = size / 4;
        signed long r;
-       union MESAPI__ADD_QUEUE *x_pkt = pkt;
+       union MESAPI__MISC *x_pkt = pkt;
        struct MES_API_STATUS *api_status;
        struct amdgpu_device *adev = mes->adev;
        struct amdgpu_ring *ring = &mes->ring;
        unsigned long flags;
        signed long timeout = 3000000; /* 3000 ms */
+       const char *op_str, *misc_op_str;
+
+       if (x_pkt->header.opcode >= MES_SCH_API_MAX)
+               return -EINVAL;

        if (amdgpu_emu_mode) {
                timeout *= 100;
@@ -135,13 +189,29 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
        amdgpu_ring_commit(ring);
        spin_unlock_irqrestore(&mes->ring_lock, flags);

-       DRM_DEBUG("MES msg=%d was emitted\n", x_pkt->header.opcode);
+       op_str = mes_v11_0_get_op_string(x_pkt);
+       misc_op_str = mes_v11_0_get_misc_op_string(x_pkt);
+
+       if (misc_op_str)
+               dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, misc_op_str);
+       else if (op_str)
+               dev_dbg(adev->dev, "MES msg=%s was emitted\n", op_str);
+       else
+               dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode);

        r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
                      timeout);
        if (r < 1) {
-               DRM_ERROR("MES failed to response msg=%d\n",
-                         x_pkt->header.opcode);
+
+               if (misc_op_str)
+                       dev_err(adev->dev, "MES failed to respond to msg=%s (%s)\n",
+                               op_str, misc_op_str);
+               else if (op_str)
+                       dev_err(adev->dev, "MES failed to respond to msg=%s\n",
+                               op_str);
+               else
+                       dev_err(adev->dev, "MES failed to respond to msg=%d\n",
+                               x_pkt->header.opcode);

                while (halt_if_hws_hang)
                        schedule();
--
2.34.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous
  2024-04-17 12:49   ` Christian König
@ 2024-04-17 18:52     ` Liu, Shaoyun
  2024-04-17 19:21       ` Alex Deucher
  0 siblings, 1 reply; 10+ messages in thread
From: Liu, Shaoyun @ 2024-04-17 18:52 UTC (permalink / raw)
  To: Koenig, Christian, Chen, Horace, amd-gfx
  Cc: Andrey Grodzovsky, Kuehling, Felix, Deucher, Alexander, Xiao,
	Jack, Zhang, Hawking, Liu, Monk, Xu, Feifei, Chang, HaiJun,
	Leo Liu, Liu, Jenny (Jing)

[AMD Official Use Only - General]

I had a discussion with Christian about this before. The conclusion is that the driver should prevent multiple processes from using the MES ring at the same time. Also, for the current MES ring usage, the driver doesn't have the logic to prevent the ring from being overflowed, and we don't hit the issue because MES will wait polling for each MES submission. If we want to change the MES to work asynchronously, we need to consider a way to avoid this (similar to adding the limit in the fence handling we use for KIQ and HMM paging).

Regards
Shaoyun.liu

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Christian König
Sent: Wednesday, April 17, 2024 8:49 AM
To: Chen, Horace <Horace.Chen@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Xu, Feifei <Feifei.Xu@amd.com>; Chang, HaiJun <HaiJun.Chang@amd.com>; Leo Liu <leo.liiu@amd.com>; Liu, Jenny (Jing) <Jenny-Jing.Liu@amd.com>
Subject: Re: [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous

Am 17.04.24 um 13:30 schrieb Horace Chen:
> The MES firmware expects synchronous operation with the driver.  For
> this to work asynchronously, each caller would need to provide its own
> fence location and sequence number.

Well that's certainly not correct. The seqno takes care that we can wait async for the submission to complete.

So clear NAK for that patch here.

Regards,
Christian.

>
> For now, add a mutex lock to serialize the MES submission.
> For SR-IOV long-wait case, break the long-wait to separated part to
> prevent this wait from impacting reset sequence.
>
> Signed-off-by: Horace Chen <horace.chen@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c |  3 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  | 18 ++++++++++++++----
>   3 files changed, 18 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 78e4f88f5134..8896be95b2c8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -137,6 +137,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>       spin_lock_init(&adev->mes.queue_id_lock);
>       spin_lock_init(&adev->mes.ring_lock);
>       mutex_init(&adev->mes.mutex_hidden);
> +     mutex_init(&adev->mes.submission_lock);
>
>       adev->mes.total_max_queue = AMDGPU_FENCE_MES_QUEUE_ID_MASK;
>       adev->mes.vmid_mask_mmhub = 0xffffff00; @@ -221,6 +222,7 @@ int
> amdgpu_mes_init(struct amdgpu_device *adev)
>       idr_destroy(&adev->mes.queue_id_idr);
>       ida_destroy(&adev->mes.doorbell_ida);
>       mutex_destroy(&adev->mes.mutex_hidden);
> +     mutex_destroy(&adev->mes.submission_lock);
>       return r;
>   }
>
> @@ -240,6 +242,7 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
>       idr_destroy(&adev->mes.queue_id_idr);
>       ida_destroy(&adev->mes.doorbell_ida);
>       mutex_destroy(&adev->mes.mutex_hidden);
> +     mutex_destroy(&adev->mes.submission_lock);
>   }
>
>   static void amdgpu_mes_queue_free_mqd(struct amdgpu_mes_queue *q)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 6b3e1844eac5..90af935cc889 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -85,6 +85,7 @@ struct amdgpu_mes {
>
>       struct amdgpu_ring              ring;
>       spinlock_t                      ring_lock;
> +     struct mutex                    submission_lock;
>
>       const struct firmware           *fw[AMDGPU_MAX_MES_PIPES];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index e40d00afd4f5..0a609a5b8835 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -162,6 +162,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>       struct amdgpu_ring *ring = &mes->ring;
>       unsigned long flags;
>       signed long timeout = adev->usec_timeout;
> +     signed long retry_count = 1;
>       const char *op_str, *misc_op_str;
>
>       if (x_pkt->header.opcode >= MES_SCH_API_MAX) @@ -169,15 +170,19 @@
> static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes
> *mes,
>
>       if (amdgpu_emu_mode) {
>               timeout *= 100;
> -     } else if (amdgpu_sriov_vf(adev)) {
> +     }
> +
> +     if (amdgpu_sriov_vf(adev) && timeout > 0) {
>               /* Worst case in sriov where all other 15 VF timeout, each VF needs about 600ms */
> -             timeout = 15 * 600 * 1000;
> +             retry_count = (15 * 600 * 1000) / timeout;
>       }
>       BUG_ON(size % 4 != 0);
>
> +     mutex_lock(&mes->submission_lock);
>       spin_lock_irqsave(&mes->ring_lock, flags);
>       if (amdgpu_ring_alloc(ring, ndw)) {
>               spin_unlock_irqrestore(&mes->ring_lock, flags);
> +             mutex_unlock(&mes->submission_lock);
>               return -ENOMEM;
>       }
>
> @@ -199,8 +204,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>       else
>               dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> x_pkt->header.opcode);
>
> -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
> -                   timeout);
> +     do {
> +             r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
> +                             timeout);
> +             retry_count--;
> +     } while (retry_count > 0 && !amdgpu_in_reset(adev));
> +
> +     mutex_unlock(&mes->submission_lock);
>       if (r < 1) {
>
>               if (misc_op_str)


^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH 1/3] drm/amdgpu/mes11: print MES opcodes rather than numbers
  2024-04-17 11:30 [PATCH 1/3] drm/amdgpu/mes11: print MES opcodes rather than numbers Horace Chen
                   ` (2 preceding siblings ...)
  2024-04-17 14:16 ` [PATCH 1/3] drm/amdgpu/mes11: print MES opcodes rather than numbers Kasiviswanathan, Harish
@ 2024-04-17 18:53 ` Liu, Shaoyun
  3 siblings, 0 replies; 10+ messages in thread
From: Liu, Shaoyun @ 2024-04-17 18:53 UTC (permalink / raw)
  To: Chen, Horace, amd-gfx
  Cc: Andrey Grodzovsky, Kuehling, Felix, Chen, Horace, Koenig,
	Christian, Deucher, Alexander, Xiao, Jack, Zhang, Hawking, Liu,
	Monk, Xu, Feifei, Chang, HaiJun, Leo Liu, Liu, Jenny (Jing),
	Deucher, Alexander

[AMD Official Use Only - General]

Looks good to me.
Reviewed-by: Shaoyun Liu <Shaoyun.liu@amd.com>

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Horace Chen
Sent: Wednesday, April 17, 2024 7:30 AM
To: amd-gfx@lists.freedesktop.org
Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Chen, Horace <Horace.Chen@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Xu, Feifei <Feifei.Xu@amd.com>; Chang, HaiJun <HaiJun.Chang@amd.com>; Leo Liu <leo.liiu@amd.com>; Liu, Jenny (Jing) <Jenny-Jing.Liu@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: [PATCH 1/3] drm/amdgpu/mes11: print MES opcodes rather than numbers

From: Alex Deucher <alexander.deucher@amd.com>

Makes it easier to review the logs when there are MES errors.

v2: use dbg for emitted, add helpers for fetching strings

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 78 ++++++++++++++++++++++++--
 1 file changed, 74 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 81833395324a..784343fb7470 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -100,18 +100,72 @@ static const struct amdgpu_ring_funcs mes_v11_0_ring_funcs = {
        .insert_nop = amdgpu_ring_insert_nop,
 };

+static const char *mes_v11_0_opcodes[] = {
+       "MES_SCH_API_SET_HW_RSRC",
+       "MES_SCH_API_SET_SCHEDULING_CONFIG",
+       "MES_SCH_API_ADD_QUEUE"
+       "MES_SCH_API_REMOVE_QUEUE"
+       "MES_SCH_API_PERFORM_YIELD"
+       "MES_SCH_API_SET_GANG_PRIORITY_LEVEL"
+       "MES_SCH_API_SUSPEND"
+       "MES_SCH_API_RESUME"
+       "MES_SCH_API_RESET"
+       "MES_SCH_API_SET_LOG_BUFFER"
+       "MES_SCH_API_CHANGE_GANG_PRORITY"
+       "MES_SCH_API_QUERY_SCHEDULER_STATUS"
+       "MES_SCH_API_PROGRAM_GDS"
+       "MES_SCH_API_SET_DEBUG_VMID"
+       "MES_SCH_API_MISC"
+       "MES_SCH_API_UPDATE_ROOT_PAGE_TABLE"
+       "MES_SCH_API_AMD_LOG"
+};
+
+static const char *mes_v11_0_misc_opcodes[] = {
+       "MESAPI_MISC__WRITE_REG",
+       "MESAPI_MISC__INV_GART",
+       "MESAPI_MISC__QUERY_STATUS",
+       "MESAPI_MISC__READ_REG",
+       "MESAPI_MISC__WAIT_REG_MEM",
+       "MESAPI_MISC__SET_SHADER_DEBUGGER",
+};
+
+static const char *mes_v11_0_get_op_string(union MESAPI__MISC *x_pkt) {
+       const char *op_str = NULL;
+
+       if (x_pkt->header.opcode < ARRAY_SIZE(mes_v11_0_opcodes))
+               op_str = mes_v11_0_opcodes[x_pkt->header.opcode];
+
+       return op_str;
+}
+
+static const char *mes_v11_0_get_misc_op_string(union MESAPI__MISC
+*x_pkt) {
+       const char *op_str = NULL;
+
+       if ((x_pkt->header.opcode == MES_SCH_API_MISC) &&
+           (x_pkt->opcode <= ARRAY_SIZE(mes_v11_0_misc_opcodes)))
+               op_str = mes_v11_0_misc_opcodes[x_pkt->opcode];
+
+       return op_str;
+}
+
 static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
                                                    void *pkt, int size,
                                                    int api_status_off)
 {
        int ndw = size / 4;
        signed long r;
-       union MESAPI__ADD_QUEUE *x_pkt = pkt;
+       union MESAPI__MISC *x_pkt = pkt;
        struct MES_API_STATUS *api_status;
        struct amdgpu_device *adev = mes->adev;
        struct amdgpu_ring *ring = &mes->ring;
        unsigned long flags;
        signed long timeout = 3000000; /* 3000 ms */
+       const char *op_str, *misc_op_str;
+
+       if (x_pkt->header.opcode >= MES_SCH_API_MAX)
+               return -EINVAL;

        if (amdgpu_emu_mode) {
                timeout *= 100;
@@ -135,13 +189,29 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
        amdgpu_ring_commit(ring);
        spin_unlock_irqrestore(&mes->ring_lock, flags);

-       DRM_DEBUG("MES msg=%d was emitted\n", x_pkt->header.opcode);
+       op_str = mes_v11_0_get_op_string(x_pkt);
+       misc_op_str = mes_v11_0_get_misc_op_string(x_pkt);
+
+       if (misc_op_str)
+               dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, misc_op_str);
+       else if (op_str)
+               dev_dbg(adev->dev, "MES msg=%s was emitted\n", op_str);
+       else
+               dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode);

        r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
                      timeout);
        if (r < 1) {
-               DRM_ERROR("MES failed to response msg=%d\n",
-                         x_pkt->header.opcode);
+
+               if (misc_op_str)
+                       dev_err(adev->dev, "MES failed to respond to msg=%s (%s)\n",
+                               op_str, misc_op_str);
+               else if (op_str)
+                       dev_err(adev->dev, "MES failed to respond to msg=%s\n",
+                               op_str);
+               else
+                       dev_err(adev->dev, "MES failed to respond to msg=%d\n",
+                               x_pkt->header.opcode);

                while (halt_if_hws_hang)
                        schedule();
--
2.34.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous
  2024-04-17 18:52     ` Liu, Shaoyun
@ 2024-04-17 19:21       ` Alex Deucher
  2024-04-18  5:59         ` Christian König
  0 siblings, 1 reply; 10+ messages in thread
From: Alex Deucher @ 2024-04-17 19:21 UTC (permalink / raw)
  To: Liu, Shaoyun
  Cc: Koenig, Christian, Chen, Horace, amd-gfx, Andrey Grodzovsky,
	Kuehling, Felix, Deucher, Alexander, Xiao, Jack, Zhang, Hawking,
	Liu, Monk, Xu, Feifei, Chang, HaiJun, Leo Liu, Liu, Jenny (Jing)

On Wed, Apr 17, 2024 at 3:17 PM Liu, Shaoyun <Shaoyun.Liu@amd.com> wrote:
>
> [AMD Official Use Only - General]
>
> I have  a discussion with Christian about this before .  The conclusion is that driver should prevent multiple process from using  the  MES ring at the same time . Also for current MES  ring usage ,driver doesn't have the  logic to prevent the ring  been  overflowed and we doesn't hit the issue because MES will wait polling for each MES submission . If we want to change the MES to work asynchronously , we need to consider a way to avoid this (similar to add the limit in the fence handling we use for kiq and  HMM paging)
>

I think we need a separate fence (different GPU address and seq
number) per request.  Then each caller can wait independently.

Alex

> Regards
> Shaoyun.liu
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Christian König
> Sent: Wednesday, April 17, 2024 8:49 AM
> To: Chen, Horace <Horace.Chen@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Xu, Feifei <Feifei.Xu@amd.com>; Chang, HaiJun <HaiJun.Chang@amd.com>; Leo Liu <leo.liiu@amd.com>; Liu, Jenny (Jing) <Jenny-Jing.Liu@amd.com>
> Subject: Re: [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous
>
> Am 17.04.24 um 13:30 schrieb Horace Chen:
> > The MES firmware expects synchronous operation with the driver.  For
> > this to work asynchronously, each caller would need to provide its own
> > fence location and sequence number.
>
> Well that's certainly not correct. The seqno takes care that we can wait async for the submission to complete.
>
> So clear NAK for that patch here.
>
> Regards,
> Christian.
>
> >
> > For now, add a mutex lock to serialize the MES submission.
> > For SR-IOV long-wait case, break the long-wait to separated part to
> > prevent this wait from impacting reset sequence.
> >
> > Signed-off-by: Horace Chen <horace.chen@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c |  3 +++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  1 +
> >   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  | 18 ++++++++++++++----
> >   3 files changed, 18 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > index 78e4f88f5134..8896be95b2c8 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > @@ -137,6 +137,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
> >       spin_lock_init(&adev->mes.queue_id_lock);
> >       spin_lock_init(&adev->mes.ring_lock);
> >       mutex_init(&adev->mes.mutex_hidden);
> > +     mutex_init(&adev->mes.submission_lock);
> >
> >       adev->mes.total_max_queue = AMDGPU_FENCE_MES_QUEUE_ID_MASK;
> >       adev->mes.vmid_mask_mmhub = 0xffffff00; @@ -221,6 +222,7 @@ int
> > amdgpu_mes_init(struct amdgpu_device *adev)
> >       idr_destroy(&adev->mes.queue_id_idr);
> >       ida_destroy(&adev->mes.doorbell_ida);
> >       mutex_destroy(&adev->mes.mutex_hidden);
> > +     mutex_destroy(&adev->mes.submission_lock);
> >       return r;
> >   }
> >
> > @@ -240,6 +242,7 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
> >       idr_destroy(&adev->mes.queue_id_idr);
> >       ida_destroy(&adev->mes.doorbell_ida);
> >       mutex_destroy(&adev->mes.mutex_hidden);
> > +     mutex_destroy(&adev->mes.submission_lock);
> >   }
> >
> >   static void amdgpu_mes_queue_free_mqd(struct amdgpu_mes_queue *q)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> > index 6b3e1844eac5..90af935cc889 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> > @@ -85,6 +85,7 @@ struct amdgpu_mes {
> >
> >       struct amdgpu_ring              ring;
> >       spinlock_t                      ring_lock;
> > +     struct mutex                    submission_lock;
> >
> >       const struct firmware           *fw[AMDGPU_MAX_MES_PIPES];
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > index e40d00afd4f5..0a609a5b8835 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > @@ -162,6 +162,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >       struct amdgpu_ring *ring = &mes->ring;
> >       unsigned long flags;
> >       signed long timeout = adev->usec_timeout;
> > +     signed long retry_count = 1;
> >       const char *op_str, *misc_op_str;
> >
> >       if (x_pkt->header.opcode >= MES_SCH_API_MAX) @@ -169,15 +170,19 @@
> > static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes
> > *mes,
> >
> >       if (amdgpu_emu_mode) {
> >               timeout *= 100;
> > -     } else if (amdgpu_sriov_vf(adev)) {
> > +     }
> > +
> > +     if (amdgpu_sriov_vf(adev) && timeout > 0) {
> >               /* Worst case in sriov where all other 15 VF timeout, each VF needs about 600ms */
> > -             timeout = 15 * 600 * 1000;
> > +             retry_count = (15 * 600 * 1000) / timeout;
> >       }
> >       BUG_ON(size % 4 != 0);
> >
> > +     mutex_lock(&mes->submission_lock);
> >       spin_lock_irqsave(&mes->ring_lock, flags);
> >       if (amdgpu_ring_alloc(ring, ndw)) {
> >               spin_unlock_irqrestore(&mes->ring_lock, flags);
> > +             mutex_unlock(&mes->submission_lock);
> >               return -ENOMEM;
> >       }
> >
> > @@ -199,8 +204,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >       else
> >               dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> > x_pkt->header.opcode);
> >
> > -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
> > -                   timeout);
> > +     do {
> > +             r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
> > +                             timeout);
> > +             retry_count--;
> > +     } while (retry_count > 0 && !amdgpu_in_reset(adev));
> > +
> > +     mutex_unlock(&mes->submission_lock);
> >       if (r < 1) {
> >
> >               if (misc_op_str)
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous
  2024-04-17 19:21       ` Alex Deucher
@ 2024-04-18  5:59         ` Christian König
  2024-04-18 14:42           ` Liu, Shaoyun
  0 siblings, 1 reply; 10+ messages in thread
From: Christian König @ 2024-04-18  5:59 UTC (permalink / raw)
  To: Alex Deucher, Liu, Shaoyun
  Cc: Chen, Horace, amd-gfx, Andrey Grodzovsky, Kuehling, Felix,
	Deucher, Alexander, Xiao, Jack, Zhang, Hawking, Liu, Monk, Xu,
	Feifei, Chang, HaiJun, Leo Liu, Liu, Jenny (Jing)

Am 17.04.24 um 21:21 schrieb Alex Deucher:
> On Wed, Apr 17, 2024 at 3:17 PM Liu, Shaoyun <Shaoyun.Liu@amd.com> wrote:
>> [AMD Official Use Only - General]
>>
>> I have  a discussion with Christian about this before .  The conclusion is that driver should prevent multiple process from using  the  MES ring at the same time . Also for current MES  ring usage ,driver doesn't have the  logic to prevent the ring  been  overflowed and we doesn't hit the issue because MES will wait polling for each MES submission . If we want to change the MES to work asynchronously , we need to consider a way to avoid this (similar to add the limit in the fence handling we use for kiq and  HMM paging)
>>
> I think we need a separate fence (different GPU address and seq
> number) per request.  Then each caller can wait independently.

Well no, we need to modify the MES firmware to stop abusing the fence as 
signaling mechanism for the result of an operation.

I've pointed that out before and I think this is a hard requirement for 
correct operation.

Additional to that retrying on the reset flag looks like another broken 
workaround to me.

So just to make it clear this approach is a NAK from my side, don't 
commit that.

Regards,
Christian.

>
> Alex
>
>> Regards
>> Shaoyun.liu
>>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Christian König
>> Sent: Wednesday, April 17, 2024 8:49 AM
>> To: Chen, Horace <Horace.Chen@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Xu, Feifei <Feifei.Xu@amd.com>; Chang, HaiJun <HaiJun.Chang@amd.com>; Leo Liu <leo.liiu@amd.com>; Liu, Jenny (Jing) <Jenny-Jing.Liu@amd.com>
>> Subject: Re: [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous
>>
>> Am 17.04.24 um 13:30 schrieb Horace Chen:
>>> The MES firmware expects synchronous operation with the driver.  For
>>> this to work asynchronously, each caller would need to provide its own
>>> fence location and sequence number.
>> Well that's certainly not correct. The seqno takes care that we can wait async for the submission to complete.
>>
>> So clear NAK for that patch here.
>>
>> Regards,
>> Christian.
>>
>>> For now, add a mutex lock to serialize the MES submission.
>>> For SR-IOV long-wait case, break the long-wait to separated part to
>>> prevent this wait from impacting reset sequence.
>>>
>>> Signed-off-by: Horace Chen <horace.chen@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c |  3 +++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  | 18 ++++++++++++++----
>>>    3 files changed, 18 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>>> index 78e4f88f5134..8896be95b2c8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>>> @@ -137,6 +137,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>>>        spin_lock_init(&adev->mes.queue_id_lock);
>>>        spin_lock_init(&adev->mes.ring_lock);
>>>        mutex_init(&adev->mes.mutex_hidden);
>>> +     mutex_init(&adev->mes.submission_lock);
>>>
>>>        adev->mes.total_max_queue = AMDGPU_FENCE_MES_QUEUE_ID_MASK;
>>>        adev->mes.vmid_mask_mmhub = 0xffffff00; @@ -221,6 +222,7 @@ int
>>> amdgpu_mes_init(struct amdgpu_device *adev)
>>>        idr_destroy(&adev->mes.queue_id_idr);
>>>        ida_destroy(&adev->mes.doorbell_ida);
>>>        mutex_destroy(&adev->mes.mutex_hidden);
>>> +     mutex_destroy(&adev->mes.submission_lock);
>>>        return r;
>>>    }
>>>
>>> @@ -240,6 +242,7 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
>>>        idr_destroy(&adev->mes.queue_id_idr);
>>>        ida_destroy(&adev->mes.doorbell_ida);
>>>        mutex_destroy(&adev->mes.mutex_hidden);
>>> +     mutex_destroy(&adev->mes.submission_lock);
>>>    }
>>>
>>>    static void amdgpu_mes_queue_free_mqd(struct amdgpu_mes_queue *q)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>> index 6b3e1844eac5..90af935cc889 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>> @@ -85,6 +85,7 @@ struct amdgpu_mes {
>>>
>>>        struct amdgpu_ring              ring;
>>>        spinlock_t                      ring_lock;
>>> +     struct mutex                    submission_lock;
>>>
>>>        const struct firmware           *fw[AMDGPU_MAX_MES_PIPES];
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>> index e40d00afd4f5..0a609a5b8835 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>> @@ -162,6 +162,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>>>        struct amdgpu_ring *ring = &mes->ring;
>>>        unsigned long flags;
>>>        signed long timeout = adev->usec_timeout;
>>> +     signed long retry_count = 1;
>>>        const char *op_str, *misc_op_str;
>>>
>>>        if (x_pkt->header.opcode >= MES_SCH_API_MAX) @@ -169,15 +170,19 @@
>>> static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes
>>> *mes,
>>>
>>>        if (amdgpu_emu_mode) {
>>>                timeout *= 100;
>>> -     } else if (amdgpu_sriov_vf(adev)) {
>>> +     }
>>> +
>>> +     if (amdgpu_sriov_vf(adev) && timeout > 0) {
>>>                /* Worst case in sriov where all other 15 VF timeout, each VF needs about 600ms */
>>> -             timeout = 15 * 600 * 1000;
>>> +             retry_count = (15 * 600 * 1000) / timeout;
>>>        }
>>>        BUG_ON(size % 4 != 0);
>>>
>>> +     mutex_lock(&mes->submission_lock);
>>>        spin_lock_irqsave(&mes->ring_lock, flags);
>>>        if (amdgpu_ring_alloc(ring, ndw)) {
>>>                spin_unlock_irqrestore(&mes->ring_lock, flags);
>>> +             mutex_unlock(&mes->submission_lock);
>>>                return -ENOMEM;
>>>        }
>>>
>>> @@ -199,8 +204,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>>>        else
>>>                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>>> x_pkt->header.opcode);
>>>
>>> -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
>>> -                   timeout);
>>> +     do {
>>> +             r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
>>> +                             timeout);
>>> +             retry_count--;
>>> +     } while (retry_count > 0 && !amdgpu_in_reset(adev));
>>> +
>>> +     mutex_unlock(&mes->submission_lock);
>>>        if (r < 1) {
>>>
>>>                if (misc_op_str)


^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous
  2024-04-18  5:59         ` Christian König
@ 2024-04-18 14:42           ` Liu, Shaoyun
  0 siblings, 0 replies; 10+ messages in thread
From: Liu, Shaoyun @ 2024-04-18 14:42 UTC (permalink / raw)
  To: Koenig, Christian, Alex Deucher
  Cc: Chen, Horace, amd-gfx, Andrey Grodzovsky, Kuehling, Felix,
	Deucher, Alexander, Xiao, Jack, Zhang, Hawking, Liu, Monk, Xu,
	Feifei, Chang, HaiJun, Leo Liu, Liu, Jenny (Jing)

[AMD Official Use Only - General]

I think the only user of the MES is the kernel driver, and all submissions to MES need to be synchronized; the driver already has the mes->ring_lock to ensure that. I don't see why it is necessary to use a separate fence per request.
MES does update the API status for each API (it updates the fence seq at a specific address), but the driver can decide whether to check it. For now, MES doesn't generate an interrupt as a fence signal, but the driver side can poll the fence seq.

Regards
Shaoyun.liu
-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com>
Sent: Thursday, April 18, 2024 1:59 AM
To: Alex Deucher <alexdeucher@gmail.com>; Liu, Shaoyun <Shaoyun.Liu@amd.com>
Cc: Chen, Horace <Horace.Chen@amd.com>; amd-gfx@lists.freedesktop.org; Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Xu, Feifei <Feifei.Xu@amd.com>; Chang, HaiJun <HaiJun.Chang@amd.com>; Leo Liu <leo.liiu@amd.com>; Liu, Jenny (Jing) <Jenny-Jing.Liu@amd.com>
Subject: Re: [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous

Am 17.04.24 um 21:21 schrieb Alex Deucher:
> On Wed, Apr 17, 2024 at 3:17 PM Liu, Shaoyun <Shaoyun.Liu@amd.com> wrote:
>> [AMD Official Use Only - General]
>>
>> I have  a discussion with Christian about this before .  The
>> conclusion is that driver should prevent multiple process from using
>> the  MES ring at the same time . Also for current MES  ring usage
>> ,driver doesn't have the  logic to prevent the ring  been  overflowed
>> and we doesn't hit the issue because MES will wait polling for each
>> MES submission . If we want to change the MES to work asynchronously
>> , we need to consider a way to avoid this (similar to add the limit
>> in the fence handling we use for kiq and  HMM paging)
>>
> I think we need a separate fence (different GPU address and seq
> number) per request.  Then each caller can wait independently.

Well no, we need to modify the MES firmware to stop abusing the fence as signaling mechanism for the result of an operation.

I've pointed that out before and I think this is a hard requirement for correct operation.

Additional to that retrying on the reset flag looks like another broken workaround to me.

So just to make it clear this approach is a NAK from my side, don't commit that.

Regards,
Christian.

>
> Alex
>
>> Regards
>> Shaoyun.liu
>>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> Christian König
>> Sent: Wednesday, April 17, 2024 8:49 AM
>> To: Chen, Horace <Horace.Chen@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>; Kuehling, Felix
>> <Felix.Kuehling@amd.com>; Deucher, Alexander
>> <Alexander.Deucher@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhang,
>> Hawking <Hawking.Zhang@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Xu,
>> Feifei <Feifei.Xu@amd.com>; Chang, HaiJun <HaiJun.Chang@amd.com>; Leo
>> Liu <leo.liiu@amd.com>; Liu, Jenny (Jing) <Jenny-Jing.Liu@amd.com>
>> Subject: Re: [PATCH 3/3] drm/amdgpu/mes11: make fence waits
>> synchronous
>>
>> Am 17.04.24 um 13:30 schrieb Horace Chen:
>>> The MES firmware expects synchronous operation with the driver.  For
>>> this to work asynchronously, each caller would need to provide its
>>> own fence location and sequence number.
>> Well that's certainly not correct. The seqno takes care that we can wait async for the submission to complete.
>>
>> So clear NAK for that patch here.
>>
>> Regards,
>> Christian.
>>
>>> For now, add a mutex lock to serialize the MES submission.
>>> For SR-IOV long-wait case, break the long-wait to separated part to
>>> prevent this wait from impacting reset sequence.
>>>
>>> Signed-off-by: Horace Chen <horace.chen@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c |  3 +++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  | 18 ++++++++++++++----
>>>    3 files changed, 18 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>>> index 78e4f88f5134..8896be95b2c8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>>> @@ -137,6 +137,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>>>        spin_lock_init(&adev->mes.queue_id_lock);
>>>        spin_lock_init(&adev->mes.ring_lock);
>>>        mutex_init(&adev->mes.mutex_hidden);
>>> +     mutex_init(&adev->mes.submission_lock);
>>>
>>>        adev->mes.total_max_queue = AMDGPU_FENCE_MES_QUEUE_ID_MASK;
>>>        adev->mes.vmid_mask_mmhub = 0xffffff00; @@ -221,6 +222,7 @@
>>> int amdgpu_mes_init(struct amdgpu_device *adev)
>>>        idr_destroy(&adev->mes.queue_id_idr);
>>>        ida_destroy(&adev->mes.doorbell_ida);
>>>        mutex_destroy(&adev->mes.mutex_hidden);
>>> +     mutex_destroy(&adev->mes.submission_lock);
>>>        return r;
>>>    }
>>>
>>> @@ -240,6 +242,7 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
>>>        idr_destroy(&adev->mes.queue_id_idr);
>>>        ida_destroy(&adev->mes.doorbell_ida);
>>>        mutex_destroy(&adev->mes.mutex_hidden);
>>> +     mutex_destroy(&adev->mes.submission_lock);
>>>    }
>>>
>>>    static void amdgpu_mes_queue_free_mqd(struct amdgpu_mes_queue *q)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>> index 6b3e1844eac5..90af935cc889 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>> @@ -85,6 +85,7 @@ struct amdgpu_mes {
>>>
>>>        struct amdgpu_ring              ring;
>>>        spinlock_t                      ring_lock;
>>> +     struct mutex                    submission_lock;
>>>
>>>        const struct firmware           *fw[AMDGPU_MAX_MES_PIPES];
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>> index e40d00afd4f5..0a609a5b8835 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>> @@ -162,6 +162,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>>>        struct amdgpu_ring *ring = &mes->ring;
>>>        unsigned long flags;
>>>        signed long timeout = adev->usec_timeout;
>>> +     signed long retry_count = 1;
>>>        const char *op_str, *misc_op_str;
>>>
>>>        if (x_pkt->header.opcode >= MES_SCH_API_MAX) @@ -169,15
>>> +170,19 @@ static int
>>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>>>
>>>        if (amdgpu_emu_mode) {
>>>                timeout *= 100;
>>> -     } else if (amdgpu_sriov_vf(adev)) {
>>> +     }
>>> +
>>> +     if (amdgpu_sriov_vf(adev) && timeout > 0) {
>>>                /* Worst case in sriov where all other 15 VF timeout, each VF needs about 600ms */
>>> -             timeout = 15 * 600 * 1000;
>>> +             retry_count = (15 * 600 * 1000) / timeout;
>>>        }
>>>        BUG_ON(size % 4 != 0);
>>>
>>> +     mutex_lock(&mes->submission_lock);
>>>        spin_lock_irqsave(&mes->ring_lock, flags);
>>>        if (amdgpu_ring_alloc(ring, ndw)) {
>>>                spin_unlock_irqrestore(&mes->ring_lock, flags);
>>> +             mutex_unlock(&mes->submission_lock);
>>>                return -ENOMEM;
>>>        }
>>>
>>> @@ -199,8 +204,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>>>        else
>>>                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>>> x_pkt->header.opcode);
>>>
>>> -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
>>> -                   timeout);
>>> +     do {
>>> +             r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
>>> +                             timeout);
>>> +             retry_count--;
>>> +     } while (retry_count > 0 && !amdgpu_in_reset(adev));
>>> +
>>> +     mutex_unlock(&mes->submission_lock);
>>>        if (r < 1) {
>>>
>>>                if (misc_op_str)


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-04-18 14:42 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-17 11:30 [PATCH 1/3] drm/amdgpu/mes11: print MES opcodes rather than numbers Horace Chen
2024-04-17 11:30 ` [PATCH 2/3] Revert "drm/amdgpu: increase mes submission timeout" Horace Chen
2024-04-17 11:30 ` [PATCH 3/3] drm/amdgpu/mes11: make fence waits synchronous Horace Chen
2024-04-17 12:49   ` Christian König
2024-04-17 18:52     ` Liu, Shaoyun
2024-04-17 19:21       ` Alex Deucher
2024-04-18  5:59         ` Christian König
2024-04-18 14:42           ` Liu, Shaoyun
2024-04-17 14:16 ` [PATCH 1/3] drm/amdgpu/mes11: print MES opcodes rather than numbers Kasiviswanathan, Harish
2024-04-17 18:53 ` Liu, Shaoyun

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).