All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V2 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop
@ 2022-01-30  7:11 yipechai
  2022-01-30  7:11 ` [PATCH V2 2/2] Revert "drm/amdgpu: Add judgement to avoid infinite loop" yipechai
  2022-01-30  7:35 ` [PATCH V2 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop Zhou1, Tao
  0 siblings, 2 replies; 3+ messages in thread
From: yipechai @ 2022-01-30  7:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, John.Clements, yipechai, yipechai

1. The infinite loop case only occurs on multiple cards support
   ras functions.
2. The explanation of root cause refer to commit 76641cbbf196
   ("drm/amdgpu: Add judgement to avoid infinite loop").
3. Create new node to manage each unique ras instance to guarantee
   each device .ras_list is completely independent.
4. Fixes: commit 7a6b8ab3231b51 ("drm/amdgpu: Unify ras block
   interface for each ras block").
5. The soft locked logs are as follows:
[  262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G           OE     5.13.0-27-generic #29~20.04.1-Ubuntu
[  262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU, BIOS T20200717143848 07/17/2020
[  262.165698] Workqueue: events amdgpu_ras_do_recovery [amdgpu]
[  262.165980] RIP: 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu]
[  262.166239] Code: 68 d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 32 44 89 fa 44 89 e6 4c 89 ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 28 4d 89 f5 48 83 e8 28 48 39 d3 74 25 49 89 c6 49 8b 45
[  262.166243] RSP: 0018:ffffac908fa87d80 EFLAGS: 00000202
[  262.166247] RAX: ffffffffc1394248 RBX: ffff91e4ab8d6e20 RCX: ffffffffc1394248
[  262.166249] RDX: ffff91e4aa356e20 RSI: 000000000000000e RDI: ffff91e4ab8c0000
[  262.166252] RBP: ffffac908fa87da8 R08: 0000000000000007 R09: 0000000000000001
[  262.166254] R10: ffff91e4930b64ec R11: 0000000000000000 R12: 000000000000000e
[  262.166256] R13: ffff91e4aa356df8 R14: ffffffffc1394320 R15: 0000000000000003
[  262.166258] FS:  0000000000000000(0000) GS:ffff92238fb40000(0000) knlGS:0000000000000000
[  262.166261] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  262.166264] CR2: 00000001004865d0 CR3: 000000406d796000 CR4: 0000000000350ee0
[  262.166267] Call Trace:
[  262.166272]  amdgpu_ras_do_recovery+0x130/0x290 [amdgpu]
[  262.166529]  ? psi_task_switch+0xd2/0x250
[  262.166537]  ? __switch_to+0x11d/0x460
[  262.166542]  ? __switch_to_asm+0x36/0x70
[  262.166549]  process_one_work+0x220/0x3c0
[  262.166556]  worker_thread+0x4d/0x3f0
[  262.166560]  ? process_one_work+0x3c0/0x3c0
[  262.166563]  kthread+0x12b/0x150
[  262.166568]  ? set_kthread_struct+0x40/0x40
[  262.166571]  ret_from_fork+0x22/0x30

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++++++++++++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 --
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9d7c778c1a2d..9b94c9c4960c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = {
 	"mca_iohc",
 };
 
+struct amdgpu_ras_block_list {
+	/* ras block link */
+	struct list_head node;
+
+	struct amdgpu_ras_block_object *ras_obj;
+};
+
 const char *get_ras_block_str(struct ras_common_if *ras_block)
 {
 	if (!ras_block)
@@ -880,7 +887,8 @@ static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_de
 					enum amdgpu_ras_block block, uint32_t sub_block_index)
 {
 	int loop_cnt = 0;
-	struct amdgpu_ras_block_object *obj, *tmp;
+	struct amdgpu_ras_block_list *node, *tmp;
+	struct amdgpu_ras_block_object *obj;
 
 	if (block >= AMDGPU_RAS_BLOCK__LAST)
 		return NULL;
@@ -888,7 +896,13 @@ static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_de
 	if (!amdgpu_ras_is_supported(adev, block))
 		return NULL;
 
-	list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
+	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
+		if (!node->ras_obj) {
+			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
+			continue;
+		}
+
+		obj = node->ras_obj;
 		if (obj->ras_block_match) {
 			if (obj->ras_block_match(obj, block, sub_block_index) == 0)
 				return obj;
@@ -2527,6 +2541,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
 
 int amdgpu_ras_fini(struct amdgpu_device *adev)
 {
+	struct amdgpu_ras_block_list *ras_node, *tmp;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
 	if (!adev->ras_enabled || !con)
@@ -2545,6 +2560,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 	amdgpu_ras_set_context(adev, NULL);
 	kfree(con);
 
+	/* Clear ras blocks from ras_list and free ras block list node */
+	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
+		list_del(&ras_node->node);
+		kfree(ras_node);
+	}
+
 	return 0;
 }
 
@@ -2754,14 +2775,22 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
 		struct amdgpu_ras_block_object *ras_block_obj)
 {
+	struct amdgpu_ras_block_list *ras_node;
 	if (!adev || !ras_block_obj)
 		return -EINVAL;
 
 	if (!amdgpu_ras_asic_supported(adev))
 		return 0;
 
-	INIT_LIST_HEAD(&ras_block_obj->node);
-	list_add_tail(&ras_block_obj->node, &adev->ras_list);
+	ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
+	if (!ras_node) {
+		dev_err(adev->dev, "Failed to allocate ras_node!\n");
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&ras_node->node);
+	ras_node->ras_obj = ras_block_obj;
+	list_add_tail(&ras_node->node, &adev->ras_list);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index a51a281bd91a..a55743b12d57 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -493,9 +493,6 @@ struct amdgpu_ras_block_object {
 
 	uint32_t sub_block_index;
 
-	/* ras block link */
-	struct list_head node;
-
 	int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj,
 				enum amdgpu_ras_block block, uint32_t sub_block_index);
 	int (*ras_late_init)(struct amdgpu_device *adev, void *ras_info);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH V2 2/2] Revert "drm/amdgpu: Add judgement to avoid infinite loop"
  2022-01-30  7:11 [PATCH V2 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop yipechai
@ 2022-01-30  7:11 ` yipechai
  2022-01-30  7:35 ` [PATCH V2 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop Zhou1, Tao
  1 sibling, 0 replies; 3+ messages in thread
From: yipechai @ 2022-01-30  7:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, John.Clements, yipechai, yipechai

The commit 8583c8983f1b ("drm/amdgpu: Fixed the defect of
soft lock caused by infinite loop") had fixed this defect.

Revert workaround commit 76641cbbf196 ("drm/amdgpu: Add
judgement to avoid infinite loop").

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9b94c9c4960c..5558df3b21f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -886,7 +886,6 @@ static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_
 static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
 					enum amdgpu_ras_block block, uint32_t sub_block_index)
 {
-	int loop_cnt = 0;
 	struct amdgpu_ras_block_list *node, *tmp;
 	struct amdgpu_ras_block_object *obj;
 
@@ -910,9 +909,6 @@ static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_de
 			if (amdgpu_ras_block_match_default(obj, block) == 0)
 				return obj;
 		}
-
-		if (++loop_cnt >= AMDGPU_RAS_BLOCK__LAST)
-			break;
 	}
 
 	return NULL;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* RE: [PATCH V2 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop
  2022-01-30  7:11 [PATCH V2 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop yipechai
  2022-01-30  7:11 ` [PATCH V2 2/2] Revert "drm/amdgpu: Add judgement to avoid infinite loop" yipechai
@ 2022-01-30  7:35 ` Zhou1, Tao
  1 sibling, 0 replies; 3+ messages in thread
From: Zhou1, Tao @ 2022-01-30  7:35 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx; +Cc: Clements, John, Zhang, Hawking

[AMD Official Use Only]

The series is:

Reviewed-by: Tao Zhou <tao.zhou1@amd.com

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Sunday, January 30, 2022 3:12 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Clements,
> John <John.Clements@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
> Subject: [PATCH V2 1/2] drm/amdgpu: Fixed the defect of soft lock caused by
> infinite loop
> 
> 1. The infinite loop case only occurs on multiple cards support
>    ras functions.
> 2. The explanation of root cause refer to commit 76641cbbf196
>    ("drm/amdgpu: Add judgement to avoid infinite loop").
> 3. Create new node to manage each unique ras instance to guarantee
>    each device .ras_list is completely independent.
> 4. Fixes: commit 7a6b8ab3231b51 ("drm/amdgpu: Unify ras block
>    interface for each ras block").
> 5. The soft locked logs are as follows:
> [  262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G           OE
> 5.13.0-27-generic #29~20.04.1-Ubuntu
> [  262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU,
> BIOS T20200717143848 07/17/2020 [  262.165698] Workqueue: events
> amdgpu_ras_do_recovery [amdgpu] [  262.165980] RIP:
> 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu] [  262.166239] Code: 68
> d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 32 44 89 fa 44 89 e6 4c 89
> ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 28 4d 89 f5 48 83 e8 28 48
> 39 d3 74 25 49 89 c6 49 8b 45 [  262.166243] RSP: 0018:ffffac908fa87d80
> EFLAGS: 00000202 [  262.166247] RAX: ffffffffc1394248 RBX: ffff91e4ab8d6e20
> RCX: ffffffffc1394248 [  262.166249] RDX: ffff91e4aa356e20 RSI:
> 000000000000000e RDI: ffff91e4ab8c0000 [  262.166252] RBP:
> ffffac908fa87da8 R08: 0000000000000007 R09: 0000000000000001
> [  262.166254] R10: ffff91e4930b64ec R11: 0000000000000000 R12:
> 000000000000000e [  262.166256] R13: ffff91e4aa356df8 R14: ffffffffc1394320
> R15: 0000000000000003 [  262.166258] FS:  0000000000000000(0000)
> GS:ffff92238fb40000(0000) knlGS:0000000000000000 [  262.166261] CS:  0010
> DS: 0000 ES: 0000 CR0: 0000000080050033 [  262.166264] CR2:
> 00000001004865d0 CR3: 000000406d796000 CR4: 0000000000350ee0
> [  262.166267] Call Trace:
> [  262.166272]  amdgpu_ras_do_recovery+0x130/0x290 [amdgpu]
> [  262.166529]  ? psi_task_switch+0xd2/0x250 [  262.166537]  ?
> __switch_to+0x11d/0x460 [  262.166542]  ? __switch_to_asm+0x36/0x70
> [  262.166549]  process_one_work+0x220/0x3c0 [  262.166556]
> worker_thread+0x4d/0x3f0 [  262.166560]  ? process_one_work+0x3c0/0x3c0
> [  262.166563]  kthread+0x12b/0x150 [  262.166568]  ?
> set_kthread_struct+0x40/0x40 [  262.166571]  ret_from_fork+0x22/0x30
> 
> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++++++++++++++++++++++--
> -  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 --
>  2 files changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 9d7c778c1a2d..9b94c9c4960c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = {
>  	"mca_iohc",
>  };
> 
> +struct amdgpu_ras_block_list {
> +	/* ras block link */
> +	struct list_head node;
> +
> +	struct amdgpu_ras_block_object *ras_obj; };
> +
>  const char *get_ras_block_str(struct ras_common_if *ras_block)  {
>  	if (!ras_block)
> @@ -880,7 +887,8 @@ static struct amdgpu_ras_block_object
> *amdgpu_ras_get_ras_block(struct amdgpu_de
>  					enum amdgpu_ras_block block,
> uint32_t sub_block_index)  {
>  	int loop_cnt = 0;
> -	struct amdgpu_ras_block_object *obj, *tmp;
> +	struct amdgpu_ras_block_list *node, *tmp;
> +	struct amdgpu_ras_block_object *obj;
> 
>  	if (block >= AMDGPU_RAS_BLOCK__LAST)
>  		return NULL;
> @@ -888,7 +896,13 @@ static struct amdgpu_ras_block_object
> *amdgpu_ras_get_ras_block(struct amdgpu_de
>  	if (!amdgpu_ras_is_supported(adev, block))
>  		return NULL;
> 
> -	list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
> +	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
> +		if (!node->ras_obj) {
> +			dev_warn(adev->dev, "Warning: abnormal ras list
> node.\n");
> +			continue;
> +		}
> +
> +		obj = node->ras_obj;
>  		if (obj->ras_block_match) {
>  			if (obj->ras_block_match(obj, block, sub_block_index)
> == 0)
>  				return obj;
> @@ -2527,6 +2541,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
> 
>  int amdgpu_ras_fini(struct amdgpu_device *adev)  {
> +	struct amdgpu_ras_block_list *ras_node, *tmp;
>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> 
>  	if (!adev->ras_enabled || !con)
> @@ -2545,6 +2560,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>  	amdgpu_ras_set_context(adev, NULL);
>  	kfree(con);
> 
> +	/* Clear ras blocks from ras_list and free ras block list node */
> +	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
> +		list_del(&ras_node->node);
> +		kfree(ras_node);
> +	}
> +
>  	return 0;
>  }
> 
> @@ -2754,14 +2775,22 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device
> *adev)  int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
>  		struct amdgpu_ras_block_object *ras_block_obj)  {
> +	struct amdgpu_ras_block_list *ras_node;
>  	if (!adev || !ras_block_obj)
>  		return -EINVAL;
> 
>  	if (!amdgpu_ras_asic_supported(adev))
>  		return 0;
> 
> -	INIT_LIST_HEAD(&ras_block_obj->node);
> -	list_add_tail(&ras_block_obj->node, &adev->ras_list);
> +	ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
> +	if (!ras_node) {
> +		dev_err(adev->dev, "Failed to allocate ras_node!\n");
> +		return -ENOMEM;
> +	}
> +
> +	INIT_LIST_HEAD(&ras_node->node);
> +	ras_node->ras_obj = ras_block_obj;
> +	list_add_tail(&ras_node->node, &adev->ras_list);
> 
>  	return 0;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index a51a281bd91a..a55743b12d57 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -493,9 +493,6 @@ struct amdgpu_ras_block_object {
> 
>  	uint32_t sub_block_index;
> 
> -	/* ras block link */
> -	struct list_head node;
> -
>  	int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj,
>  				enum amdgpu_ras_block block, uint32_t
> sub_block_index);
>  	int (*ras_late_init)(struct amdgpu_device *adev, void *ras_info);
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2022-01-30  7:35 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-30  7:11 [PATCH V2 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop yipechai
2022-01-30  7:11 ` [PATCH V2 2/2] Revert "drm/amdgpu: Add judgement to avoid infinite loop" yipechai
2022-01-30  7:35 ` [PATCH V2 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop Zhou1, Tao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.