All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/amdgpu:add fw-vram-usage for atomfirmware
@ 2017-10-31 11:55 Monk Liu
       [not found] ` <1509450922-15476-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Monk Liu @ 2017-10-31 11:55 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

otherwise PF & VF exchange is broken

Change-Id: Icbb44e640ba1c6e61914cbd234e92de001496195
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 26 ++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
index f9ffe8e..455ad63 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
@@ -71,19 +71,37 @@ int amdgpu_atomfirmware_allocate_fb_scratch(struct amdgpu_device *adev)
 	struct atom_context *ctx = adev->mode_info.atom_context;
 	int index = get_index_into_master_table(atom_master_list_of_data_tables_v2_1,
 						vram_usagebyfirmware);
+	struct vram_usagebyfirmware_v2_1 *	firmware_usage;
+	uint32_t start_addr, size;
 	uint16_t data_offset;
 	int usage_bytes = 0;
 
 	if (amdgpu_atom_parse_data_header(ctx, index, NULL, NULL, NULL, &data_offset)) {
-		struct vram_usagebyfirmware_v2_1 *firmware_usage =
-			(struct vram_usagebyfirmware_v2_1 *)(ctx->bios + data_offset);
+		printk("data_offset=%x\n", data_offset);
+		firmware_usage = (struct vram_usagebyfirmware_v2_1 *)(ctx->bios + data_offset);
 
-		DRM_DEBUG("atom firmware requested %08x %dkb fw %dkb drv\n",
+		DRM_INFO("atom firmware requested %08x %dkb fw %dkb drv\n",
 			  le32_to_cpu(firmware_usage->start_address_in_kb),
 			  le16_to_cpu(firmware_usage->used_by_firmware_in_kb),
 			  le16_to_cpu(firmware_usage->used_by_driver_in_kb));
 
-		usage_bytes = le16_to_cpu(firmware_usage->used_by_driver_in_kb) * 1024;
+		start_addr = le32_to_cpu(firmware_usage->start_address_in_kb);
+		size = le16_to_cpu(firmware_usage->used_by_firmware_in_kb);
+
+		printk("start_addr = %x, size=%x\n", start_addr, size);
+
+		if ((uint32_t)(start_addr & ATOM_VRAM_OPERATION_FLAGS_MASK) ==
+			(uint32_t)(ATOM_VRAM_BLOCK_SRIOV_MSG_SHARE_RESERVATION <<
+			ATOM_VRAM_OPERATION_FLAGS_SHIFT)) {
+			/* Firmware request VRAM reservation for SR-IOV */
+			adev->fw_vram_usage.start_offset = (start_addr &
+				(~ATOM_VRAM_OPERATION_FLAGS_MASK)) << 10;
+			adev->fw_vram_usage.size = size << 10;
+			/* Use the default scratch size */
+			usage_bytes = 0;
+		} else {
+			usage_bytes = le16_to_cpu(firmware_usage->used_by_driver_in_kb) << 10;
+		}
 	}
 	ctx->scratch_size_bytes = 0;
 	if (usage_bytes == 0)
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/2] drm/amdgpu:cleanup deprecated gpu reset logic
       [not found] ` <1509450922-15476-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-31 11:55   ` Monk Liu
       [not found]     ` <1509450922-15476-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-10-31 15:01   ` [PATCH 1/2] drm/amdgpu:add fw-vram-usage for atomfirmware Deucher, Alexander
  1 sibling, 1 reply; 8+ messages in thread
From: Monk Liu @ 2017-10-31 11:55 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

trigger gpu reset/recovery from illegle instruction IRQ
is deprecated long time ago, we already switch to recover
gpu by TDR triggering.

now please set lockup_timeout to non-zero value
in driver loading to enable TDR.

Change-Id: I45a576a97fd9859e1098e785ce857c2cf5adfba5
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 21 ---------------------
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c   |  1 -
 drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c   |  2 --
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   |  3 ---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   |  2 --
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  2 --
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c  |  1 -
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c  |  1 -
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c  |  1 -
 drivers/gpu/drm/amd/amdgpu/si_dma.c     |  1 -
 11 files changed, 36 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 6e89be5..b3453e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1459,7 +1459,6 @@ struct amdgpu_device {
 	bool				shutdown;
 	bool				need_dma32;
 	bool				accel_working;
-	struct work_struct		reset_work;
 	struct notifier_block		acpi_nb;
 	struct amdgpu_i2c_chan		*i2c_bus[AMDGPU_MAX_I2C_BUS];
 	struct amdgpu_debugfs		debugfs[AMDGPU_DEBUGFS_MAX_COMPONENTS];
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index c340774..989b530 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -73,23 +73,6 @@ static void amdgpu_hotplug_work_func(struct work_struct *work)
 	drm_helper_hpd_irq_event(dev);
 }
 
-/**
- * amdgpu_irq_reset_work_func - execute gpu reset
- *
- * @work: work struct
- *
- * Execute scheduled gpu reset (cayman+).
- * This function is called when the irq handler
- * thinks we need a gpu reset.
- */
-static void amdgpu_irq_reset_work_func(struct work_struct *work)
-{
-	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
-						  reset_work);
-
-	if (!amdgpu_sriov_vf(adev))
-		amdgpu_gpu_recover(adev, NULL);
-}
 
 /* Disable *all* interrupts */
 static void amdgpu_irq_disable_all(struct amdgpu_device *adev)
@@ -251,14 +234,11 @@ int amdgpu_irq_init(struct amdgpu_device *adev)
 				amdgpu_hotplug_work_func);
 	}
 
-	INIT_WORK(&adev->reset_work, amdgpu_irq_reset_work_func);
-
 	adev->irq.installed = true;
 	r = drm_irq_install(adev->ddev, adev->ddev->pdev->irq);
 	if (r) {
 		adev->irq.installed = false;
 		flush_work(&adev->hotplug_work);
-		cancel_work_sync(&adev->reset_work);
 		return r;
 	}
 
@@ -283,7 +263,6 @@ void amdgpu_irq_fini(struct amdgpu_device *adev)
 		if (adev->irq.msi_enabled)
 			pci_disable_msi(adev->pdev);
 		flush_work(&adev->hotplug_work);
-		cancel_work_sync(&adev->reset_work);
 	}
 
 	for (i = 0; i < AMDGPU_IH_CLIENTID_MAX; ++i) {
diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
index ed26dcb..c8d9bc1 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
@@ -1227,7 +1227,6 @@ static int cik_sdma_process_illegal_inst_irq(struct amdgpu_device *adev,
 					     struct amdgpu_iv_entry *entry)
 {
 	DRM_ERROR("Illegal instruction in SDMA command stream\n");
-	schedule_work(&adev->reset_work);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
index 9430d48..25b32b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
@@ -3443,7 +3443,6 @@ static int gfx_v6_0_priv_reg_irq(struct amdgpu_device *adev,
 				 struct amdgpu_iv_entry *entry)
 {
 	DRM_ERROR("Illegal register access in command stream\n");
-	schedule_work(&adev->reset_work);
 	return 0;
 }
 
@@ -3452,7 +3451,6 @@ static int gfx_v6_0_priv_inst_irq(struct amdgpu_device *adev,
 				  struct amdgpu_iv_entry *entry)
 {
 	DRM_ERROR("Illegal instruction in command stream\n");
-	schedule_work(&adev->reset_work);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index 3c2b15a..b0e127d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -5020,7 +5020,6 @@ static int gfx_v7_0_priv_reg_irq(struct amdgpu_device *adev,
 				 struct amdgpu_iv_entry *entry)
 {
 	DRM_ERROR("Illegal register access in command stream\n");
-	schedule_work(&adev->reset_work);
 	return 0;
 }
 
@@ -5029,8 +5028,6 @@ static int gfx_v7_0_priv_inst_irq(struct amdgpu_device *adev,
 				  struct amdgpu_iv_entry *entry)
 {
 	DRM_ERROR("Illegal instruction in command stream\n");
-	// XXX soft reset the gfx block only
-	schedule_work(&adev->reset_work);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index a74515a..5fd4996 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6795,7 +6795,6 @@ static int gfx_v8_0_priv_reg_irq(struct amdgpu_device *adev,
 				 struct amdgpu_iv_entry *entry)
 {
 	DRM_ERROR("Illegal register access in command stream\n");
-	schedule_work(&adev->reset_work);
 	return 0;
 }
 
@@ -6804,7 +6803,6 @@ static int gfx_v8_0_priv_inst_irq(struct amdgpu_device *adev,
 				  struct amdgpu_iv_entry *entry)
 {
 	DRM_ERROR("Illegal instruction in command stream\n");
-	schedule_work(&adev->reset_work);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9855dc0..dce1960 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4136,7 +4136,6 @@ static int gfx_v9_0_priv_reg_irq(struct amdgpu_device *adev,
 				 struct amdgpu_iv_entry *entry)
 {
 	DRM_ERROR("Illegal register access in command stream\n");
-	schedule_work(&adev->reset_work);
 	return 0;
 }
 
@@ -4145,7 +4144,6 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev,
 				  struct amdgpu_iv_entry *entry)
 {
 	DRM_ERROR("Illegal instruction in command stream\n");
-	schedule_work(&adev->reset_work);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
index 92f8c44..7e2580c 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
@@ -1161,7 +1161,6 @@ static int sdma_v2_4_process_illegal_inst_irq(struct amdgpu_device *adev,
 					      struct amdgpu_iv_entry *entry)
 {
 	DRM_ERROR("Illegal instruction in SDMA command stream\n");
-	schedule_work(&adev->reset_work);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 52e6bf2..c12f994 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -1480,7 +1480,6 @@ static int sdma_v3_0_process_illegal_inst_irq(struct amdgpu_device *adev,
 					      struct amdgpu_iv_entry *entry)
 {
 	DRM_ERROR("Illegal instruction in SDMA command stream\n");
-	schedule_work(&adev->reset_work);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index fe78c00..09b7586 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1416,7 +1416,6 @@ static int sdma_v4_0_process_illegal_inst_irq(struct amdgpu_device *adev,
 					      struct amdgpu_iv_entry *entry)
 {
 	DRM_ERROR("Illegal instruction in SDMA command stream\n");
-	schedule_work(&adev->reset_work);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/si_dma.c b/drivers/gpu/drm/amd/amdgpu/si_dma.c
index ee469a9..408e145 100644
--- a/drivers/gpu/drm/amd/amdgpu/si_dma.c
+++ b/drivers/gpu/drm/amd/amdgpu/si_dma.c
@@ -683,7 +683,6 @@ static int si_dma_process_illegal_inst_irq(struct amdgpu_device *adev,
 					      struct amdgpu_iv_entry *entry)
 {
 	DRM_ERROR("Illegal instruction in SDMA command stream\n");
-	schedule_work(&adev->reset_work);
 	return 0;
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu:cleanup deprecated gpu reset logic
       [not found]     ` <1509450922-15476-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-31 15:01       ` Christian König
       [not found]         ` <9649157d-35ec-cec5-ac7c-54ef9335f56f-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Christian König @ 2017-10-31 15:01 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 31.10.2017 um 12:55 schrieb Monk Liu:
> trigger gpu reset/recovery from illegle instruction IRQ
> is deprecated long time ago, we already switch to recover
> gpu by TDR triggering.
>
> now please set lockup_timeout to non-zero value
> in driver loading to enable TDR.
The patch is ok, but NAK to the general approach. We should have illegal 
instruction/access alongside the timeout.

But instead of trying to trigger the reset directly inform the scheduler 
that the we detected a problem on the engine. The scheduler can then 
cancel the timeout and do the appropriate things.

This patch would be ok if you install this new functionality directly 
after removing the old (and broken) one.

Regards,
Christian.

>
> Change-Id: I45a576a97fd9859e1098e785ce857c2cf5adfba5
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 21 ---------------------
>   drivers/gpu/drm/amd/amdgpu/cik_sdma.c   |  1 -
>   drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c   |  2 --
>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   |  3 ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   |  2 --
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  2 --
>   drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c  |  1 -
>   drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c  |  1 -
>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c  |  1 -
>   drivers/gpu/drm/amd/amdgpu/si_dma.c     |  1 -
>   11 files changed, 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 6e89be5..b3453e3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1459,7 +1459,6 @@ struct amdgpu_device {
>   	bool				shutdown;
>   	bool				need_dma32;
>   	bool				accel_working;
> -	struct work_struct		reset_work;
>   	struct notifier_block		acpi_nb;
>   	struct amdgpu_i2c_chan		*i2c_bus[AMDGPU_MAX_I2C_BUS];
>   	struct amdgpu_debugfs		debugfs[AMDGPU_DEBUGFS_MAX_COMPONENTS];
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index c340774..989b530 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -73,23 +73,6 @@ static void amdgpu_hotplug_work_func(struct work_struct *work)
>   	drm_helper_hpd_irq_event(dev);
>   }
>   
> -/**
> - * amdgpu_irq_reset_work_func - execute gpu reset
> - *
> - * @work: work struct
> - *
> - * Execute scheduled gpu reset (cayman+).
> - * This function is called when the irq handler
> - * thinks we need a gpu reset.
> - */
> -static void amdgpu_irq_reset_work_func(struct work_struct *work)
> -{
> -	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> -						  reset_work);
> -
> -	if (!amdgpu_sriov_vf(adev))
> -		amdgpu_gpu_recover(adev, NULL);
> -}
>   
>   /* Disable *all* interrupts */
>   static void amdgpu_irq_disable_all(struct amdgpu_device *adev)
> @@ -251,14 +234,11 @@ int amdgpu_irq_init(struct amdgpu_device *adev)
>   				amdgpu_hotplug_work_func);
>   	}
>   
> -	INIT_WORK(&adev->reset_work, amdgpu_irq_reset_work_func);
> -
>   	adev->irq.installed = true;
>   	r = drm_irq_install(adev->ddev, adev->ddev->pdev->irq);
>   	if (r) {
>   		adev->irq.installed = false;
>   		flush_work(&adev->hotplug_work);
> -		cancel_work_sync(&adev->reset_work);
>   		return r;
>   	}
>   
> @@ -283,7 +263,6 @@ void amdgpu_irq_fini(struct amdgpu_device *adev)
>   		if (adev->irq.msi_enabled)
>   			pci_disable_msi(adev->pdev);
>   		flush_work(&adev->hotplug_work);
> -		cancel_work_sync(&adev->reset_work);
>   	}
>   
>   	for (i = 0; i < AMDGPU_IH_CLIENTID_MAX; ++i) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> index ed26dcb..c8d9bc1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> @@ -1227,7 +1227,6 @@ static int cik_sdma_process_illegal_inst_irq(struct amdgpu_device *adev,
>   					     struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in SDMA command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
> index 9430d48..25b32b7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
> @@ -3443,7 +3443,6 @@ static int gfx_v6_0_priv_reg_irq(struct amdgpu_device *adev,
>   				 struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal register access in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> @@ -3452,7 +3451,6 @@ static int gfx_v6_0_priv_inst_irq(struct amdgpu_device *adev,
>   				  struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> index 3c2b15a..b0e127d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> @@ -5020,7 +5020,6 @@ static int gfx_v7_0_priv_reg_irq(struct amdgpu_device *adev,
>   				 struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal register access in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> @@ -5029,8 +5028,6 @@ static int gfx_v7_0_priv_inst_irq(struct amdgpu_device *adev,
>   				  struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in command stream\n");
> -	// XXX soft reset the gfx block only
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index a74515a..5fd4996 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6795,7 +6795,6 @@ static int gfx_v8_0_priv_reg_irq(struct amdgpu_device *adev,
>   				 struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal register access in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> @@ -6804,7 +6803,6 @@ static int gfx_v8_0_priv_inst_irq(struct amdgpu_device *adev,
>   				  struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9855dc0..dce1960 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4136,7 +4136,6 @@ static int gfx_v9_0_priv_reg_irq(struct amdgpu_device *adev,
>   				 struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal register access in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> @@ -4145,7 +4144,6 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev,
>   				  struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> index 92f8c44..7e2580c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> @@ -1161,7 +1161,6 @@ static int sdma_v2_4_process_illegal_inst_irq(struct amdgpu_device *adev,
>   					      struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in SDMA command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> index 52e6bf2..c12f994 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> @@ -1480,7 +1480,6 @@ static int sdma_v3_0_process_illegal_inst_irq(struct amdgpu_device *adev,
>   					      struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in SDMA command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index fe78c00..09b7586 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -1416,7 +1416,6 @@ static int sdma_v4_0_process_illegal_inst_irq(struct amdgpu_device *adev,
>   					      struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in SDMA command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/si_dma.c b/drivers/gpu/drm/amd/amdgpu/si_dma.c
> index ee469a9..408e145 100644
> --- a/drivers/gpu/drm/amd/amdgpu/si_dma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/si_dma.c
> @@ -683,7 +683,6 @@ static int si_dma_process_illegal_inst_irq(struct amdgpu_device *adev,
>   					      struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in SDMA command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH 1/2] drm/amdgpu:add fw-vram-usage for atomfirmware
       [not found] ` <1509450922-15476-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-10-31 11:55   ` [PATCH 2/2] drm/amdgpu:cleanup deprecated gpu reset logic Monk Liu
@ 2017-10-31 15:01   ` Deucher, Alexander
       [not found]     ` <BN6PR12MB1652E5737C680A79CDBA9D9AF75E0-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  1 sibling, 1 reply; 8+ messages in thread
From: Deucher, Alexander @ 2017-10-31 15:01 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, October 31, 2017 7:55 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 1/2] drm/amdgpu:add fw-vram-usage for atomfirmware
> 
> otherwise PF & VF exchange is broken
> 
> Change-Id: Icbb44e640ba1c6e61914cbd234e92de001496195
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 26
> ++++++++++++++++++++----
>  1 file changed, 22 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> index f9ffe8e..455ad63 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> @@ -71,19 +71,37 @@ int
> amdgpu_atomfirmware_allocate_fb_scratch(struct amdgpu_device *adev)
>  	struct atom_context *ctx = adev->mode_info.atom_context;
>  	int index =
> get_index_into_master_table(atom_master_list_of_data_tables_v2_1,
>  						vram_usagebyfirmware);
> +	struct vram_usagebyfirmware_v2_1 *	firmware_usage;
> +	uint32_t start_addr, size;
>  	uint16_t data_offset;
>  	int usage_bytes = 0;
> 
>  	if (amdgpu_atom_parse_data_header(ctx, index, NULL, NULL, NULL,
> &data_offset)) {
> -		struct vram_usagebyfirmware_v2_1 *firmware_usage =
> -			(struct vram_usagebyfirmware_v2_1 *)(ctx->bios +
> data_offset);
> +		printk("data_offset=%x\n", data_offset);

Drop this debugging leftover.

> +		firmware_usage = (struct vram_usagebyfirmware_v2_1
> *)(ctx->bios + data_offset);
> 
> -		DRM_DEBUG("atom firmware requested %08x %dkb fw
> %dkb drv\n",
> +		DRM_INFO("atom firmware requested %08x %dkb fw %dkb

Any reason to make this non-debug?

> drv\n",
>  			  le32_to_cpu(firmware_usage-
> >start_address_in_kb),
>  			  le16_to_cpu(firmware_usage-
> >used_by_firmware_in_kb),
>  			  le16_to_cpu(firmware_usage-
> >used_by_driver_in_kb));
> 
> -		usage_bytes = le16_to_cpu(firmware_usage-
> >used_by_driver_in_kb) * 1024;
> +		start_addr = le32_to_cpu(firmware_usage-
> >start_address_in_kb);
> +		size = le16_to_cpu(firmware_usage-
> >used_by_firmware_in_kb);
> +
> +		printk("start_addr = %x, size=%x\n", start_addr, size);

More debug leftovers.

With the above comments addressed, the patch is:
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> +
> +		if ((uint32_t)(start_addr &
> ATOM_VRAM_OPERATION_FLAGS_MASK) ==
> +
> 	(uint32_t)(ATOM_VRAM_BLOCK_SRIOV_MSG_SHARE_RESERVATIO
> N <<
> +			ATOM_VRAM_OPERATION_FLAGS_SHIFT)) {
> +			/* Firmware request VRAM reservation for SR-IOV */
> +			adev->fw_vram_usage.start_offset = (start_addr &
> +
> 	(~ATOM_VRAM_OPERATION_FLAGS_MASK)) << 10;
> +			adev->fw_vram_usage.size = size << 10;
> +			/* Use the default scratch size */
> +			usage_bytes = 0;
> +		} else {
> +			usage_bytes = le16_to_cpu(firmware_usage-
> >used_by_driver_in_kb) << 10;
> +		}
>  	}
>  	ctx->scratch_size_bytes = 0;
>  	if (usage_bytes == 0)
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu:cleanup deprecated gpu reset logic
       [not found]         ` <9649157d-35ec-cec5-ac7c-54ef9335f56f-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-11-01  3:29           ` Liu, Monk
       [not found]             ` <BLUPR12MB0449E2AD3180D0A2DF089A60845F0-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Liu, Monk @ 2017-11-01  3:29 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

The thing is triggering gpu_recover() in irq routine give you NULL for the @bad/job parameter, so gpu_recover() actually did nothing meaningful, it just repeat scheduling un-signaled jobs again and again, and finally your GPU is stuck with infinite recovering ....

here is my initial thought:
In " gfx_v9_0_priv_reg_irq(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry)" routine:
For lockup_timeout==0 case: we put a new parameter "@sched" to amd_gpu_recover(), that way although we don't have @bad/job, but with @sched we can still find out the pending job on his scheduler,
But that make code ugly, cuz we need to change amdgpu_gpu_recover(adev, job) to amdgpu_gpu_recover(adev, job, sched). Anyway need to find a way to tell gpu_recover() at least which ring has problem.

For lockup_timeout!=0 case: did nothing and just return since TDR will correct this error and do recover() gracefully.



-----Original Message-----
From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com] 
Sent: 2017年10月31日 23:01
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/2] drm/amdgpu:cleanup deprecated gpu reset logic

Am 31.10.2017 um 12:55 schrieb Monk Liu:
> trigger gpu reset/recovery from illegle instruction IRQ is deprecated 
> long time ago, we already switch to recover gpu by TDR triggering.
>
> now please set lockup_timeout to non-zero value in driver loading to 
> enable TDR.
The patch is ok, but NAK to the general approach. We should have illegal instruction/access alongside the timeout.

But instead of trying to trigger the reset directly inform the scheduler that the we detected a problem on the engine. The scheduler can then cancel the timeout and do the appropriate things.

This patch would be ok if you install this new functionality directly after removing the old (and broken) one.

Regards,
Christian.

>
> Change-Id: I45a576a97fd9859e1098e785ce857c2cf5adfba5
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 21 ---------------------
>   drivers/gpu/drm/amd/amdgpu/cik_sdma.c   |  1 -
>   drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c   |  2 --
>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   |  3 ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   |  2 --
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  2 --
>   drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c  |  1 -
>   drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c  |  1 -
>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c  |  1 -
>   drivers/gpu/drm/amd/amdgpu/si_dma.c     |  1 -
>   11 files changed, 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 6e89be5..b3453e3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1459,7 +1459,6 @@ struct amdgpu_device {
>   	bool				shutdown;
>   	bool				need_dma32;
>   	bool				accel_working;
> -	struct work_struct		reset_work;
>   	struct notifier_block		acpi_nb;
>   	struct amdgpu_i2c_chan		*i2c_bus[AMDGPU_MAX_I2C_BUS];
>   	struct amdgpu_debugfs		debugfs[AMDGPU_DEBUGFS_MAX_COMPONENTS];
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index c340774..989b530 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -73,23 +73,6 @@ static void amdgpu_hotplug_work_func(struct work_struct *work)
>   	drm_helper_hpd_irq_event(dev);
>   }
>   
> -/**
> - * amdgpu_irq_reset_work_func - execute gpu reset
> - *
> - * @work: work struct
> - *
> - * Execute scheduled gpu reset (cayman+).
> - * This function is called when the irq handler
> - * thinks we need a gpu reset.
> - */
> -static void amdgpu_irq_reset_work_func(struct work_struct *work) -{
> -	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> -						  reset_work);
> -
> -	if (!amdgpu_sriov_vf(adev))
> -		amdgpu_gpu_recover(adev, NULL);
> -}
>   
>   /* Disable *all* interrupts */
>   static void amdgpu_irq_disable_all(struct amdgpu_device *adev) @@ 
> -251,14 +234,11 @@ int amdgpu_irq_init(struct amdgpu_device *adev)
>   				amdgpu_hotplug_work_func);
>   	}
>   
> -	INIT_WORK(&adev->reset_work, amdgpu_irq_reset_work_func);
> -
>   	adev->irq.installed = true;
>   	r = drm_irq_install(adev->ddev, adev->ddev->pdev->irq);
>   	if (r) {
>   		adev->irq.installed = false;
>   		flush_work(&adev->hotplug_work);
> -		cancel_work_sync(&adev->reset_work);
>   		return r;
>   	}
>   
> @@ -283,7 +263,6 @@ void amdgpu_irq_fini(struct amdgpu_device *adev)
>   		if (adev->irq.msi_enabled)
>   			pci_disable_msi(adev->pdev);
>   		flush_work(&adev->hotplug_work);
> -		cancel_work_sync(&adev->reset_work);
>   	}
>   
>   	for (i = 0; i < AMDGPU_IH_CLIENTID_MAX; ++i) { diff --git 
> a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c 
> b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> index ed26dcb..c8d9bc1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
> @@ -1227,7 +1227,6 @@ static int cik_sdma_process_illegal_inst_irq(struct amdgpu_device *adev,
>   					     struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in SDMA command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
> index 9430d48..25b32b7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
> @@ -3443,7 +3443,6 @@ static int gfx_v6_0_priv_reg_irq(struct amdgpu_device *adev,
>   				 struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal register access in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> @@ -3452,7 +3451,6 @@ static int gfx_v6_0_priv_inst_irq(struct amdgpu_device *adev,
>   				  struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> index 3c2b15a..b0e127d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> @@ -5020,7 +5020,6 @@ static int gfx_v7_0_priv_reg_irq(struct amdgpu_device *adev,
>   				 struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal register access in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> @@ -5029,8 +5028,6 @@ static int gfx_v7_0_priv_inst_irq(struct amdgpu_device *adev,
>   				  struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in command stream\n");
> -	// XXX soft reset the gfx block only
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index a74515a..5fd4996 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6795,7 +6795,6 @@ static int gfx_v8_0_priv_reg_irq(struct amdgpu_device *adev,
>   				 struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal register access in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> @@ -6804,7 +6803,6 @@ static int gfx_v8_0_priv_inst_irq(struct amdgpu_device *adev,
>   				  struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9855dc0..dce1960 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4136,7 +4136,6 @@ static int gfx_v9_0_priv_reg_irq(struct amdgpu_device *adev,
>   				 struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal register access in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> @@ -4145,7 +4144,6 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev,
>   				  struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> index 92f8c44..7e2580c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
> @@ -1161,7 +1161,6 @@ static int sdma_v2_4_process_illegal_inst_irq(struct amdgpu_device *adev,
>   					      struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in SDMA command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> index 52e6bf2..c12f994 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> @@ -1480,7 +1480,6 @@ static int sdma_v3_0_process_illegal_inst_irq(struct amdgpu_device *adev,
>   					      struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in SDMA command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index fe78c00..09b7586 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -1416,7 +1416,6 @@ static int sdma_v4_0_process_illegal_inst_irq(struct amdgpu_device *adev,
>   					      struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in SDMA command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/si_dma.c 
> b/drivers/gpu/drm/amd/amdgpu/si_dma.c
> index ee469a9..408e145 100644
> --- a/drivers/gpu/drm/amd/amdgpu/si_dma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/si_dma.c
> @@ -683,7 +683,6 @@ static int si_dma_process_illegal_inst_irq(struct amdgpu_device *adev,
>   					      struct amdgpu_iv_entry *entry)
>   {
>   	DRM_ERROR("Illegal instruction in SDMA command stream\n");
> -	schedule_work(&adev->reset_work);
>   	return 0;
>   }
>   


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH 1/2] drm/amdgpu:add fw-vram-usage for atomfirmware
       [not found]     ` <BN6PR12MB1652E5737C680A79CDBA9D9AF75E0-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-11-01  3:40       ` Liu, Monk
  0 siblings, 0 replies; 8+ messages in thread
From: Liu, Monk @ 2017-11-01  3:40 UTC (permalink / raw)
  To: Deucher, Alexander, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Yeah, I'll cleanup them before pushing 

-----Original Message-----
From: Deucher, Alexander 
Sent: 2017年10月31日 23:01
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Liu, Monk <Monk.Liu@amd.com>
Subject: RE: [PATCH 1/2] drm/amdgpu:add fw-vram-usage for atomfirmware

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf 
> Of Monk Liu
> Sent: Tuesday, October 31, 2017 7:55 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 1/2] drm/amdgpu:add fw-vram-usage for atomfirmware
> 
> otherwise PF & VF exchange is broken
> 
> Change-Id: Icbb44e640ba1c6e61914cbd234e92de001496195
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 26
> ++++++++++++++++++++----
>  1 file changed, 22 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> index f9ffe8e..455ad63 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> @@ -71,19 +71,37 @@ int
> amdgpu_atomfirmware_allocate_fb_scratch(struct amdgpu_device *adev)
>  	struct atom_context *ctx = adev->mode_info.atom_context;
>  	int index =
> get_index_into_master_table(atom_master_list_of_data_tables_v2_1,
>  						vram_usagebyfirmware);
> +	struct vram_usagebyfirmware_v2_1 *	firmware_usage;
> +	uint32_t start_addr, size;
>  	uint16_t data_offset;
>  	int usage_bytes = 0;
> 
>  	if (amdgpu_atom_parse_data_header(ctx, index, NULL, NULL, NULL,
> &data_offset)) {
> -		struct vram_usagebyfirmware_v2_1 *firmware_usage =
> -			(struct vram_usagebyfirmware_v2_1 *)(ctx->bios +
> data_offset);
> +		printk("data_offset=%x\n", data_offset);

Drop this debugging leftover.

> +		firmware_usage = (struct vram_usagebyfirmware_v2_1
> *)(ctx->bios + data_offset);
> 
> -		DRM_DEBUG("atom firmware requested %08x %dkb fw
> %dkb drv\n",
> +		DRM_INFO("atom firmware requested %08x %dkb fw %dkb

Any reason to make this non-debug?

> drv\n",
>  			  le32_to_cpu(firmware_usage-
> >start_address_in_kb),
>  			  le16_to_cpu(firmware_usage-
> >used_by_firmware_in_kb),
>  			  le16_to_cpu(firmware_usage-
> >used_by_driver_in_kb));
> 
> -		usage_bytes = le16_to_cpu(firmware_usage-
> >used_by_driver_in_kb) * 1024;
> +		start_addr = le32_to_cpu(firmware_usage-
> >start_address_in_kb);
> +		size = le16_to_cpu(firmware_usage-
> >used_by_firmware_in_kb);
> +
> +		printk("start_addr = %x, size=%x\n", start_addr, size);

More debug leftovers.

With the above comments addressed, the patch is:
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> +
> +		if ((uint32_t)(start_addr &
> ATOM_VRAM_OPERATION_FLAGS_MASK) ==
> +
> 	(uint32_t)(ATOM_VRAM_BLOCK_SRIOV_MSG_SHARE_RESERVATIO
> N <<
> +			ATOM_VRAM_OPERATION_FLAGS_SHIFT)) {
> +			/* Firmware request VRAM reservation for SR-IOV */
> +			adev->fw_vram_usage.start_offset = (start_addr &
> +
> 	(~ATOM_VRAM_OPERATION_FLAGS_MASK)) << 10;
> +			adev->fw_vram_usage.size = size << 10;
> +			/* Use the default scratch size */
> +			usage_bytes = 0;
> +		} else {
> +			usage_bytes = le16_to_cpu(firmware_usage-
> >used_by_driver_in_kb) << 10;
> +		}
>  	}
>  	ctx->scratch_size_bytes = 0;
>  	if (usage_bytes == 0)
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu:cleanup deprecated gpu reset logic
       [not found]             ` <BLUPR12MB0449E2AD3180D0A2DF089A60845F0-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-11-01  8:41               ` Christian König
       [not found]                 ` <cda71e3e-c1fe-945b-72de-018a50d1c925-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Christian König @ 2017-11-01  8:41 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 01.11.2017 um 04:29 schrieb Liu, Monk:
> The thing is triggering gpu_recover() in irq routine give you NULL for the @bad/job parameter, so gpu_recover() actually did nothing meaningful, it just repeat scheduling un-signaled jobs again and again, and finally your GPU is stuck with infinite recovering ....

Yeah, completely agree that this isn't a good implementation. But we 
need to somehow do better than that.

> here is my initial thought:
> In " gfx_v9_0_priv_reg_irq(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry)" routine:
> For lockup_timeout==0 case: we put a new parameter "@sched" to amd_gpu_recover(), that way although we don't have @bad/job, but with @sched we can still find out the pending job on his scheduler,
> But that make code ugly, cuz we need to change amdgpu_gpu_recover(adev, job) to amdgpu_gpu_recover(adev, job, sched). Anyway need to find a way to tell gpu_recover() at least which ring has problem.
>
> For lockup_timeout!=0 case: did nothing and just return since TDR will correct this error and do recover() gracefully.

I think waiting for the TDR will take to long. How about this instead:

1. In gfx_v9_0_priv_reg_irq() (and all the other IRQ handlers as well) 
we add functionality to figure out the ring/scheduler which caused the 
illegal operation.

2. Then we add a new function amd_sched_job_fault() which gets the 
scheduler which had a fault as parameter.

3. amd_sched_job_fault() then grabs job_list_lock and takes the first 
job from the ring_mirror_list.

4. We call cancel_delayed_work() to cancel the timeout of the job.

5. If cancel_delayed_work() returns true (which means the TDR isn't 
already running anyway) we call schedule_delayed_work() with a zero timeout.

This way the actual timeout value is truncated to zero and we run the 
recovery immediately just in the same way as it would when we have waited.

Should be trivial to implement actually. What do you think?

Regards,
Christian.

>
>
>
> -----Original Message-----
> From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com]
> Sent: 2017年10月31日 23:01
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 2/2] drm/amdgpu:cleanup deprecated gpu reset logic
>
> Am 31.10.2017 um 12:55 schrieb Monk Liu:
>> trigger gpu reset/recovery from illegle instruction IRQ is deprecated
>> long time ago, we already switch to recover gpu by TDR triggering.
>>
>> now please set lockup_timeout to non-zero value in driver loading to
>> enable TDR.
> The patch is ok, but NAK to the general approach. We should have illegal instruction/access alongside the timeout.
>
> But instead of trying to trigger the reset directly inform the scheduler that the we detected a problem on the engine. The scheduler can then cancel the timeout and do the appropriate things.
>
> This patch would be ok if you install this new functionality directly after removing the old (and broken) one.
>
> Regards,
> Christian.
>
>> Change-Id: I45a576a97fd9859e1098e785ce857c2cf5adfba5
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 -
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 21 ---------------------
>>    drivers/gpu/drm/amd/amdgpu/cik_sdma.c   |  1 -
>>    drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c   |  2 --
>>    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   |  3 ---
>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   |  2 --
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  2 --
>>    drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c  |  1 -
>>    drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c  |  1 -
>>    drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c  |  1 -
>>    drivers/gpu/drm/amd/amdgpu/si_dma.c     |  1 -
>>    11 files changed, 36 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 6e89be5..b3453e3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1459,7 +1459,6 @@ struct amdgpu_device {
>>    	bool				shutdown;
>>    	bool				need_dma32;
>>    	bool				accel_working;
>> -	struct work_struct		reset_work;
>>    	struct notifier_block		acpi_nb;
>>    	struct amdgpu_i2c_chan		*i2c_bus[AMDGPU_MAX_I2C_BUS];
>>    	struct amdgpu_debugfs		debugfs[AMDGPU_DEBUGFS_MAX_COMPONENTS];
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> index c340774..989b530 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> @@ -73,23 +73,6 @@ static void amdgpu_hotplug_work_func(struct work_struct *work)
>>    	drm_helper_hpd_irq_event(dev);
>>    }
>>    
>> -/**
>> - * amdgpu_irq_reset_work_func - execute gpu reset
>> - *
>> - * @work: work struct
>> - *
>> - * Execute scheduled gpu reset (cayman+).
>> - * This function is called when the irq handler
>> - * thinks we need a gpu reset.
>> - */
>> -static void amdgpu_irq_reset_work_func(struct work_struct *work) -{
>> -	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
>> -						  reset_work);
>> -
>> -	if (!amdgpu_sriov_vf(adev))
>> -		amdgpu_gpu_recover(adev, NULL);
>> -}
>>    
>>    /* Disable *all* interrupts */
>>    static void amdgpu_irq_disable_all(struct amdgpu_device *adev) @@
>> -251,14 +234,11 @@ int amdgpu_irq_init(struct amdgpu_device *adev)
>>    				amdgpu_hotplug_work_func);
>>    	}
>>    
>> -	INIT_WORK(&adev->reset_work, amdgpu_irq_reset_work_func);
>> -
>>    	adev->irq.installed = true;
>>    	r = drm_irq_install(adev->ddev, adev->ddev->pdev->irq);
>>    	if (r) {
>>    		adev->irq.installed = false;
>>    		flush_work(&adev->hotplug_work);
>> -		cancel_work_sync(&adev->reset_work);
>>    		return r;
>>    	}
>>    
>> @@ -283,7 +263,6 @@ void amdgpu_irq_fini(struct amdgpu_device *adev)
>>    		if (adev->irq.msi_enabled)
>>    			pci_disable_msi(adev->pdev);
>>    		flush_work(&adev->hotplug_work);
>> -		cancel_work_sync(&adev->reset_work);
>>    	}
>>    
>>    	for (i = 0; i < AMDGPU_IH_CLIENTID_MAX; ++i) { diff --git
>> a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> index ed26dcb..c8d9bc1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> @@ -1227,7 +1227,6 @@ static int cik_sdma_process_illegal_inst_irq(struct amdgpu_device *adev,
>>    					     struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in SDMA command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
>> index 9430d48..25b32b7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
>> @@ -3443,7 +3443,6 @@ static int gfx_v6_0_priv_reg_irq(struct amdgpu_device *adev,
>>    				 struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal register access in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> @@ -3452,7 +3451,6 @@ static int gfx_v6_0_priv_inst_irq(struct amdgpu_device *adev,
>>    				  struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> index 3c2b15a..b0e127d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> @@ -5020,7 +5020,6 @@ static int gfx_v7_0_priv_reg_irq(struct amdgpu_device *adev,
>>    				 struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal register access in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> @@ -5029,8 +5028,6 @@ static int gfx_v7_0_priv_inst_irq(struct amdgpu_device *adev,
>>    				  struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in command stream\n");
>> -	// XXX soft reset the gfx block only
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index a74515a..5fd4996 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -6795,7 +6795,6 @@ static int gfx_v8_0_priv_reg_irq(struct amdgpu_device *adev,
>>    				 struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal register access in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> @@ -6804,7 +6803,6 @@ static int gfx_v8_0_priv_inst_irq(struct amdgpu_device *adev,
>>    				  struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 9855dc0..dce1960 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4136,7 +4136,6 @@ static int gfx_v9_0_priv_reg_irq(struct amdgpu_device *adev,
>>    				 struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal register access in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> @@ -4145,7 +4144,6 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev,
>>    				  struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> index 92f8c44..7e2580c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> @@ -1161,7 +1161,6 @@ static int sdma_v2_4_process_illegal_inst_irq(struct amdgpu_device *adev,
>>    					      struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in SDMA command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> index 52e6bf2..c12f994 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> @@ -1480,7 +1480,6 @@ static int sdma_v3_0_process_illegal_inst_irq(struct amdgpu_device *adev,
>>    					      struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in SDMA command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> index fe78c00..09b7586 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> @@ -1416,7 +1416,6 @@ static int sdma_v4_0_process_illegal_inst_irq(struct amdgpu_device *adev,
>>    					      struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in SDMA command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_dma.c
>> b/drivers/gpu/drm/amd/amdgpu/si_dma.c
>> index ee469a9..408e145 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/si_dma.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/si_dma.c
>> @@ -683,7 +683,6 @@ static int si_dma_process_illegal_inst_irq(struct amdgpu_device *adev,
>>    					      struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in SDMA command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu:cleanup deprecated gpu reset logic
       [not found]                 ` <cda71e3e-c1fe-945b-72de-018a50d1c925-5C7GfCeVMHo@public.gmane.org>
@ 2017-11-01  9:52                   ` Liu, Monk
  0 siblings, 0 replies; 8+ messages in thread
From: Liu, Monk @ 2017-11-01  9:52 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Sounds feasible, only this step looks need some investigation on hardware side:

>in gfx_v9_0_priv_reg_irq() (and all the other IRQ handlers as well) we add functionality to figure out the ring/scheduler which caused the illegal operation.

e.g. :

gfx_v9_priv_reg_irq(struct amdgpu_device *adev, strut amdgpu_irq_src *source, struct amdgpu_iv_entry *entry){
  //in this function, need to figure out which ring cause this error from source/entry parameter
}



-----Original Message-----
From: Koenig, Christian 
Sent: 2017年11月1日 16:42
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/2] drm/amdgpu:cleanup deprecated gpu reset logic

Am 01.11.2017 um 04:29 schrieb Liu, Monk:
> The thing is triggering gpu_recover() in irq routine give you NULL for the @bad/job parameter, so gpu_recover() actually did nothing meaningful, it just repeat scheduling un-signaled jobs again and again, and finally your GPU is stuck with infinite recovering ....

Yeah, completely agree that this isn't a good implementation. But we need to somehow do better than that.

> here is my initial thought:
> In " gfx_v9_0_priv_reg_irq(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry)" routine:
> For lockup_timeout==0 case: we put a new parameter "@sched" to 
> amd_gpu_recover(), that way although we don't have @bad/job, but with @sched we can still find out the pending job on his scheduler, But that make code ugly, cuz we need to change amdgpu_gpu_recover(adev, job) to amdgpu_gpu_recover(adev, job, sched). Anyway need to find a way to tell gpu_recover() at least which ring has problem.
>
> For lockup_timeout!=0 case: did nothing and just return since TDR will correct this error and do recover() gracefully.

I think waiting for the TDR will take to long. How about this instead:

1. In gfx_v9_0_priv_reg_irq() (and all the other IRQ handlers as well) we add functionality to figure out the ring/scheduler which caused the illegal operation.

2. Then we add a new function amd_sched_job_fault() which gets the scheduler which had a fault as parameter.

3. amd_sched_job_fault() then grabs job_list_lock and takes the first job from the ring_mirror_list.

4. We call cancel_delayed_work() to cancel the timeout of the job.

5. If cancel_delayed_work() returns true (which means the TDR isn't already running anyway) we call schedule_delayed_work() with a zero timeout.

This way the actual timeout value is truncated to zero and we run the recovery immediately just in the same way as it would when we have waited.

Should be trivial to implement actually. What do you think?

Regards,
Christian.

>
>
>
> -----Original Message-----
> From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com]
> Sent: 2017年10月31日 23:01
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 2/2] drm/amdgpu:cleanup deprecated gpu reset logic
>
> Am 31.10.2017 um 12:55 schrieb Monk Liu:
>> trigger gpu reset/recovery from illegle instruction IRQ is deprecated 
>> long time ago, we already switch to recover gpu by TDR triggering.
>>
>> now please set lockup_timeout to non-zero value in driver loading to 
>> enable TDR.
> The patch is ok, but NAK to the general approach. We should have illegal instruction/access alongside the timeout.
>
> But instead of trying to trigger the reset directly inform the scheduler that the we detected a problem on the engine. The scheduler can then cancel the timeout and do the appropriate things.
>
> This patch would be ok if you install this new functionality directly after removing the old (and broken) one.
>
> Regards,
> Christian.
>
>> Change-Id: I45a576a97fd9859e1098e785ce857c2cf5adfba5
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 -
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 21 ---------------------
>>    drivers/gpu/drm/amd/amdgpu/cik_sdma.c   |  1 -
>>    drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c   |  2 --
>>    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   |  3 ---
>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   |  2 --
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  2 --
>>    drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c  |  1 -
>>    drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c  |  1 -
>>    drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c  |  1 -
>>    drivers/gpu/drm/amd/amdgpu/si_dma.c     |  1 -
>>    11 files changed, 36 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 6e89be5..b3453e3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1459,7 +1459,6 @@ struct amdgpu_device {
>>    	bool				shutdown;
>>    	bool				need_dma32;
>>    	bool				accel_working;
>> -	struct work_struct		reset_work;
>>    	struct notifier_block		acpi_nb;
>>    	struct amdgpu_i2c_chan		*i2c_bus[AMDGPU_MAX_I2C_BUS];
>>    	struct amdgpu_debugfs		debugfs[AMDGPU_DEBUGFS_MAX_COMPONENTS];
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> index c340774..989b530 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> @@ -73,23 +73,6 @@ static void amdgpu_hotplug_work_func(struct work_struct *work)
>>    	drm_helper_hpd_irq_event(dev);
>>    }
>>    
>> -/**
>> - * amdgpu_irq_reset_work_func - execute gpu reset
>> - *
>> - * @work: work struct
>> - *
>> - * Execute scheduled gpu reset (cayman+).
>> - * This function is called when the irq handler
>> - * thinks we need a gpu reset.
>> - */
>> -static void amdgpu_irq_reset_work_func(struct work_struct *work) -{
>> -	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
>> -						  reset_work);
>> -
>> -	if (!amdgpu_sriov_vf(adev))
>> -		amdgpu_gpu_recover(adev, NULL);
>> -}
>>    
>>    /* Disable *all* interrupts */
>>    static void amdgpu_irq_disable_all(struct amdgpu_device *adev) @@
>> -251,14 +234,11 @@ int amdgpu_irq_init(struct amdgpu_device *adev)
>>    				amdgpu_hotplug_work_func);
>>    	}
>>    
>> -	INIT_WORK(&adev->reset_work, amdgpu_irq_reset_work_func);
>> -
>>    	adev->irq.installed = true;
>>    	r = drm_irq_install(adev->ddev, adev->ddev->pdev->irq);
>>    	if (r) {
>>    		adev->irq.installed = false;
>>    		flush_work(&adev->hotplug_work);
>> -		cancel_work_sync(&adev->reset_work);
>>    		return r;
>>    	}
>>    
>> @@ -283,7 +263,6 @@ void amdgpu_irq_fini(struct amdgpu_device *adev)
>>    		if (adev->irq.msi_enabled)
>>    			pci_disable_msi(adev->pdev);
>>    		flush_work(&adev->hotplug_work);
>> -		cancel_work_sync(&adev->reset_work);
>>    	}
>>    
>>    	for (i = 0; i < AMDGPU_IH_CLIENTID_MAX; ++i) { diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> index ed26dcb..c8d9bc1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>> @@ -1227,7 +1227,6 @@ static int cik_sdma_process_illegal_inst_irq(struct amdgpu_device *adev,
>>    					     struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in SDMA command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
>> index 9430d48..25b32b7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
>> @@ -3443,7 +3443,6 @@ static int gfx_v6_0_priv_reg_irq(struct amdgpu_device *adev,
>>    				 struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal register access in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> @@ -3452,7 +3451,6 @@ static int gfx_v6_0_priv_inst_irq(struct amdgpu_device *adev,
>>    				  struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> index 3c2b15a..b0e127d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> @@ -5020,7 +5020,6 @@ static int gfx_v7_0_priv_reg_irq(struct amdgpu_device *adev,
>>    				 struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal register access in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> @@ -5029,8 +5028,6 @@ static int gfx_v7_0_priv_inst_irq(struct amdgpu_device *adev,
>>    				  struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in command stream\n");
>> -	// XXX soft reset the gfx block only
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index a74515a..5fd4996 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -6795,7 +6795,6 @@ static int gfx_v8_0_priv_reg_irq(struct amdgpu_device *adev,
>>    				 struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal register access in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> @@ -6804,7 +6803,6 @@ static int gfx_v8_0_priv_inst_irq(struct amdgpu_device *adev,
>>    				  struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 9855dc0..dce1960 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4136,7 +4136,6 @@ static int gfx_v9_0_priv_reg_irq(struct amdgpu_device *adev,
>>    				 struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal register access in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> @@ -4145,7 +4144,6 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev,
>>    				  struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> index 92f8c44..7e2580c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>> @@ -1161,7 +1161,6 @@ static int sdma_v2_4_process_illegal_inst_irq(struct amdgpu_device *adev,
>>    					      struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in SDMA command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> index 52e6bf2..c12f994 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>> @@ -1480,7 +1480,6 @@ static int sdma_v3_0_process_illegal_inst_irq(struct amdgpu_device *adev,
>>    					      struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in SDMA command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> index fe78c00..09b7586 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> @@ -1416,7 +1416,6 @@ static int sdma_v4_0_process_illegal_inst_irq(struct amdgpu_device *adev,
>>    					      struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in SDMA command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_dma.c
>> b/drivers/gpu/drm/amd/amdgpu/si_dma.c
>> index ee469a9..408e145 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/si_dma.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/si_dma.c
>> @@ -683,7 +683,6 @@ static int si_dma_process_illegal_inst_irq(struct amdgpu_device *adev,
>>    					      struct amdgpu_iv_entry *entry)
>>    {
>>    	DRM_ERROR("Illegal instruction in SDMA command stream\n");
>> -	schedule_work(&adev->reset_work);
>>    	return 0;
>>    }
>>    
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2017-11-01  9:52 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-10-31 11:55 [PATCH 1/2] drm/amdgpu:add fw-vram-usage for atomfirmware Monk Liu
     [not found] ` <1509450922-15476-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-31 11:55   ` [PATCH 2/2] drm/amdgpu:cleanup deprecated gpu reset logic Monk Liu
     [not found]     ` <1509450922-15476-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-31 15:01       ` Christian König
     [not found]         ` <9649157d-35ec-cec5-ac7c-54ef9335f56f-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-11-01  3:29           ` Liu, Monk
     [not found]             ` <BLUPR12MB0449E2AD3180D0A2DF089A60845F0-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-11-01  8:41               ` Christian König
     [not found]                 ` <cda71e3e-c1fe-945b-72de-018a50d1c925-5C7GfCeVMHo@public.gmane.org>
2017-11-01  9:52                   ` Liu, Monk
2017-10-31 15:01   ` [PATCH 1/2] drm/amdgpu:add fw-vram-usage for atomfirmware Deucher, Alexander
     [not found]     ` <BN6PR12MB1652E5737C680A79CDBA9D9AF75E0-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-11-01  3:40       ` Liu, Monk

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.