All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/20] drm/amdgpu:fix powerplay logic
@ 2017-02-07  6:10 Monk Liu
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

1. Like pp_hw_init, we shouldn't report an error if PP is disabled.
2. Disable pp_en if running under SRIOV.

Change-Id: I6d259f9609f223998bea236f64676b9c22133e4e
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c | 2 +-
 drivers/gpu/drm/amd/powerplay/amd_powerplay.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c
index 8856ecc..d56d200 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c
@@ -43,7 +43,7 @@ static int amdgpu_create_pp_handle(struct amdgpu_device *adev)
 	amd_pp = &(adev->powerplay);
 	pp_init.chip_family = adev->family;
 	pp_init.chip_id = adev->asic_type;
-	pp_init.pm_en = amdgpu_dpm != 0 ? true : false;
+	pp_init.pm_en = (amdgpu_dpm != 0 && !amdgpu_sriov_vf(adev)) ? true : false;
 	pp_init.feature_mask = amdgpu_pp_feature_mask;
 	pp_init.device = amdgpu_cgs_create_device(adev);
 	ret = amd_powerplay_create(&pp_init, &(amd_pp->pp_handle));
diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
index 429f18b..e9cf207 100644
--- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
@@ -286,7 +286,7 @@ static int pp_resume(void *handle)
 	}
 
 	if (ret1 == PP_DPM_DISABLED)
-		return ret1;
+		return 0;
 
 	eventmgr = pp_handle->eventmgr;
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 02/20] drm/amdgpu:cg & pg are not applied on VF
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 03/20] drm/damdgpu:add new mqd member in ring Monk Liu
                     ` (18 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Change-Id: I93a4e97f1d24a289ab20c2a62371f3e303322587
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 9 +++++++++
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 6 ++++++
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 6 ++++++
 drivers/gpu/drm/amd/amdgpu/vi.c        | 6 ++++++
 4 files changed, 27 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 0a75021..1e170ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -5833,6 +5833,9 @@ static int gfx_v8_0_set_powergating_state(void *handle,
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 	bool enable = (state == AMD_PG_STATE_GATE) ? true : false;
 
+	if (amdgpu_sriov_vf(adev))
+		return 0;
+
 	switch (adev->asic_type) {
 	case CHIP_CARRIZO:
 	case CHIP_STONEY:
@@ -5890,6 +5893,9 @@ static void gfx_v8_0_get_clockgating_state(void *handle, u32 *flags)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 	int data;
 
+	if (amdgpu_sriov_vf(adev))
+		*flags = 0;
+
 	/* AMD_CG_SUPPORT_GFX_MGCG */
 	data = RREG32(mmRLC_CGTT_MGCG_OVERRIDE);
 	if (!(data & RLC_CGTT_MGCG_OVERRIDE__CPF_MASK))
@@ -6403,6 +6409,9 @@ static int gfx_v8_0_set_clockgating_state(void *handle,
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	if (amdgpu_sriov_vf(adev))
+		return 0;
+
 	switch (adev->asic_type) {
 	case CHIP_FIJI:
 	case CHIP_CARRIZO:
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index 7669b32..22c52d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1427,6 +1427,9 @@ static int gmc_v8_0_set_clockgating_state(void *handle,
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	if (amdgpu_sriov_vf(adev))
+		return 0;
+
 	switch (adev->asic_type) {
 	case CHIP_FIJI:
 		fiji_update_mc_medium_grain_clock_gating(adev,
@@ -1451,6 +1454,9 @@ static void gmc_v8_0_get_clockgating_state(void *handle, u32 *flags)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 	int data;
 
+	if (amdgpu_sriov_vf(adev))
+		*flags = 0;
+
 	/* AMD_CG_SUPPORT_MC_MGCG */
 	data = RREG32(mmMC_HUB_MISC_HUB_CG);
 	if (data & MC_HUB_MISC_HUB_CG__ENABLE_MASK)
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 25602c4..9394ca6 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -1512,6 +1512,9 @@ static int sdma_v3_0_set_clockgating_state(void *handle,
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	if (amdgpu_sriov_vf(adev))
+		return 0;
+
 	switch (adev->asic_type) {
 	case CHIP_FIJI:
 	case CHIP_CARRIZO:
@@ -1538,6 +1541,9 @@ static void sdma_v3_0_get_clockgating_state(void *handle, u32 *flags)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 	int data;
 
+	if (amdgpu_sriov_vf(adev))
+		*flags = 0;
+
 	/* AMD_CG_SUPPORT_SDMA_MGCG */
 	data = RREG32(mmSDMA0_CLK_CTRL + sdma_offsets[0]);
 	if (!(data & SDMA0_CLK_CTRL__SOFT_OVERRIDE0_MASK))
diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
index 89b0dfe..aeef3c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/vi.c
@@ -1391,6 +1391,9 @@ static int vi_common_set_clockgating_state(void *handle,
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	if (amdgpu_sriov_vf(adev))
+		return 0;
+
 	switch (adev->asic_type) {
 	case CHIP_FIJI:
 		vi_update_bif_medium_grain_light_sleep(adev,
@@ -1435,6 +1438,9 @@ static void vi_common_get_clockgating_state(void *handle, u32 *flags)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 	int data;
 
+	if (amdgpu_sriov_vf(adev))
+		*flags = 0;
+
 	/* AMD_CG_SUPPORT_BIF_LS */
 	data = RREG32_PCIE(ixPCIE_CNTL2);
 	if (data & PCIE_CNTL2__SLV_MEM_LS_EN_MASK)
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 03/20] drm/damdgpu:add new mqd member in ring
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 02/20] drm/amdgpu:cg & pg are not applied on VF Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 04/20] drm/amdgpu:imple mqd soft ini/fini Monk Liu
                     ` (17 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Change-Id: If4dc6bb92d6a364125a568f37ea409e4c438e6a2
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index c813cbe..0e57b04 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -161,6 +161,8 @@ struct amdgpu_ring {
 	u32			pipe;
 	u32			queue;
 	struct amdgpu_bo	*mqd_obj;
+	uint64_t                mqd_gpu_addr;
+	struct vi_mqd           *mqd_ptr;
 	u32			doorbell_index;
 	bool			use_doorbell;
 	unsigned		wptr_offs;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 1e170ab..22bd155 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -4588,6 +4588,8 @@ static void gfx_v8_0_cp_compute_fini(struct amdgpu_device *adev)
 
 			amdgpu_bo_unref(&ring->mqd_obj);
 			ring->mqd_obj = NULL;
+			ring->mqd_ptr = NULL;
+			ring->mqd_gpu_addr = 0;
 		}
 	}
 }
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 04/20] drm/amdgpu:imple mqd soft ini/fini
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 02/20] drm/amdgpu:cg & pg are not applied on VF Monk Liu
  2017-02-07  6:11   ` [PATCH 03/20] drm/damdgpu:add new mqd member in ring Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-4-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 05/20] drm/amdgpu:bo_free_kernel will set ptr to NULL if freed Monk Liu
                     ` (16 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Change-Id: I650a78c8d27f76997e1ef6e3934d0d7e043d4715
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 52 +++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 22bd155..0e2c906 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -659,6 +659,8 @@ static u32 gfx_v8_0_get_csb_size(struct amdgpu_device *adev);
 static void gfx_v8_0_get_cu_info(struct amdgpu_device *adev);
 static void gfx_v8_0_ring_emit_ce_meta_init(struct amdgpu_ring *ring, uint64_t addr);
 static void gfx_v8_0_ring_emit_de_meta_init(struct amdgpu_ring *ring, uint64_t addr);
+static int gfx_v8_0_compute_mqd_soft_init(struct amdgpu_device *adev);
+static void gfx_v8_0_compute_mqd_soft_fini(struct amdgpu_device *adev);
 
 static void gfx_v8_0_init_golden_registers(struct amdgpu_device *adev)
 {
@@ -7322,3 +7324,53 @@ static void gfx_v8_0_ring_emit_de_meta_init(struct amdgpu_ring *ring, uint64_t c
 	amdgpu_ring_write(ring, upper_32_bits(de_payload_addr));
 	amdgpu_ring_write_multiple(ring, (void *)&de_payload, cnt_de - 2);
 }
+
+/* create MQD for each compute queue */
+static int gfx_v8_0_compute_mqd_soft_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_ring *ring = NULL;
+	int r, i;
+
+	/* create MQD for KIQ */
+	ring = &adev->gfx.kiq.ring;
+	if (!ring->mqd_obj) {
+		r = amdgpu_bo_create_kernel(adev, sizeof(struct vi_mqd), PAGE_SIZE,
+						AMDGPU_GEM_DOMAIN_GTT, &ring->mqd_obj,
+						&ring->mqd_gpu_addr, (void **)&ring->mqd_ptr);
+		if (r) {
+			dev_warn(adev->dev, "failed to create ring mqd ob (%d)", r);
+			return r;
+		}
+	}
+
+	/* create MQD for each KCQ */
+	for (i = 0; i < adev->gfx.num_compute_rings; i++)
+	{
+		ring = &adev->gfx.compute_ring[i];
+		if (!ring->mqd_obj) {
+			r = amdgpu_bo_create_kernel(adev, sizeof(struct vi_mqd), PAGE_SIZE,
+							AMDGPU_GEM_DOMAIN_GTT, &ring->mqd_obj,
+							&ring->mqd_gpu_addr, (void **)&ring->mqd_ptr);
+			if (r) {
+				dev_warn(adev->dev, "failed to create ring mqd ob (%d)", r);
+				return r;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static void gfx_v8_0_compute_mqd_soft_fini(struct amdgpu_device *adev)
+{
+	struct amdgpu_ring *ring = NULL;
+	int i;
+
+	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+		ring = &adev->gfx.compute_ring[i];
+		amdgpu_bo_free_kernel(&ring->mqd_obj, &ring->mqd_gpu_addr, (void **)&ring->mqd_ptr);
+	}
+
+	ring = &adev->gfx.kiq.ring;
+	amdgpu_bo_free_kernel(&ring->mqd_obj, &ring->mqd_gpu_addr, (void **)&ring->mqd_ptr);
+}
\ No newline at end of file
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 05/20] drm/amdgpu:bo_free_kernel will set ptr to NULL if freed
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (2 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 04/20] drm/amdgpu:imple mqd soft ini/fini Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-5-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 06/20] drm/amdgpu:no need use sriov judge Monk Liu
                     ` (15 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Change-Id: Iac592f1a6c927677008feabc1b7af6f18c580910
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 0e2c906..df1cfc5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1477,7 +1477,6 @@ static void gfx_v8_0_kiq_fini(struct amdgpu_device *adev)
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	amdgpu_bo_free_kernel(&kiq->eop_obj, &kiq->eop_gpu_addr, NULL);
-	kiq->eop_obj = NULL;
 }
 
 static int gfx_v8_0_kiq_init(struct amdgpu_device *adev)
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 06/20] drm/amdgpu:no need use sriov judge
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (3 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 05/20] drm/amdgpu:bo_free_kernel will set ptr to NULL if freed Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-6-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 07/20] drm/amdgpu:minor cleanup Monk Liu
                     ` (14 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Change-Id: I9717e200be8af36f52d6305e02ffea178044c851
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index df1cfc5..fd29124 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1379,11 +1379,9 @@ static int gfx_v8_0_kiq_init_ring(struct amdgpu_device *adev,
 {
 	int r = 0;
 
-	if (amdgpu_sriov_vf(adev)) {
-		r = amdgpu_wb_get(adev, &adev->virt.reg_val_offs);
-		if (r)
-			return r;
-	}
+	r = amdgpu_wb_get(adev, &adev->virt.reg_val_offs);
+	if (r)
+		return r;
 
 	ring->adev = NULL;
 	ring->ring_obj = NULL;
@@ -1407,13 +1405,10 @@ static int gfx_v8_0_kiq_init_ring(struct amdgpu_device *adev,
 
 	return r;
 }
-
 static void gfx_v8_0_kiq_free_ring(struct amdgpu_ring *ring,
 				   struct amdgpu_irq_src *irq)
 {
-	if (amdgpu_sriov_vf(ring->adev))
-		amdgpu_wb_free(ring->adev, ring->adev->virt.reg_val_offs);
-
+	amdgpu_wb_free(ring->adev, ring->adev->virt.reg_val_offs);
 	amdgpu_ring_fini(ring);
 	irq->data = NULL;
 }
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 07/20] drm/amdgpu:minor cleanup
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (4 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 06/20] drm/amdgpu:no need use sriov judge Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-7-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 08/20] drm/amdgpu:divide KCQ mqd init to sw and hw Monk Liu
                     ` (13 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Change-Id: Ia5ada3e9990261ca70b03655424e6290701cdb9d
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index fd29124..4029d32 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -4864,10 +4864,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring,
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	uint64_t eop_gpu_addr;
-	bool is_kiq = false;
-
-	if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
-		is_kiq = true;
+	bool is_kiq = (ring->funcs->type == AMDGPU_RING_TYPE_KIQ);
 
 	if (is_kiq) {
 		eop_gpu_addr = kiq->eop_gpu_addr;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 08/20] drm/amdgpu:divide KCQ mqd init to sw and hw
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (5 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 07/20] drm/amdgpu:minor cleanup Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-8-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset Monk Liu
                     ` (12 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

The sw part is only invoked once, during sw_init.
The hw part is invoked during first driver load and on each resume afterwards.

That way we don't allocate the mqd in hw_init/resume; we only keep
the mqd allocated in the sw_init routine, and the hw_init routine
only kmaps and sets it.

Change-Id: Ib0b788c71154e79819e8abb8daee9b9234a8eabb
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 107 +++++++++++++---------------------
 1 file changed, 42 insertions(+), 65 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 4029d32..6734e55 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -2116,17 +2116,6 @@ static int gfx_v8_0_sw_init(void *handle)
 		return r;
 	}
 
-	r = gfx_v8_0_kiq_init(adev);
-	if (r) {
-		DRM_ERROR("Failed to init KIQ BOs!\n");
-		return r;
-	}
-
-	kiq = &adev->gfx.kiq;
-	r = gfx_v8_0_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
-	if (r)
-		return r;
-
 	/* set up the gfx ring */
 	for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
 		ring = &adev->gfx.gfx_ring[i];
@@ -2169,6 +2158,24 @@ static int gfx_v8_0_sw_init(void *handle)
 			return r;
 	}
 
+	if (amdgpu_sriov_vf(adev)) {
+		r = gfx_v8_0_kiq_init(adev);
+		if (r) {
+			DRM_ERROR("Failed to init KIQ BOs!\n");
+			return r;
+		}
+
+		kiq = &adev->gfx.kiq;
+		r = gfx_v8_0_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
+		if (r)
+			return r;
+
+		/* create MQD for all compute queues as well as KIQ for SRIOV case */
+		r = gfx_v8_0_compute_mqd_soft_init(adev);
+		if (r)
+			return r;
+	}
+
 	/* reserve GDS, GWS and OA resource for gfx */
 	r = amdgpu_bo_create_kernel(adev, adev->gds.mem.gfx_partition_size,
 				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
@@ -2210,9 +2217,13 @@ static int gfx_v8_0_sw_fini(void *handle)
 		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
 	for (i = 0; i < adev->gfx.num_compute_rings; i++)
 		amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
-	gfx_v8_0_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);
 
-	gfx_v8_0_kiq_fini(adev);
+	if (amdgpu_sriov_vf(adev)) {
+		gfx_v8_0_compute_mqd_soft_fini(adev);
+		gfx_v8_0_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);
+		gfx_v8_0_kiq_fini(adev);
+	}
+
 	gfx_v8_0_mec_fini(adev);
 	gfx_v8_0_rlc_fini(adev);
 	gfx_v8_0_free_microcode(adev);
@@ -4892,70 +4903,37 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring,
 	return 0;
 }
 
-static void gfx_v8_0_kiq_free_queue(struct amdgpu_device *adev)
+static int gfx_v8_0_kiq_resume(struct amdgpu_device *adev)
 {
 	struct amdgpu_ring *ring = NULL;
-	int i;
+	int r = 0, i;
 
-	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
-		ring = &adev->gfx.compute_ring[i];
-		amdgpu_bo_free_kernel(&ring->mqd_obj, NULL, NULL);
-		ring->mqd_obj = NULL;
-	}
+	gfx_v8_0_cp_compute_enable(adev, true);
 
 	ring = &adev->gfx.kiq.ring;
-	amdgpu_bo_free_kernel(&ring->mqd_obj, NULL, NULL);
-	ring->mqd_obj = NULL;
-}
-
-static int gfx_v8_0_kiq_setup_queue(struct amdgpu_device *adev,
-				    struct amdgpu_ring *ring)
-{
-	struct vi_mqd *mqd;
-	u64 mqd_gpu_addr;
-	u32 *buf;
-	int r = 0;
-
-	r = amdgpu_bo_create_kernel(adev, sizeof(struct vi_mqd), PAGE_SIZE,
-				    AMDGPU_GEM_DOMAIN_GTT, &ring->mqd_obj,
-				    &mqd_gpu_addr, (void **)&buf);
-	if (r) {
-		dev_warn(adev->dev, "failed to create ring mqd ob (%d)", r);
+	if (!amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr)) {
+		memset((void *)ring->mqd_ptr, 0, sizeof(struct vi_mqd));
+		r = gfx_v8_0_kiq_init_queue(ring, ring->mqd_ptr, ring->mqd_gpu_addr);
+		amdgpu_bo_kunmap(ring->mqd_obj);
+		if (r)
+			return r;
+	} else {
 		return r;
 	}
 
-	/* init the mqd struct */
-	memset(buf, 0, sizeof(struct vi_mqd));
-	mqd = (struct vi_mqd *)buf;
-
-	r = gfx_v8_0_kiq_init_queue(ring, mqd, mqd_gpu_addr);
-	if (r)
-		return r;
-
-	amdgpu_bo_kunmap(ring->mqd_obj);
-
-	return 0;
-}
-
-static int gfx_v8_0_kiq_resume(struct amdgpu_device *adev)
-{
-	struct amdgpu_ring *ring = NULL;
-	int r, i;
-
-	ring = &adev->gfx.kiq.ring;
-	r = gfx_v8_0_kiq_setup_queue(adev, ring);
-	if (r)
-		return r;
-
 	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
 		ring = &adev->gfx.compute_ring[i];
-		r = gfx_v8_0_kiq_setup_queue(adev, ring);
-		if (r)
+		if (!amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr)) {
+			memset((void *)ring->mqd_ptr, 0, sizeof(struct vi_mqd));
+			r = gfx_v8_0_kiq_init_queue(ring, ring->mqd_ptr, ring->mqd_gpu_addr);
+			amdgpu_bo_kunmap(ring->mqd_obj);
+			if (r)
+			return r;
+		} else {
 			return r;
+		}
 	}
 
-	gfx_v8_0_cp_compute_enable(adev, true);
-
 	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
 		ring = &adev->gfx.compute_ring[i];
 
@@ -5316,7 +5294,6 @@ static int gfx_v8_0_hw_fini(void *handle)
 	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
 	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
 	if (amdgpu_sriov_vf(adev)) {
-		gfx_v8_0_kiq_free_queue(adev);
 		pr_debug("For SRIOV client, shouldn't do anything.\n");
 		return 0;
 	}
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (6 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 08/20] drm/amdgpu:divide KCQ mqd init to sw and hw Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-9-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 10/20] drm/amdgpu:change kiq lock name Monk Liu
                     ` (11 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 158 ++++++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1 +
 2 files changed, 158 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e926f84..2b404ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1604,6 +1604,53 @@ int amdgpu_suspend(struct amdgpu_device *adev)
 	return 0;
 }
 
+static int amdgpu_resume_early(struct amdgpu_device *adev)
+{
+	int i, r;
+
+	for (i = 0; i < adev->num_ip_blocks; i++) {
+		if (!adev->ip_blocks[i].status.valid)
+			continue;
+
+		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+				adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+				adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
+			r = adev->ip_blocks[i].version->funcs->resume(adev);
+
+		if (r) {
+			DRM_ERROR("resume of IP block <%s> failed %d\n",
+				  adev->ip_blocks[i].version->funcs->name, r);
+			return r;
+		}
+	}
+
+	return 0;
+}
+
+static int amdgpu_resume_late(struct amdgpu_device *adev)
+{
+	int i, r;
+
+	for (i = 0; i < adev->num_ip_blocks; i++) {
+		if (!adev->ip_blocks[i].status.valid)
+			continue;
+
+		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+				adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+				adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH )
+			continue;
+
+		r = adev->ip_blocks[i].version->funcs->resume(adev);
+		if (r) {
+			DRM_ERROR("resume of IP block <%s> failed %d\n",
+				  adev->ip_blocks[i].version->funcs->name, r);
+			return r;
+		}
+	}
+
+	return 0;
+}
+
 static int amdgpu_resume(struct amdgpu_device *adev)
 {
 	int i, r;
@@ -2343,6 +2390,115 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
 }
 
 /**
+ * amdgpu_sriov_gpu_reset - reset the asic
+ *
+ * @adev: amdgpu device pointer
+ * @voluntary: if this reset is requested by guest.
+ *             (true means by guest and false means by HYPERVISOR )
+ *
+ * Attempt the reset the GPU if it has hung (all asics).
+ * for SRIOV case.
+ * Returns 0 for success or an error on failure.
+ */
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
+{
+	int i, r = 0;
+	int resched;
+	struct amdgpu_bo *bo, *tmp;
+	struct amdgpu_ring *ring;
+	struct fence *fence = NULL, *next = NULL;
+
+	mutex_lock(&adev->virt.lock_reset);
+	atomic_inc(&adev->gpu_reset_counter);
+
+	/* block TTM */
+	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+
+	/* block scheduler */
+	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+		ring = adev->rings[i];
+
+		if (!ring || !ring->sched.thread)
+			continue;
+
+		kthread_park(ring->sched.thread);
+		amd_sched_hw_job_reset(&ring->sched);
+	}
+
+	/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
+	amdgpu_fence_driver_force_completion(adev);
+
+	/* request to take full control of GPU before re-initialization  */
+	if (voluntary)
+		amdgpu_virt_reset_gpu(adev);
+	else
+		amdgpu_virt_request_full_gpu(adev, true);
+
+
+	/* Resume IP prior to SMC */
+	amdgpu_resume_early(adev);
+
+	/* we need recover gart prior to run SMC/CP/SDMA resume */
+	amdgpu_ttm_recover_gart(adev);
+
+	/* now we are okay to resume SMC/CP/SDMA */
+	amdgpu_resume_late(adev);
+
+	amdgpu_irq_gpu_reset_resume_helper(adev);
+
+	if (amdgpu_ib_ring_tests(adev))
+		dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
+
+	/* release full control of GPU after ib test */
+	amdgpu_virt_release_full_gpu(adev, true);
+
+	DRM_INFO("recover vram bo from shadow\n");
+
+	ring = adev->mman.buffer_funcs_ring;
+	mutex_lock(&adev->shadow_list_lock);
+	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+		amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
+		if (fence) {
+			r = fence_wait(fence, false);
+			if (r) {
+				WARN(r, "recovery from shadow isn't completed\n");
+				break;
+			}
+		}
+
+		fence_put(fence);
+		fence = next;
+	}
+	mutex_unlock(&adev->shadow_list_lock);
+
+	if (fence) {
+		r = fence_wait(fence, false);
+		if (r)
+			WARN(r, "recovery from shadow isn't completed\n");
+	}
+	fence_put(fence);
+
+	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+		struct amdgpu_ring *ring = adev->rings[i];
+		if (!ring || !ring->sched.thread)
+			continue;
+
+		amd_sched_job_recovery(&ring->sched);
+		kthread_unpark(ring->sched.thread);
+	}
+
+	drm_helper_resume_force_mode(adev->ddev);
+	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
+	if (r) {
+		/* bad news, how to tell it to userspace ? */
+		dev_info(adev->dev, "GPU reset failed\n");
+	}
+
+	mutex_unlock(&adev->virt.lock_reset);
+	return r;
+}
+
+/**
  * amdgpu_gpu_reset - reset the asic
  *
  * @adev: amdgpu device pointer
@@ -2358,7 +2514,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 	bool need_full_reset;
 
 	if (amdgpu_sriov_vf(adev))
-		return 0;
+		return amdgpu_sriov_gpu_reset(adev, true);
 
 	if (!amdgpu_check_soft_reset(adev)) {
 		DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 675e12c..73d24df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -89,5 +89,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary);
 
 #endif
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 10/20] drm/amdgpu:change kiq lock name
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (7 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-10-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 11/20] drm/amdgpu:add lock_reset for SRIOV Monk Liu
                     ` (10 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Change-Id: Ib11de11fb0a9e8086e542b932c9c62d5aa40ebb2
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 10 +++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 76ef641..82a70db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -98,7 +98,7 @@ void amdgpu_virt_init_setting(struct amdgpu_device *adev)
 	adev->mode_info.num_crtc = 1;
 	adev->enable_virtual_display = true;
 
-	mutex_init(&adev->virt.lock);
+	mutex_init(&adev->virt.lock_kiq);
 }
 
 uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
@@ -111,14 +111,14 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 
 	BUG_ON(!ring->funcs->emit_rreg);
 
-	mutex_lock(&adev->virt.lock);
+	mutex_lock(&adev->virt.lock_kiq);
 	amdgpu_ring_alloc(ring, 32);
 	amdgpu_ring_emit_hdp_flush(ring);
 	amdgpu_ring_emit_rreg(ring, reg);
 	amdgpu_ring_emit_hdp_invalidate(ring);
 	amdgpu_fence_emit(ring, &f);
 	amdgpu_ring_commit(ring);
-	mutex_unlock(&adev->virt.lock);
+	mutex_unlock(&adev->virt.lock_kiq);
 
 	r = fence_wait(f, false);
 	if (r)
@@ -139,14 +139,14 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 
 	BUG_ON(!ring->funcs->emit_wreg);
 
-	mutex_lock(&adev->virt.lock);
+	mutex_lock(&adev->virt.lock_kiq);
 	amdgpu_ring_alloc(ring, 32);
 	amdgpu_ring_emit_hdp_flush(ring);
 	amdgpu_ring_emit_wreg(ring, reg, v);
 	amdgpu_ring_emit_hdp_invalidate(ring);
 	amdgpu_fence_emit(ring, &f);
 	amdgpu_ring_commit(ring);
-	mutex_unlock(&adev->virt.lock);
+	mutex_unlock(&adev->virt.lock_kiq);
 
 	r = fence_wait(f, false);
 	if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 73d24df..7020ff2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -46,7 +46,7 @@ struct amdgpu_virt {
 	uint64_t			csa_vmid0_addr;
 	bool chained_ib_support;
 	uint32_t			reg_val_offs;
-	struct mutex			lock;
+	struct mutex			lock_kiq;
 	struct amdgpu_irq_src		ack_irq;
 	struct amdgpu_irq_src		rcv_irq;
 	struct delayed_work		flr_work;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 11/20] drm/amdgpu:add lock_reset for SRIOV
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (8 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 10/20] drm/amdgpu:change kiq lock name Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-11-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 12/20] drm/amdgpu:impl mm_r/weg_nokiq Monk Liu
                     ` (9 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

this lock is used for sriov_gpu_reset; only the holder of this mutex
may run sriov_gpu_reset.

we have two cases that trigger gpu_reset for SRIOV:
1) a submit timed out and we trigger reset voluntarily
2) the hypervisor found a world-switch hang, triggered FLR, and notified
   us to do a gpu reset.

both cases need care, and we need a mutex to protect the consistency of
the reset routine.

Change-Id: I37aabccfaef1cde32dc350062a519d32c9d51c02
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 82a70db..ac035ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -99,6 +99,7 @@ void amdgpu_virt_init_setting(struct amdgpu_device *adev)
 	adev->enable_virtual_display = true;
 
 	mutex_init(&adev->virt.lock_kiq);
+	mutex_init(&adev->virt.lock_reset);
 }
 
 uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 7020ff2..4b05568 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -47,6 +47,7 @@ struct amdgpu_virt {
 	bool chained_ib_support;
 	uint32_t			reg_val_offs;
 	struct mutex			lock_kiq;
+	struct mutex                    lock_reset;
 	struct amdgpu_irq_src		ack_irq;
 	struct amdgpu_irq_src		rcv_irq;
 	struct delayed_work		flr_work;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 12/20] drm/amdgpu:impl mm_r/weg_nokiq
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (9 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 11/20] drm/amdgpu:add lock_reset for SRIOV Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-12-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 13/20] Refine handshake between guest and host by mailbox Monk Liu
                     ` (8 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

some registers are PF & VF copies, so we can safely use
the MMIO method to access them.

and sometimes we are forbidden to use KIQ to access registers,
e.g. in interrupt context.

Change-Id: Ie6dc323dc86829a4a6ceb7073c269b106b534c4a
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 10 +++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 36 ++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 402a895..5dd0615 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1509,6 +1509,11 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 void amdgpu_device_fini(struct amdgpu_device *adev);
 int amdgpu_gpu_wait_for_idle(struct amdgpu_device *adev);
 
+uint32_t amdgpu_mm_rreg_nokiq(struct amdgpu_device *adev, uint32_t reg,
+			bool always_indirect);
+void amdgpu_mm_wreg_nokiq(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
+		    bool always_indirect);
+
 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
 			bool always_indirect);
 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
@@ -1523,6 +1528,11 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
 /*
  * Registers read & write functions.
  */
+#define RREG32_nokiq(reg) amdgpu_mm_rreg_nokiq(adev, (reg), false)
+#define RREG32_IDX_nokiq(reg) amdgpu_mm_rreg(adev, (reg), true)
+#define WREG32_nokiq(reg, v) amdgpu_mm_wreg_nokiq(adev, (reg), (v), false)
+#define WREG32_IDX_nokiq(reg, v) amdgpu_mm_wreg_nokiq(adev, (reg), (v), true)
+
 #define RREG32(reg) amdgpu_mm_rreg(adev, (reg), false)
 #define RREG32_IDX(reg) amdgpu_mm_rreg(adev, (reg), true)
 #define DREG32(reg) printk(KERN_INFO "REGISTER: " #reg " : 0x%08X\n", amdgpu_mm_rreg(adev, (reg), false))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2b404ca..d5870d0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -136,6 +136,42 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 	}
 }
 
+uint32_t amdgpu_mm_rreg_nokiq(struct amdgpu_device *adev, uint32_t reg,
+			bool always_indirect)
+{
+	uint32_t ret;
+
+	if ((reg * 4) < adev->rmmio_size && !always_indirect)
+		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
+	else {
+		unsigned long flags;
+
+		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
+		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
+		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
+		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
+	}
+	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
+	return ret;
+}
+
+void amdgpu_mm_wreg_nokiq(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
+		    bool always_indirect)
+{
+	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
+
+	if ((reg * 4) < adev->rmmio_size && !always_indirect)
+		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
+	else {
+		unsigned long flags;
+
+		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
+		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
+		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
+		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
+	}
+}
+
 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 {
 	if ((reg * 4) < adev->rio_mem_size)
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 13/20] Refine handshake between guest and host by mailbox
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (10 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 12/20] drm/amdgpu:impl mm_r/weg_nokiq Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-13-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 14/20] drm/amdgpu:use nokiq version mm access Monk Liu
                     ` (7 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Ken Xue

From: Ken Xue <Ken.Xue@amd.com>

Change-Id: If3a7d05824847234759b86563e8052949e171972
Signed-off-by: Ken Xue <Ken.Xue@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index d2622b6..5fe4aad 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -318,10 +318,25 @@ void xgpu_vi_init_golden_registers(struct amdgpu_device *adev)
 static void xgpu_vi_mailbox_send_ack(struct amdgpu_device *adev)
 {
 	u32 reg;
+	int timeout = VI_MAILBOX_TIMEDOUT;
+	u32 mask = REG_FIELD_MASK(MAILBOX_CONTROL, RCV_MSG_VALID);
 
 	reg = RREG32(mmMAILBOX_CONTROL);
 	reg = REG_SET_FIELD(reg, MAILBOX_CONTROL, RCV_MSG_ACK, 1);
 	WREG32(mmMAILBOX_CONTROL, reg);
+
+	/*Wait for RCV_MSG_VALID to be 0*/
+	reg = RREG32(mmMAILBOX_CONTROL);
+	while (reg & mask) {
+		if (timeout <= 0) {
+			pr_err("RCV_MSG_VALID is not cleared\n");
+			break;
+		}
+		mdelay(1);
+		timeout -=1;
+
+		reg = RREG32(mmMAILBOX_CONTROL);
+	}
 }
 
 static void xgpu_vi_mailbox_set_valid(struct amdgpu_device *adev, bool val)
@@ -339,6 +354,8 @@ static void xgpu_vi_mailbox_trans_msg(struct amdgpu_device *adev,
 {
 	u32 reg;
 
+	xgpu_vi_mailbox_send_ack(adev);
+
 	reg = RREG32(mmMAILBOX_MSGBUF_TRN_DW0);
 	reg = REG_SET_FIELD(reg, MAILBOX_MSGBUF_TRN_DW0,
 			    MSGBUF_DATA, event);
@@ -351,6 +368,11 @@ static int xgpu_vi_mailbox_rcv_msg(struct amdgpu_device *adev,
 				   enum idh_event event)
 {
 	u32 reg;
+	u32 mask = REG_FIELD_MASK(MAILBOX_CONTROL, RCV_MSG_VALID);
+
+	reg = RREG32(mmMAILBOX_CONTROL);
+	if (!(reg & mask))
+		return -ENOENT;
 
 	reg = RREG32(mmMAILBOX_MSGBUF_RCV_DW0);
 	if (reg != event)
@@ -419,7 +441,9 @@ static int xgpu_vi_send_access_requests(struct amdgpu_device *adev,
 	xgpu_vi_mailbox_set_valid(adev, false);
 
 	/* start to check msg if request is idh_req_gpu_init_access */
-	if (request == IDH_REQ_GPU_INIT_ACCESS) {
+	if (request == IDH_REQ_GPU_INIT_ACCESS ||
+		request == IDH_REQ_GPU_FINI_ACCESS ||
+		request == IDH_REQ_GPU_RESET_ACCESS) {
 		r = xgpu_vi_poll_msg(adev, IDH_READY_TO_ACCESS_GPU);
 		if (r)
 			return r;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 14/20] drm/amdgpu:use nokiq version mm access
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (11 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 13/20] Refine handshake between guest and host by mailbox Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-14-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 15/20] drm/amdgpu:use work instead of delay-work Monk Liu
                     ` (6 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Change-Id: I383d7ce858a136d7b112180f86e3d632d37b4d1c
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 5fe4aad..4e9e0bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -321,12 +321,12 @@ static void xgpu_vi_mailbox_send_ack(struct amdgpu_device *adev)
 	int timeout = VI_MAILBOX_TIMEDOUT;
 	u32 mask = REG_FIELD_MASK(MAILBOX_CONTROL, RCV_MSG_VALID);
 
-	reg = RREG32(mmMAILBOX_CONTROL);
+	reg = RREG32_nokiq(mmMAILBOX_CONTROL);
 	reg = REG_SET_FIELD(reg, MAILBOX_CONTROL, RCV_MSG_ACK, 1);
-	WREG32(mmMAILBOX_CONTROL, reg);
+	WREG32_nokiq(mmMAILBOX_CONTROL, reg);
 
 	/*Wait for RCV_MSG_VALID to be 0*/
-	reg = RREG32(mmMAILBOX_CONTROL);
+	reg = RREG32_nokiq(mmMAILBOX_CONTROL);
 	while (reg & mask) {
 		if (timeout <= 0) {
 			pr_err("RCV_MSG_VALID is not cleared\n");
@@ -335,7 +335,7 @@ static void xgpu_vi_mailbox_send_ack(struct amdgpu_device *adev)
 		mdelay(1);
 		timeout -=1;
 
-		reg = RREG32(mmMAILBOX_CONTROL);
+		reg = RREG32_nokiq(mmMAILBOX_CONTROL);
 	}
 }
 
@@ -343,10 +343,10 @@ static void xgpu_vi_mailbox_set_valid(struct amdgpu_device *adev, bool val)
 {
 	u32 reg;
 
-	reg = RREG32(mmMAILBOX_CONTROL);
+	reg = RREG32_nokiq(mmMAILBOX_CONTROL);
 	reg = REG_SET_FIELD(reg, MAILBOX_CONTROL,
 			    TRN_MSG_VALID, val ? 1 : 0);
-	WREG32(mmMAILBOX_CONTROL, reg);
+	WREG32_nokiq(mmMAILBOX_CONTROL, reg);
 }
 
 static void xgpu_vi_mailbox_trans_msg(struct amdgpu_device *adev,
@@ -356,10 +356,10 @@ static void xgpu_vi_mailbox_trans_msg(struct amdgpu_device *adev,
 
 	xgpu_vi_mailbox_send_ack(adev);
 
-	reg = RREG32(mmMAILBOX_MSGBUF_TRN_DW0);
+	reg = RREG32_nokiq(mmMAILBOX_MSGBUF_TRN_DW0);
 	reg = REG_SET_FIELD(reg, MAILBOX_MSGBUF_TRN_DW0,
 			    MSGBUF_DATA, event);
-	WREG32(mmMAILBOX_MSGBUF_TRN_DW0, reg);
+	WREG32_nokiq(mmMAILBOX_MSGBUF_TRN_DW0, reg);
 
 	xgpu_vi_mailbox_set_valid(adev, true);
 }
@@ -370,11 +370,11 @@ static int xgpu_vi_mailbox_rcv_msg(struct amdgpu_device *adev,
 	u32 reg;
 	u32 mask = REG_FIELD_MASK(MAILBOX_CONTROL, RCV_MSG_VALID);
 
-	reg = RREG32(mmMAILBOX_CONTROL);
+	reg = RREG32_nokiq(mmMAILBOX_CONTROL);
 	if (!(reg & mask))
 		return -ENOENT;
 
-	reg = RREG32(mmMAILBOX_MSGBUF_RCV_DW0);
+	reg = RREG32_nokiq(mmMAILBOX_MSGBUF_RCV_DW0);
 	if (reg != event)
 		return -ENOENT;
 
@@ -390,7 +390,7 @@ static int xgpu_vi_poll_ack(struct amdgpu_device *adev)
 	u32 mask = REG_FIELD_MASK(MAILBOX_CONTROL, TRN_MSG_ACK);
 	u32 reg;
 
-	reg = RREG32(mmMAILBOX_CONTROL);
+	reg = RREG32_nokiq(mmMAILBOX_CONTROL);
 	while (!(reg & mask)) {
 		if (timeout <= 0) {
 			pr_err("Doesn't get ack from pf.\n");
@@ -400,7 +400,7 @@ static int xgpu_vi_poll_ack(struct amdgpu_device *adev)
 		msleep(1);
 		timeout -= 1;
 
-		reg = RREG32(mmMAILBOX_CONTROL);
+		reg = RREG32_nokiq(mmMAILBOX_CONTROL);
 	}
 
 	return r;
@@ -492,11 +492,11 @@ static int xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
 				       unsigned type,
 				       enum amdgpu_interrupt_state state)
 {
-	u32 tmp = RREG32(mmMAILBOX_INT_CNTL);
+	u32 tmp = RREG32_nokiq(mmMAILBOX_INT_CNTL);
 
 	tmp = REG_SET_FIELD(tmp, MAILBOX_INT_CNTL, ACK_INT_EN,
 			    (state == AMDGPU_IRQ_STATE_ENABLE) ? 1 : 0);
-	WREG32(mmMAILBOX_INT_CNTL, tmp);
+	WREG32_nokiq(mmMAILBOX_INT_CNTL, tmp);
 
 	return 0;
 }
@@ -521,11 +521,11 @@ static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
 				       unsigned type,
 				       enum amdgpu_interrupt_state state)
 {
-	u32 tmp = RREG32(mmMAILBOX_INT_CNTL);
+	u32 tmp = RREG32_nokiq(mmMAILBOX_INT_CNTL);
 
 	tmp = REG_SET_FIELD(tmp, MAILBOX_INT_CNTL, VALID_INT_EN,
 			    (state == AMDGPU_IRQ_STATE_ENABLE) ? 1 : 0);
-	WREG32(mmMAILBOX_INT_CNTL, tmp);
+	WREG32_nokiq(mmMAILBOX_INT_CNTL, tmp);
 
 	return 0;
 }
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 15/20] drm/amdgpu:use work instead of delay-work
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (12 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 14/20] drm/amdgpu:use nokiq version mm access Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-15-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 16/20] drm/amdgpu:RUNTIME flag should clr later Monk Liu
                     ` (5 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Change-Id: I41b6336baa00b1fd299311349402a17951b585a2
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c    | 36 +++++++++++++++-----------------
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 4b05568..846f29c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -50,7 +50,7 @@ struct amdgpu_virt {
 	struct mutex                    lock_reset;
 	struct amdgpu_irq_src		ack_irq;
 	struct amdgpu_irq_src		rcv_irq;
-	struct delayed_work		flr_work;
+	struct work_struct		flr_work;
 	const struct amdgpu_virt_ops	*ops;
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 4e9e0bb..53fa590c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -503,17 +503,19 @@ static int xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
 
 static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
 {
-	struct amdgpu_virt *virt = container_of(work,
-					struct amdgpu_virt, flr_work.work);
-	struct amdgpu_device *adev = container_of(virt,
-					struct amdgpu_device, virt);
-	int r = 0;
-
-	r = xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL);
-	if (r)
-		DRM_ERROR("failed to get flr cmpl msg from hypervior.\n");
+	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
+	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+
+	/* wait until RCV_MSG become 3 */
+	if (!xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL))
+		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
+	else {
+		pr_err("failed to recieve FLR_CMPL\n");
+		return;
+	}
 
-	/* TODO: need to restore gfx states */
+	/* Trigger recovery due to world switch failure */
+	amdgpu_sriov_gpu_reset(adev, false);
 }
 
 static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -536,15 +538,12 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
 {
 	int r;
 
-	adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
+	/* see what event we get */
 	r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
-	/* do nothing for other msg */
-	if (r)
-		return 0;
 
-	/* TODO: need to save gfx states */
-	schedule_delayed_work(&adev->virt.flr_work,
-			      msecs_to_jiffies(VI_MAILBOX_RESET_TIME));
+	/* only handle FLR_NOTIFY now */
+	if (!r)
+		schedule_work(&adev->virt.flr_work);
 
 	return 0;
 }
@@ -597,14 +596,13 @@ int xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
 		return r;
 	}
 
-	INIT_DELAYED_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
+	INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
 
 	return 0;
 }
 
 void xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
 {
-	cancel_delayed_work_sync(&adev->virt.flr_work);
 	amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
 	amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
 }
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 16/20] drm/amdgpu:RUNTIME flag should clr later
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (13 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 15/20] drm/amdgpu:use work instead of delay-work Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-16-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 17/20] drm/amdgpu:new field is_load_stage introduced Monk Liu
                     ` (4 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

this flag will be cleared by the request-gpu-access handshake

Change-Id: Ie484bb0141420055370e019dcd8c110fb34f8a1b
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 53fa590c..64d2fd0 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -507,9 +507,8 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 
 	/* wait until RCV_MSG become 3 */
-	if (!xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL))
-		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
-	else {
+	if (xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL))
+	{
 		pr_err("failed to recieve FLR_CMPL\n");
 		return;
 	}
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 17/20] drm/amdgpu:new field is_load_stage introduced
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (14 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 16/20] drm/amdgpu:RUNTIME flag should clr later Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-17-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 18/20] drm/amdgpu:alloc mqd backup Monk Liu
                     ` (3 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

use it to separate the first driver load from later reset/resume

Change-Id: I991e0da52ccd197716d279bf9014de46d39acfea
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 5dd0615..bdb47f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1493,6 +1493,7 @@ struct amdgpu_device {
 	/* link all gtt */
 	spinlock_t			gtt_list_lock;
 	struct list_head                gtt_list;
+	bool	is_load_stage;
 
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d5870d0..5be0481 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1800,6 +1800,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
+	adev->is_load_stage = true;
 
 
 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
@@ -2010,6 +2011,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 		goto failed;
 	}
 
+	adev->is_load_stage = false;
 	return 0;
 
 failed:
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 18/20] drm/amdgpu:alloc mqd backup
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (15 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 17/20] drm/amdgpu:new field is_load_stage introduced Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-18-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 19/20] drm/amdgpu:use nop to clear ring buffer Monk Liu
                     ` (2 subsequent siblings)
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Change-Id: I84f821faa657a5d942c33d30f206eb66b579c2f8
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bdb47f7..a801fde 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -781,6 +781,7 @@ struct amdgpu_mec {
 	u32 num_pipe;
 	u32 num_mec;
 	u32 num_queue;
+	struct vi_mqd	*mqd_backup[AMDGPU_MAX_COMPUTE_RINGS + 1];
 };
 
 struct amdgpu_kiq {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 6734e55..5f688d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -7309,6 +7309,11 @@ static int gfx_v8_0_compute_mqd_soft_init(struct amdgpu_device *adev)
 			dev_warn(adev->dev, "failed to create ring mqd ob (%d)", r);
 			return r;
 		}
+
+		/* prepare MQD backup */
+		adev->gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS] = kmalloc(sizeof(struct vi_mqd), GFP_KERNEL);
+		if (!adev->gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS])
+				dev_warn(adev->dev, "no memory to create MQD backup for ring %s\n", ring->name);
 	}
 
 	/* create MQD for each KCQ */
@@ -7323,6 +7328,11 @@ static int gfx_v8_0_compute_mqd_soft_init(struct amdgpu_device *adev)
 				dev_warn(adev->dev, "failed to create ring mqd ob (%d)", r);
 				return r;
 			}
+
+			/* prepare MQD backup */
+			adev->gfx.mec.mqd_backup[i] = kmalloc(sizeof(struct vi_mqd), GFP_KERNEL);
+			if (!adev->gfx.mec.mqd_backup[i])
+				dev_warn(adev->dev, "no memory to create MQD backup for ring %s\n", ring->name);
 		}
 	}
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 19/20] drm/amdgpu:use nop to clear ring buffer
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (16 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 18/20] drm/amdgpu:alloc mqd backup Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-19-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:11   ` [PATCH 20/20] drm/amdgpu:fix kiq_resume routine Monk Liu
  2017-02-07 15:27   ` [PATCH 01/20] drm/amdgpu:fix powerplay logic Deucher, Alexander
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

this is for a clean GPU reset/resume, which should
clear the ring buffer with NOPs prior to kicking off the engine.

it also uses the same clear macro in ring_init.

Change-Id: I7693891fd4431d64c025d052f1dd0ba797f2f0b7
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 7 +++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 1 +
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c   | 1 +
 4 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 7bacf3c..37d8422 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -230,7 +230,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
 			dev_err(adev->dev, "(%d) ring create failed\n", r);
 			return r;
 		}
-		memset((void *)ring->ring, 0, ring->ring_size);
+		amdgpu_ring_clear_ring(ring);
 	}
 	ring->ptr_mask = (ring->ring_size / 4) - 1;
 	ring->max_dw = max_dw;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 0e57b04..3fd4ce8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -186,5 +186,12 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
 		     unsigned ring_size, struct amdgpu_irq_src *irq_src,
 		     unsigned irq_type);
 void amdgpu_ring_fini(struct amdgpu_ring *ring);
+static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
+{
+	int i = 0;
+	while (i <= ring->ptr_mask)
+		ring->ring[i++] = ring->funcs->nop;
+
+}
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 5f688d4..0ce00ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -4509,6 +4509,7 @@ static int gfx_v8_0_cp_gfx_resume(struct amdgpu_device *adev)
 	}
 
 	/* start the ring */
+	amdgpu_ring_clear_ring(ring);
 	gfx_v8_0_cp_gfx_start(adev);
 	ring->ready = true;
 	r = amdgpu_ring_test_ring(ring);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 9394ca6..d5206f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -615,6 +615,7 @@ static int sdma_v3_0_gfx_resume(struct amdgpu_device *adev)
 
 	for (i = 0; i < adev->sdma.num_instances; i++) {
 		ring = &adev->sdma.instance[i].ring;
+		amdgpu_ring_clear_ring(ring);
 		wb_offset = (ring->rptr_offs * 4);
 
 		mutex_lock(&adev->srbm_mutex);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 20/20] drm/amdgpu:fix kiq_resume routine
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (17 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 19/20] drm/amdgpu:use nop to clear ring buffer Monk Liu
@ 2017-02-07  6:11   ` Monk Liu
       [not found]     ` <1486447878-20521-20-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07 15:27   ` [PATCH 01/20] drm/amdgpu:fix powerplay logic Deucher, Alexander
  19 siblings, 1 reply; 51+ messages in thread
From: Monk Liu @ 2017-02-07  6:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

use is_load_stage to fix the compute ring test failure issue
which occurred after FLR/gpu_reset.

we need to back up a clean copy of the MQD that was created at
driver load time, and reuse it in the resume stage; otherwise both
the KCQ and KIQ may fail the ring/IB tests.

Change-Id: I41be940454a6638e9a8a05f096601eaa1fbebaab
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 44 ++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 0ce00ff..4a641d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -4877,24 +4877,46 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring,
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	uint64_t eop_gpu_addr;
 	bool is_kiq = (ring->funcs->type == AMDGPU_RING_TYPE_KIQ);
+	int mqd_idx = AMDGPU_MAX_COMPUTE_RINGS;
 
 	if (is_kiq) {
 		eop_gpu_addr = kiq->eop_gpu_addr;
 		gfx_v8_0_kiq_setting(&kiq->ring);
-	} else
+	} else {
 		eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr +
 					ring->queue * MEC_HPD_SIZE;
+		mqd_idx = ring - &adev->gfx.compute_ring[0];
+	}
 
-	mutex_lock(&adev->srbm_mutex);
-	vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+	if (adev->is_load_stage) {
+		memset((void *)mqd, 0, sizeof(*mqd));
+		mutex_lock(&adev->srbm_mutex);
+		vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+		gfx_v8_0_mqd_init(adev, mqd, mqd_gpu_addr, eop_gpu_addr, ring);
+		if (is_kiq)
+			gfx_v8_0_kiq_init_register(adev, mqd, ring);
+		vi_srbm_select(adev, 0, 0, 0, 0);
+		mutex_unlock(&adev->srbm_mutex);
 
-	gfx_v8_0_mqd_init(adev, mqd, mqd_gpu_addr, eop_gpu_addr, ring);
+		if (adev->gfx.mec.mqd_backup[mqd_idx])
+			memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(*mqd));
+	} else { /* for GPU_RESET case */
+		/* reset MQD to a clean status */
+		if (adev->gfx.mec.mqd_backup[mqd_idx])
+			memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
 
-	if (is_kiq)
-		gfx_v8_0_kiq_init_register(adev, mqd, ring);
-
-	vi_srbm_select(adev, 0, 0, 0, 0);
-	mutex_unlock(&adev->srbm_mutex);
+		/* reset ring buffer */
+		ring->wptr = 0;
+		amdgpu_ring_clear_ring(ring);
+
+		if (is_kiq) {
+		    mutex_lock(&adev->srbm_mutex);
+		    vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+		    gfx_v8_0_kiq_init_register(adev, mqd, ring);
+		    vi_srbm_select(adev, 0, 0, 0, 0);
+		    mutex_unlock(&adev->srbm_mutex);
+		}
+	}
 
 	if (is_kiq)
 		gfx_v8_0_kiq_enable(ring);
@@ -4913,9 +4935,9 @@ static int gfx_v8_0_kiq_resume(struct amdgpu_device *adev)
 
 	ring = &adev->gfx.kiq.ring;
 	if (!amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr)) {
-		memset((void *)ring->mqd_ptr, 0, sizeof(struct vi_mqd));
 		r = gfx_v8_0_kiq_init_queue(ring, ring->mqd_ptr, ring->mqd_gpu_addr);
 		amdgpu_bo_kunmap(ring->mqd_obj);
+		ring->mqd_ptr = NULL;
 		if (r)
 			return r;
 	} else {
@@ -4925,9 +4947,9 @@ static int gfx_v8_0_kiq_resume(struct amdgpu_device *adev)
 	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
 		ring = &adev->gfx.compute_ring[i];
 		if (!amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr)) {
-			memset((void *)ring->mqd_ptr, 0, sizeof(struct vi_mqd));
 			r = gfx_v8_0_kiq_init_queue(ring, ring->mqd_ptr, ring->mqd_gpu_addr);
 			amdgpu_bo_kunmap(ring->mqd_obj);
+			ring->mqd_ptr = NULL;
 			if (r)
 			return r;
 		} else {
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* 答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
       [not found]     ` <1486447878-20521-9-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07  6:26       ` Liu, Monk
       [not found]         ` <DM5PR12MB16109EC5F03088C1CFB58FE484430-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2017-02-07 15:45       ` Deucher, Alexander
  1 sibling, 1 reply; 51+ messages in thread
From: Liu, Monk @ 2017-02-07  6:26 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 7775 bytes --]

patches 1-8 are some fixes for the SRIOV gpu reset feature

patches 9-20 implement SRIOV gpu reset


BR Monk

________________________________
发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Monk Liu <Monk.Liu@amd.com>
发送时间: 2017年2月7日 14:11:07
收件人: amd-gfx@lists.freedesktop.org
抄送: Liu, Monk
主题: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 158 ++++++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1 +
 2 files changed, 158 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e926f84..2b404ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1604,6 +1604,53 @@ int amdgpu_suspend(struct amdgpu_device *adev)
         return 0;
 }

+static int amdgpu_resume_early(struct amdgpu_device *adev)
+{
+       int i, r;
+
+       for (i = 0; i < adev->num_ip_blocks; i++) {
+               if (!adev->ip_blocks[i].status.valid)
+                       continue;
+
+               if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
+                       r = adev->ip_blocks[i].version->funcs->resume(adev);
+
+               if (r) {
+                       DRM_ERROR("resume of IP block <%s> failed %d\n",
+                                 adev->ip_blocks[i].version->funcs->name, r);
+                       return r;
+               }
+       }
+
+       return 0;
+}
+
+static int amdgpu_resume_late(struct amdgpu_device *adev)
+{
+       int i, r;
+
+       for (i = 0; i < adev->num_ip_blocks; i++) {
+               if (!adev->ip_blocks[i].status.valid)
+                       continue;
+
+               if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH )
+                       continue;
+
+               r = adev->ip_blocks[i].version->funcs->resume(adev);
+               if (r) {
+                       DRM_ERROR("resume of IP block <%s> failed %d\n",
+                                 adev->ip_blocks[i].version->funcs->name, r);
+                       return r;
+               }
+       }
+
+       return 0;
+}
+
 static int amdgpu_resume(struct amdgpu_device *adev)
 {
         int i, r;
@@ -2343,6 +2390,115 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
 }

 /**
+ * amdgpu_sriov_gpu_reset - reset the asic
+ *
+ * @adev: amdgpu device pointer
+ * @voluntary: if this reset is requested by guest.
+ *             (true means by guest and false means by HYPERVISOR )
+ *
+ * Attempt the reset the GPU if it has hung (all asics).
+ * for SRIOV case.
+ * Returns 0 for success or an error on failure.
+ */
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
+{
+       int i, r = 0;
+       int resched;
+       struct amdgpu_bo *bo, *tmp;
+       struct amdgpu_ring *ring;
+       struct fence *fence = NULL, *next = NULL;
+
+       mutex_lock(&adev->virt.lock_reset);
+       atomic_inc(&adev->gpu_reset_counter);
+
+       /* block TTM */
+       resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+
+       /* block scheduler */
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               ring = adev->rings[i];
+
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               kthread_park(ring->sched.thread);
+               amd_sched_hw_job_reset(&ring->sched);
+       }
+
+       /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
+       amdgpu_fence_driver_force_completion(adev);
+
+       /* request to take full control of GPU before re-initialization  */
+       if (voluntary)
+               amdgpu_virt_reset_gpu(adev);
+       else
+               amdgpu_virt_request_full_gpu(adev, true);
+
+
+       /* Resume IP prior to SMC */
+       amdgpu_resume_early(adev);
+
+       /* we need recover gart prior to run SMC/CP/SDMA resume */
+       amdgpu_ttm_recover_gart(adev);
+
+       /* now we are okay to resume SMC/CP/SDMA */
+       amdgpu_resume_late(adev);
+
+       amdgpu_irq_gpu_reset_resume_helper(adev);
+
+       if (amdgpu_ib_ring_tests(adev))
+               dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
+
+       /* rellease full control of GPU after ib test */
+       amdgpu_virt_release_full_gpu(adev, true);
+
+       DRM_INFO("recover vram bo from shadow\n");
+
+       ring = adev->mman.buffer_funcs_ring;
+       mutex_lock(&adev->shadow_list_lock);
+       list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+               amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
+               if (fence) {
+                       r = fence_wait(fence, false);
+                       if (r) {
+                               WARN(r, "recovery from shadow isn't completed\n");
+                               break;
+                       }
+               }
+
+               fence_put(fence);
+               fence = next;
+       }
+       mutex_unlock(&adev->shadow_list_lock);
+
+       if (fence) {
+               r = fence_wait(fence, false);
+               if (r)
+                       WARN(r, "recovery from shadow isn't completed\n");
+       }
+       fence_put(fence);
+
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               struct amdgpu_ring *ring = adev->rings[i];
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               amd_sched_job_recovery(&ring->sched);
+               kthread_unpark(ring->sched.thread);
+       }
+
+       drm_helper_resume_force_mode(adev->ddev);
+       ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
+       if (r) {
+               /* bad news, how to tell it to userspace ? */
+               dev_info(adev->dev, "GPU reset failed\n");
+       }
+
+       mutex_unlock(&adev->virt.lock_reset);
+       return r;
+}
+
+/**
  * amdgpu_gpu_reset - reset the asic
  *
  * @adev: amdgpu device pointer
@@ -2358,7 +2514,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
         bool need_full_reset;

         if (amdgpu_sriov_vf(adev))
-               return 0;
+               return amdgpu_sriov_gpu_reset(adev, true);

         if (!amdgpu_check_soft_reset(adev)) {
                 DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 675e12c..73d24df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -89,5 +89,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary);

 #endif
--
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[-- Attachment #1.2: Type: text/html, Size: 17859 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* RE: [PATCH 02/20] drm/amdgpu:cg & pg are not applied on VF
       [not found]     ` <1486447878-20521-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 15:27       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:27 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 02/20] drm/amdgpu:cg & pg are not applied on VF
> 
> Change-Id: I93a4e97f1d24a289ab20c2a62371f3e303322587
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Please add a better patch description.  Something like, CG and PG are not controlled by the PF and are not applicable to the VFs.  With that,
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 9 +++++++++
>  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 6 ++++++
>  drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 6 ++++++
>  drivers/gpu/drm/amd/amdgpu/vi.c        | 6 ++++++
>  4 files changed, 27 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 0a75021..1e170ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -5833,6 +5833,9 @@ static int gfx_v8_0_set_powergating_state(void
> *handle,
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>  	bool enable = (state == AMD_PG_STATE_GATE) ? true : false;
> 
> +	if (amdgpu_sriov_vf(adev))
> +		return 0;
> +
>  	switch (adev->asic_type) {
>  	case CHIP_CARRIZO:
>  	case CHIP_STONEY:
> @@ -5890,6 +5893,9 @@ static void gfx_v8_0_get_clockgating_state(void
> *handle, u32 *flags)
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>  	int data;
> 
> +	if (amdgpu_sriov_vf(adev))
> +		*flags = 0;
> +
>  	/* AMD_CG_SUPPORT_GFX_MGCG */
>  	data = RREG32(mmRLC_CGTT_MGCG_OVERRIDE);
>  	if (!(data & RLC_CGTT_MGCG_OVERRIDE__CPF_MASK))
> @@ -6403,6 +6409,9 @@ static int gfx_v8_0_set_clockgating_state(void
> *handle,
>  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> 
> +	if (amdgpu_sriov_vf(adev))
> +		return 0;
> +
>  	switch (adev->asic_type) {
>  	case CHIP_FIJI:
>  	case CHIP_CARRIZO:
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> index 7669b32..22c52d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> @@ -1427,6 +1427,9 @@ static int gmc_v8_0_set_clockgating_state(void
> *handle,
>  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> 
> +	if (amdgpu_sriov_vf(adev))
> +		return 0;
> +
>  	switch (adev->asic_type) {
>  	case CHIP_FIJI:
>  		fiji_update_mc_medium_grain_clock_gating(adev,
> @@ -1451,6 +1454,9 @@ static void gmc_v8_0_get_clockgating_state(void
> *handle, u32 *flags)
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>  	int data;
> 
> +	if (amdgpu_sriov_vf(adev))
> +		*flags = 0;
> +
>  	/* AMD_CG_SUPPORT_MC_MGCG */
>  	data = RREG32(mmMC_HUB_MISC_HUB_CG);
>  	if (data & MC_HUB_MISC_HUB_CG__ENABLE_MASK)
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> index 25602c4..9394ca6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> @@ -1512,6 +1512,9 @@ static int sdma_v3_0_set_clockgating_state(void
> *handle,
>  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> 
> +	if (amdgpu_sriov_vf(adev))
> +		return 0;
> +
>  	switch (adev->asic_type) {
>  	case CHIP_FIJI:
>  	case CHIP_CARRIZO:
> @@ -1538,6 +1541,9 @@ static void sdma_v3_0_get_clockgating_state(void
> *handle, u32 *flags)
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>  	int data;
> 
> +	if (amdgpu_sriov_vf(adev))
> +		*flags = 0;
> +
>  	/* AMD_CG_SUPPORT_SDMA_MGCG */
>  	data = RREG32(mmSDMA0_CLK_CTRL + sdma_offsets[0]);
>  	if (!(data & SDMA0_CLK_CTRL__SOFT_OVERRIDE0_MASK))
> diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c
> b/drivers/gpu/drm/amd/amdgpu/vi.c
> index 89b0dfe..aeef3c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
> @@ -1391,6 +1391,9 @@ static int vi_common_set_clockgating_state(void
> *handle,
>  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> 
> +	if (amdgpu_sriov_vf(adev))
> +		return 0;
> +
>  	switch (adev->asic_type) {
>  	case CHIP_FIJI:
>  		vi_update_bif_medium_grain_light_sleep(adev,
> @@ -1435,6 +1438,9 @@ static void vi_common_get_clockgating_state(void
> *handle, u32 *flags)
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>  	int data;
> 
> +	if (amdgpu_sriov_vf(adev))
> +		*flags = 0;
> +
>  	/* AMD_CG_SUPPORT_BIF_LS */
>  	data = RREG32_PCIE(ixPCIE_CNTL2);
>  	if (data & PCIE_CNTL2__SLV_MEM_LS_EN_MASK)
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 01/20] drm/amdgpu:fix powerplay logic
       [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (18 preceding siblings ...)
  2017-02-07  6:11   ` [PATCH 20/20] drm/amdgpu:fix kiq_resume routine Monk Liu
@ 2017-02-07 15:27   ` Deucher, Alexander
  19 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:27 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 01/20] drm/amdgpu:fix powerplay logic
> 
> 1,like pp_hw_init, we shouldn't report error if PP disabled
> 2,disable pp_en if sriov
> 
> Change-Id: I6d259f9609f223998bea236f64676b9c22133e4e
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Acked-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c | 2 +-
>  drivers/gpu/drm/amd/powerplay/amd_powerplay.c | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c
> index 8856ecc..d56d200 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c
> @@ -43,7 +43,7 @@ static int amdgpu_create_pp_handle(struct
> amdgpu_device *adev)
>  	amd_pp = &(adev->powerplay);
>  	pp_init.chip_family = adev->family;
>  	pp_init.chip_id = adev->asic_type;
> -	pp_init.pm_en = amdgpu_dpm != 0 ? true : false;
> +	pp_init.pm_en = (amdgpu_dpm != 0 && !amdgpu_sriov_vf(adev)) ?
> true : false;
>  	pp_init.feature_mask = amdgpu_pp_feature_mask;
>  	pp_init.device = amdgpu_cgs_create_device(adev);
>  	ret = amd_powerplay_create(&pp_init, &(amd_pp->pp_handle));
> diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
> b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
> index 429f18b..e9cf207 100644
> --- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
> +++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
> @@ -286,7 +286,7 @@ static int pp_resume(void *handle)
>  	}
> 
>  	if (ret1 == PP_DPM_DISABLED)
> -		return ret1;
> +		return 0;
> 
>  	eventmgr = pp_handle->eventmgr;
> 
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 03/20] drm/damdgpu:add new mqd member in ring
       [not found]     ` <1486447878-20521-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 15:29       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:29 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 03/20] drm/damdgpu:add new mqd member in ring
> 
> Change-Id: If4dc6bb92d6a364125a568f37ea409e4c438e6a2
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Please add a better patch description.  With that fixed,
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 ++
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 2 ++
>  2 files changed, 4 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index c813cbe..0e57b04 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -161,6 +161,8 @@ struct amdgpu_ring {
>  	u32			pipe;
>  	u32			queue;
>  	struct amdgpu_bo	*mqd_obj;
> +	uint64_t                mqd_gpu_addr;
> +	struct vi_mqd           *mqd_ptr;
>  	u32			doorbell_index;
>  	bool			use_doorbell;
>  	unsigned		wptr_offs;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 1e170ab..22bd155 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -4588,6 +4588,8 @@ static void gfx_v8_0_cp_compute_fini(struct
> amdgpu_device *adev)
> 
>  			amdgpu_bo_unref(&ring->mqd_obj);
>  			ring->mqd_obj = NULL;
> +			ring->mqd_ptr = NULL;
> +			ring->mqd_gpu_addr = 0;
>  		}
>  	}
>  }
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 04/20] drm/amdgpu:imple mqd soft ini/fini
       [not found]     ` <1486447878-20521-4-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 15:30       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:30 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 04/20] drm/amdgpu:imple mqd soft ini/fini
> 
> Change-Id: I650a78c8d27f76997e1ef6e3934d0d7e043d4715
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Please add a better patch description.  With that fixed,
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 52
> +++++++++++++++++++++++++++++++++++
>  1 file changed, 52 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 22bd155..0e2c906 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -659,6 +659,8 @@ static u32 gfx_v8_0_get_csb_size(struct
> amdgpu_device *adev);
>  static void gfx_v8_0_get_cu_info(struct amdgpu_device *adev);
>  static void gfx_v8_0_ring_emit_ce_meta_init(struct amdgpu_ring *ring,
> uint64_t addr);
>  static void gfx_v8_0_ring_emit_de_meta_init(struct amdgpu_ring *ring,
> uint64_t addr);
> +static int gfx_v8_0_compute_mqd_soft_init(struct amdgpu_device *adev);
> +static void gfx_v8_0_compute_mqd_soft_fini(struct amdgpu_device
> *adev);
> 
>  static void gfx_v8_0_init_golden_registers(struct amdgpu_device *adev)
>  {
> @@ -7322,3 +7324,53 @@ static void
> gfx_v8_0_ring_emit_de_meta_init(struct amdgpu_ring *ring, uint64_t c
>  	amdgpu_ring_write(ring, upper_32_bits(de_payload_addr));
>  	amdgpu_ring_write_multiple(ring, (void *)&de_payload, cnt_de - 2);
>  }
> +
> +/* create MQD for each compute queue */
> +static int gfx_v8_0_compute_mqd_soft_init(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_ring *ring = NULL;
> +	int r, i;
> +
> +	/* create MQD for KIQ */
> +	ring = &adev->gfx.kiq.ring;
> +	if (!ring->mqd_obj) {
> +		r = amdgpu_bo_create_kernel(adev, sizeof(struct vi_mqd),
> PAGE_SIZE,
> +
> 	AMDGPU_GEM_DOMAIN_GTT, &ring->mqd_obj,
> +						&ring->mqd_gpu_addr, (void
> **)&ring->mqd_ptr);
> +		if (r) {
> +			dev_warn(adev->dev, "failed to create ring mqd ob
> (%d)", r);
> +			return r;
> +		}
> +	}
> +
> +	/* create MQD for each KCQ */
> +	for (i = 0; i < adev->gfx.num_compute_rings; i++)
> +	{
> +		ring = &adev->gfx.compute_ring[i];
> +		if (!ring->mqd_obj) {
> +			r = amdgpu_bo_create_kernel(adev, sizeof(struct
> vi_mqd), PAGE_SIZE,
> +
> 	AMDGPU_GEM_DOMAIN_GTT, &ring->mqd_obj,
> +							&ring-
> >mqd_gpu_addr, (void **)&ring->mqd_ptr);
> +			if (r) {
> +				dev_warn(adev->dev, "failed to create ring
> mqd ob (%d)", r);
> +				return r;
> +			}
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static void gfx_v8_0_compute_mqd_soft_fini(struct amdgpu_device
> *adev)
> +{
> +	struct amdgpu_ring *ring = NULL;
> +	int i;
> +
> +	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> +		ring = &adev->gfx.compute_ring[i];
> +		amdgpu_bo_free_kernel(&ring->mqd_obj, &ring-
> >mqd_gpu_addr, (void **)&ring->mqd_ptr);
> +	}
> +
> +	ring = &adev->gfx.kiq.ring;
> +	amdgpu_bo_free_kernel(&ring->mqd_obj, &ring->mqd_gpu_addr,
> (void **)&ring->mqd_ptr);
> +}
> \ No newline at end of file
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 05/20] drm/amdgpu:bo_free_kernel will set ptr to NULL if freed
       [not found]     ` <1486447878-20521-5-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 15:31       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:31 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 05/20] drm/amdgpu:bo_free_kernel will set ptr to NULL if
> freed
> 
> Change-Id: Iac592f1a6c927677008feabc1b7af6f18c580910
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 0e2c906..df1cfc5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -1477,7 +1477,6 @@ static void gfx_v8_0_kiq_fini(struct amdgpu_device
> *adev)
>  	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> 
>  	amdgpu_bo_free_kernel(&kiq->eop_obj, &kiq->eop_gpu_addr,
> NULL);
> -	kiq->eop_obj = NULL;
>  }
> 
>  static int gfx_v8_0_kiq_init(struct amdgpu_device *adev)
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 06/20] drm/amdgpu:no need use sriov judge
       [not found]     ` <1486447878-20521-6-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 15:33       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:33 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 06/20] drm/amdgpu:no need use sriov judge
> 
> Change-Id: I9717e200be8af36f52d6305e02ffea178044c851
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Please add a better patch description.  We ultimately want to re-use this for bare metal, so no need to have vf checks in the KIQ code itself since kiq itself is currently only used in VF cases.  With that fixed,
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 13 ++++---------
>  1 file changed, 4 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index df1cfc5..fd29124 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -1379,11 +1379,9 @@ static int gfx_v8_0_kiq_init_ring(struct
> amdgpu_device *adev,
>  {
>  	int r = 0;
> 
> -	if (amdgpu_sriov_vf(adev)) {
> -		r = amdgpu_wb_get(adev, &adev->virt.reg_val_offs);
> -		if (r)
> -			return r;
> -	}
> +	r = amdgpu_wb_get(adev, &adev->virt.reg_val_offs);
> +	if (r)
> +		return r;
> 
>  	ring->adev = NULL;
>  	ring->ring_obj = NULL;
> @@ -1407,13 +1405,10 @@ static int gfx_v8_0_kiq_init_ring(struct
> amdgpu_device *adev,
> 
>  	return r;
>  }
> -
>  static void gfx_v8_0_kiq_free_ring(struct amdgpu_ring *ring,
>  				   struct amdgpu_irq_src *irq)
>  {
> -	if (amdgpu_sriov_vf(ring->adev))
> -		amdgpu_wb_free(ring->adev, ring->adev-
> >virt.reg_val_offs);
> -
> +	amdgpu_wb_free(ring->adev, ring->adev->virt.reg_val_offs);
>  	amdgpu_ring_fini(ring);
>  	irq->data = NULL;
>  }
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 07/20] drm/amdgpu:minor cleanup
       [not found]     ` <1486447878-20521-7-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 15:34       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:34 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 07/20] drm/amdgpu:minor cleanup
> 
> Change-Id: Ia5ada3e9990261ca70b03655424e6290701cdb9d
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 5 +----
>  1 file changed, 1 insertion(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index fd29124..4029d32 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -4864,10 +4864,7 @@ static int gfx_v8_0_kiq_init_queue(struct
> amdgpu_ring *ring,
>  	struct amdgpu_device *adev = ring->adev;
>  	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>  	uint64_t eop_gpu_addr;
> -	bool is_kiq = false;
> -
> -	if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
> -		is_kiq = true;
> +	bool is_kiq = (ring->funcs->type == AMDGPU_RING_TYPE_KIQ);
> 
>  	if (is_kiq) {
>  		eop_gpu_addr = kiq->eop_gpu_addr;
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 08/20] drm/amdgpu:divide KCQ mqd init to sw and hw
       [not found]     ` <1486447878-20521-8-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 15:36       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:36 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 08/20] drm/amdgpu:divide KCQ mqd init to sw and hw
> 
> sw part only invoked once during sw_init.
> hw part invoked during first drv load and resume later.
> 
> that way we cannot alloc mqd in hw/resume, we only keep
> mqd allocted in sw_init routine.
> and hw_init routine only kmap and set it.
> 
> Change-Id: Ib0b788c71154e79819e8abb8daee9b9234a8eabb
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 107 +++++++++++++------------
> ---------
>  1 file changed, 42 insertions(+), 65 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 4029d32..6734e55 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -2116,17 +2116,6 @@ static int gfx_v8_0_sw_init(void *handle)
>  		return r;
>  	}
> 
> -	r = gfx_v8_0_kiq_init(adev);
> -	if (r) {
> -		DRM_ERROR("Failed to init KIQ BOs!\n");
> -		return r;
> -	}
> -
> -	kiq = &adev->gfx.kiq;
> -	r = gfx_v8_0_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
> -	if (r)
> -		return r;
> -
>  	/* set up the gfx ring */
>  	for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
>  		ring = &adev->gfx.gfx_ring[i];
> @@ -2169,6 +2158,24 @@ static int gfx_v8_0_sw_init(void *handle)
>  			return r;
>  	}
> 
> +	if (amdgpu_sriov_vf(adev)) {
> +		r = gfx_v8_0_kiq_init(adev);
> +		if (r) {
> +			DRM_ERROR("Failed to init KIQ BOs!\n");
> +			return r;
> +		}
> +
> +		kiq = &adev->gfx.kiq;
> +		r = gfx_v8_0_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
> +		if (r)
> +			return r;
> +
> +		/* create MQD for all compute queues as wel as KIQ for
> SRIOV case */
> +		r = gfx_v8_0_compute_mqd_soft_init(adev);
> +		if (r)
> +			return r;
> +	}
> +
>  	/* reserve GDS, GWS and OA resource for gfx */
>  	r = amdgpu_bo_create_kernel(adev, adev-
> >gds.mem.gfx_partition_size,
>  				    PAGE_SIZE,
> AMDGPU_GEM_DOMAIN_GDS,
> @@ -2210,9 +2217,13 @@ static int gfx_v8_0_sw_fini(void *handle)
>  		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
>  	for (i = 0; i < adev->gfx.num_compute_rings; i++)
>  		amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
> -	gfx_v8_0_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);
> 
> -	gfx_v8_0_kiq_fini(adev);
> +	if (amdgpu_sriov_vf(adev)) {
> +		gfx_v8_0_compute_mqd_soft_fini(adev);
> +		gfx_v8_0_kiq_free_ring(&adev->gfx.kiq.ring, &adev-
> >gfx.kiq.irq);
> +		gfx_v8_0_kiq_fini(adev);
> +	}
> +
>  	gfx_v8_0_mec_fini(adev);
>  	gfx_v8_0_rlc_fini(adev);
>  	gfx_v8_0_free_microcode(adev);
> @@ -4892,70 +4903,37 @@ static int gfx_v8_0_kiq_init_queue(struct
> amdgpu_ring *ring,
>  	return 0;
>  }
> 
> -static void gfx_v8_0_kiq_free_queue(struct amdgpu_device *adev)
> +static int gfx_v8_0_kiq_resume(struct amdgpu_device *adev)
>  {
>  	struct amdgpu_ring *ring = NULL;
> -	int i;
> +	int r = 0, i;
> 
> -	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> -		ring = &adev->gfx.compute_ring[i];
> -		amdgpu_bo_free_kernel(&ring->mqd_obj, NULL, NULL);
> -		ring->mqd_obj = NULL;
> -	}
> +	gfx_v8_0_cp_compute_enable(adev, true);
> 
>  	ring = &adev->gfx.kiq.ring;
> -	amdgpu_bo_free_kernel(&ring->mqd_obj, NULL, NULL);
> -	ring->mqd_obj = NULL;
> -}
> -
> -static int gfx_v8_0_kiq_setup_queue(struct amdgpu_device *adev,
> -				    struct amdgpu_ring *ring)
> -{
> -	struct vi_mqd *mqd;
> -	u64 mqd_gpu_addr;
> -	u32 *buf;
> -	int r = 0;
> -
> -	r = amdgpu_bo_create_kernel(adev, sizeof(struct vi_mqd),
> PAGE_SIZE,
> -				    AMDGPU_GEM_DOMAIN_GTT, &ring-
> >mqd_obj,
> -				    &mqd_gpu_addr, (void **)&buf);
> -	if (r) {
> -		dev_warn(adev->dev, "failed to create ring mqd ob (%d)", r);
> +	if (!amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr)) {
> +		memset((void *)ring->mqd_ptr, 0, sizeof(struct vi_mqd));
> +		r = gfx_v8_0_kiq_init_queue(ring, ring->mqd_ptr, ring-
> >mqd_gpu_addr);
> +		amdgpu_bo_kunmap(ring->mqd_obj);
> +		if (r)
> +			return r;
> +	} else {
>  		return r;
>  	}
> 
> -	/* init the mqd struct */
> -	memset(buf, 0, sizeof(struct vi_mqd));
> -	mqd = (struct vi_mqd *)buf;
> -
> -	r = gfx_v8_0_kiq_init_queue(ring, mqd, mqd_gpu_addr);
> -	if (r)
> -		return r;
> -
> -	amdgpu_bo_kunmap(ring->mqd_obj);
> -
> -	return 0;
> -}
> -
> -static int gfx_v8_0_kiq_resume(struct amdgpu_device *adev)
> -{
> -	struct amdgpu_ring *ring = NULL;
> -	int r, i;
> -
> -	ring = &adev->gfx.kiq.ring;
> -	r = gfx_v8_0_kiq_setup_queue(adev, ring);
> -	if (r)
> -		return r;
> -
>  	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>  		ring = &adev->gfx.compute_ring[i];
> -		r = gfx_v8_0_kiq_setup_queue(adev, ring);
> -		if (r)
> +		if (!amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring-
> >mqd_ptr)) {
> +			memset((void *)ring->mqd_ptr, 0, sizeof(struct
> vi_mqd));
> +			r = gfx_v8_0_kiq_init_queue(ring, ring->mqd_ptr,
> ring->mqd_gpu_addr);
> +			amdgpu_bo_kunmap(ring->mqd_obj);
> +			if (r)
> +			return r;
> +		} else {
>  			return r;
> +		}
>  	}
> 
> -	gfx_v8_0_cp_compute_enable(adev, true);
> -
>  	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>  		ring = &adev->gfx.compute_ring[i];
> 
> @@ -5316,7 +5294,6 @@ static int gfx_v8_0_hw_fini(void *handle)
>  	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
>  	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
>  	if (amdgpu_sriov_vf(adev)) {
> -		gfx_v8_0_kiq_free_queue(adev);
>  		pr_debug("For SRIOV client, shouldn't do anything.\n");
>  		return 0;
>  	}
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 10/20] drm/amdgpu:change kiq lock name
       [not found]     ` <1486447878-20521-10-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 15:38       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:38 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 10/20] drm/amdgpu:change kiq lock name
> 
> Change-Id: Ib11de11fb0a9e8086e542b932c9c62d5aa40ebb2
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 10 +++++-----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 +-
>  2 files changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 76ef641..82a70db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -98,7 +98,7 @@ void amdgpu_virt_init_setting(struct amdgpu_device
> *adev)
>  	adev->mode_info.num_crtc = 1;
>  	adev->enable_virtual_display = true;
> 
> -	mutex_init(&adev->virt.lock);
> +	mutex_init(&adev->virt.lock_kiq);
>  }
> 
>  uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
> @@ -111,14 +111,14 @@ uint32_t amdgpu_virt_kiq_rreg(struct
> amdgpu_device *adev, uint32_t reg)
> 
>  	BUG_ON(!ring->funcs->emit_rreg);
> 
> -	mutex_lock(&adev->virt.lock);
> +	mutex_lock(&adev->virt.lock_kiq);
>  	amdgpu_ring_alloc(ring, 32);
>  	amdgpu_ring_emit_hdp_flush(ring);
>  	amdgpu_ring_emit_rreg(ring, reg);
>  	amdgpu_ring_emit_hdp_invalidate(ring);
>  	amdgpu_fence_emit(ring, &f);
>  	amdgpu_ring_commit(ring);
> -	mutex_unlock(&adev->virt.lock);
> +	mutex_unlock(&adev->virt.lock_kiq);
> 
>  	r = fence_wait(f, false);
>  	if (r)
> @@ -139,14 +139,14 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
> *adev, uint32_t reg, uint32_t v)
> 
>  	BUG_ON(!ring->funcs->emit_wreg);
> 
> -	mutex_lock(&adev->virt.lock);
> +	mutex_lock(&adev->virt.lock_kiq);
>  	amdgpu_ring_alloc(ring, 32);
>  	amdgpu_ring_emit_hdp_flush(ring);
>  	amdgpu_ring_emit_wreg(ring, reg, v);
>  	amdgpu_ring_emit_hdp_invalidate(ring);
>  	amdgpu_fence_emit(ring, &f);
>  	amdgpu_ring_commit(ring);
> -	mutex_unlock(&adev->virt.lock);
> +	mutex_unlock(&adev->virt.lock_kiq);
> 
>  	r = fence_wait(f, false);
>  	if (r)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 73d24df..7020ff2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -46,7 +46,7 @@ struct amdgpu_virt {
>  	uint64_t			csa_vmid0_addr;
>  	bool chained_ib_support;
>  	uint32_t			reg_val_offs;
> -	struct mutex			lock;
> +	struct mutex			lock_kiq;
>  	struct amdgpu_irq_src		ack_irq;
>  	struct amdgpu_irq_src		rcv_irq;
>  	struct delayed_work		flr_work;
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 13/20] Refine handshake between guest and host by mailbox
       [not found]     ` <1486447878-20521-13-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 15:40       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:40 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Xue, Ken

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Xue, Ken
> Subject: [PATCH 13/20] Refine handshake between guest and host by
> mailbox
> 
> From: Ken Xue <Ken.Xue@amd.com>
> 
> Change-Id: If3a7d05824847234759b86563e8052949e171972
> Signed-off-by: Ken Xue <Ken.Xue@amd.com>

Not too familiar with the mailbox stuff.
Acked-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 26
> +++++++++++++++++++++++++-
>  1 file changed, 25 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index d2622b6..5fe4aad 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -318,10 +318,25 @@ void xgpu_vi_init_golden_registers(struct
> amdgpu_device *adev)
>  static void xgpu_vi_mailbox_send_ack(struct amdgpu_device *adev)
>  {
>  	u32 reg;
> +	int timeout = VI_MAILBOX_TIMEDOUT;
> +	u32 mask = REG_FIELD_MASK(MAILBOX_CONTROL,
> RCV_MSG_VALID);
> 
>  	reg = RREG32(mmMAILBOX_CONTROL);
>  	reg = REG_SET_FIELD(reg, MAILBOX_CONTROL, RCV_MSG_ACK, 1);
>  	WREG32(mmMAILBOX_CONTROL, reg);
> +
> +	/*Wait for RCV_MSG_VALID to be 0*/
> +	reg = RREG32(mmMAILBOX_CONTROL);
> +	while (reg & mask) {
> +		if (timeout <= 0) {
> +			pr_err("RCV_MSG_VALID is not cleared\n");
> +			break;
> +		}
> +		mdelay(1);
> +		timeout -=1;
> +
> +		reg = RREG32(mmMAILBOX_CONTROL);
> +	}
>  }
> 
>  static void xgpu_vi_mailbox_set_valid(struct amdgpu_device *adev, bool
> val)
> @@ -339,6 +354,8 @@ static void xgpu_vi_mailbox_trans_msg(struct
> amdgpu_device *adev,
>  {
>  	u32 reg;
> 
> +	xgpu_vi_mailbox_send_ack(adev);
> +
>  	reg = RREG32(mmMAILBOX_MSGBUF_TRN_DW0);
>  	reg = REG_SET_FIELD(reg, MAILBOX_MSGBUF_TRN_DW0,
>  			    MSGBUF_DATA, event);
> @@ -351,6 +368,11 @@ static int xgpu_vi_mailbox_rcv_msg(struct
> amdgpu_device *adev,
>  				   enum idh_event event)
>  {
>  	u32 reg;
> +	u32 mask = REG_FIELD_MASK(MAILBOX_CONTROL,
> RCV_MSG_VALID);
> +
> +	reg = RREG32(mmMAILBOX_CONTROL);
> +	if (!(reg & mask))
> +		return -ENOENT;
> 
>  	reg = RREG32(mmMAILBOX_MSGBUF_RCV_DW0);
>  	if (reg != event)
> @@ -419,7 +441,9 @@ static int xgpu_vi_send_access_requests(struct
> amdgpu_device *adev,
>  	xgpu_vi_mailbox_set_valid(adev, false);
> 
>  	/* start to check msg if request is idh_req_gpu_init_access */
> -	if (request == IDH_REQ_GPU_INIT_ACCESS) {
> +	if (request == IDH_REQ_GPU_INIT_ACCESS ||
> +		request == IDH_REQ_GPU_FINI_ACCESS ||
> +		request == IDH_REQ_GPU_RESET_ACCESS) {
>  		r = xgpu_vi_poll_msg(adev, IDH_READY_TO_ACCESS_GPU);
>  		if (r)
>  			return r;
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
       [not found]     ` <1486447878-20521-9-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-02-07  6:26       ` 答复: " Liu, Monk
@ 2017-02-07 15:45       ` Deucher, Alexander
  1 sibling, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:45 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
> 
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Please add a better patch description.  With that and the comments below addressed,
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 158
> ++++++++++++++++++++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1 +
>  2 files changed, 158 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e926f84..2b404ca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1604,6 +1604,53 @@ int amdgpu_suspend(struct amdgpu_device
> *adev)
>  	return 0;
>  }
> 
> +static int amdgpu_resume_early(struct amdgpu_device *adev)

Maybe call this something like amdgpu_sriov_resume_early() to avoid causing confusion with the normal suspend/resume path, unless you plan to use these on bare metal later.

> +{
> +	int i, r;
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (!adev->ip_blocks[i].status.valid)
> +			continue;
> +
> +		if (adev->ip_blocks[i].version->type ==
> AMD_IP_BLOCK_TYPE_COMMON ||
> +				adev->ip_blocks[i].version->type ==
> AMD_IP_BLOCK_TYPE_GMC ||
> +				adev->ip_blocks[i].version->type ==
> AMD_IP_BLOCK_TYPE_IH)
> +			r = adev->ip_blocks[i].version->funcs-
> >resume(adev);
> +
> +		if (r) {
> +			DRM_ERROR("resume of IP block <%s> failed %d\n",
> +				  adev->ip_blocks[i].version->funcs->name,
> r);
> +			return r;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int amdgpu_resume_late(struct amdgpu_device *adev)

Same comment here.

> +{
> +	int i, r;
> +
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (!adev->ip_blocks[i].status.valid)
> +			continue;
> +
> +		if (adev->ip_blocks[i].version->type ==
> AMD_IP_BLOCK_TYPE_COMMON ||
> +				adev->ip_blocks[i].version->type ==
> AMD_IP_BLOCK_TYPE_GMC ||
> +				adev->ip_blocks[i].version->type ==
> AMD_IP_BLOCK_TYPE_IH )
> +			continue;
> +
> +		r = adev->ip_blocks[i].version->funcs->resume(adev);
> +		if (r) {
> +			DRM_ERROR("resume of IP block <%s> failed %d\n",
> +				  adev->ip_blocks[i].version->funcs->name,
> r);
> +			return r;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int amdgpu_resume(struct amdgpu_device *adev)
>  {
>  	int i, r;
> @@ -2343,6 +2390,115 @@ static int
> amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
>  }
> 
>  /**
> + * amdgpu_sriov_gpu_reset - reset the asic
> + *
> + * @adev: amdgpu device pointer
> + * @voluntary: if this reset is requested by guest.
> + *             (true means by guest and false means by HYPERVISOR )
> + *
> + * Attempt the reset the GPU if it has hung (all asics).
> + * for SRIOV case.
> + * Returns 0 for success or an error on failure.
> + */
> +int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
> +{
> +	int i, r = 0;
> +	int resched;
> +	struct amdgpu_bo *bo, *tmp;
> +	struct amdgpu_ring *ring;
> +	struct fence *fence = NULL, *next = NULL;
> +
> +	mutex_lock(&adev->virt.lock_reset);
> +	atomic_inc(&adev->gpu_reset_counter);
> +
> +	/* block TTM */
> +	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
> +
> +	/* block scheduler */
> +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +		ring = adev->rings[i];
> +
> +		if (!ring || !ring->sched.thread)
> +			continue;
> +
> +		kthread_park(ring->sched.thread);
> +		amd_sched_hw_job_reset(&ring->sched);
> +	}
> +
> +	/* after all hw jobs are reset, hw fence is meaningless, so
> force_completion */
> +	amdgpu_fence_driver_force_completion(adev);
> +
> +	/* request to take full control of GPU before re-initialization  */
> +	if (voluntary)
> +		amdgpu_virt_reset_gpu(adev);
> +	else
> +		amdgpu_virt_request_full_gpu(adev, true);
> +
> +
> +	/* Resume IP prior to SMC */
> +	amdgpu_resume_early(adev);
> +
> +	/* we need recover gart prior to run SMC/CP/SDMA resume */
> +	amdgpu_ttm_recover_gart(adev);
> +
> +	/* now we are okay to resume SMC/CP/SDMA */
> +	amdgpu_resume_late(adev);
> +
> +	amdgpu_irq_gpu_reset_resume_helper(adev);
> +
> +	if (amdgpu_ib_ring_tests(adev))
> +		dev_err(adev->dev, "[GPU_RESET] ib ring test failed
> (%d).\n", r);
> +
> +	/* rellease full control of GPU after ib test */
> +	amdgpu_virt_release_full_gpu(adev, true);
> +
> +	DRM_INFO("recover vram bo from shadow\n");
> +
> +	ring = adev->mman.buffer_funcs_ring;
> +	mutex_lock(&adev->shadow_list_lock);
> +	list_for_each_entry_safe(bo, tmp, &adev->shadow_list,
> shadow_list) {
> +		amdgpu_recover_vram_from_shadow(adev, ring, bo,
> &next);
> +		if (fence) {
> +			r = fence_wait(fence, false);
> +			if (r) {
> +				WARN(r, "recovery from shadow isn't
> completed\n");
> +				break;
> +			}
> +		}
> +
> +		fence_put(fence);
> +		fence = next;
> +	}
> +	mutex_unlock(&adev->shadow_list_lock);
> +
> +	if (fence) {
> +		r = fence_wait(fence, false);
> +		if (r)
> +			WARN(r, "recovery from shadow isn't
> completed\n");
> +	}
> +	fence_put(fence);
> +
> +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +		struct amdgpu_ring *ring = adev->rings[i];
> +		if (!ring || !ring->sched.thread)
> +			continue;
> +
> +		amd_sched_job_recovery(&ring->sched);
> +		kthread_unpark(ring->sched.thread);
> +	}
> +
> +	drm_helper_resume_force_mode(adev->ddev);
> +	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev,
> resched);
> +	if (r) {
> +		/* bad news, how to tell it to userspace ? */
> +		dev_info(adev->dev, "GPU reset failed\n");
> +	}
> +
> +	mutex_unlock(&adev->virt.lock_reset);
> +	return r;
> +}
> +
> +/**
>   * amdgpu_gpu_reset - reset the asic
>   *
>   * @adev: amdgpu device pointer
> @@ -2358,7 +2514,7 @@ int amdgpu_gpu_reset(struct amdgpu_device
> *adev)
>  	bool need_full_reset;
> 
>  	if (amdgpu_sriov_vf(adev))
> -		return 0;
> +		return amdgpu_sriov_gpu_reset(adev, true);
> 
>  	if (!amdgpu_check_soft_reset(adev)) {
>  		DRM_INFO("No hardware hang detected. Did some blocks
> stall?\n");
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 675e12c..73d24df 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -89,5 +89,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
> *adev, uint32_t reg, uint32_t v);
>  int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
> +int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool
> voluntary);
> 
>  #endif
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 11/20] drm/amdgpu:add lock_reset for SRIOV
       [not found]     ` <1486447878-20521-11-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 15:47       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:47 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 11/20] drm/amdgpu:add lock_reset for SRIOV
> 
> this lock is used for sriov_gpu_reset, only get this mutex
> can run into sriov_gpu_reset.
> 
> we have two cases triggers gpu_reset for SRIOV:
> 1) we have submit timedout and trigger reset voluntarily
> 2) hypervisor found world switch hang and trigger flr and notify we to
>    do gpu reset.
> 
> both cases need take care and we need a mutex to protect the consistency
> of
> reset routine.
> 
> Change-Id: I37aabccfaef1cde32dc350062a519d32c9d51c02
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 +
>  2 files changed, 2 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 82a70db..ac035ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -99,6 +99,7 @@ void amdgpu_virt_init_setting(struct amdgpu_device
> *adev)
>  	adev->enable_virtual_display = true;
> 
>  	mutex_init(&adev->virt.lock_kiq);
> +	mutex_init(&adev->virt.lock_reset);
>  }
> 
>  uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 7020ff2..4b05568 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -47,6 +47,7 @@ struct amdgpu_virt {
>  	bool chained_ib_support;
>  	uint32_t			reg_val_offs;
>  	struct mutex			lock_kiq;
> +	struct mutex                    lock_reset;
>  	struct amdgpu_irq_src		ack_irq;
>  	struct amdgpu_irq_src		rcv_irq;
>  	struct delayed_work		flr_work;
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 12/20] drm/amdgpu:impl mm_r/weg_nokiq
       [not found]     ` <1486447878-20521-12-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 15:52       ` Deucher, Alexander
       [not found]         ` <BN6PR12MB1652FE39574BA6067AB64578F7430-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:52 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 12/20] drm/amdgpu:impl mm_r/weg_nokiq
> 
> some registers are pf&vf copy, and we can safely use
> mmio method to access them.
> 
> and some time we are forbid to use kiq to access register
> like in INTR context.
> 
> Change-Id: Ie6dc323dc86829a4a6ceb7073c269b106b534c4a
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 10 +++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 36
> ++++++++++++++++++++++++++++++
>  2 files changed, 46 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 402a895..5dd0615 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1509,6 +1509,11 @@ int amdgpu_device_init(struct amdgpu_device
> *adev,
>  void amdgpu_device_fini(struct amdgpu_device *adev);
>  int amdgpu_gpu_wait_for_idle(struct amdgpu_device *adev);
> 
> +uint32_t amdgpu_mm_rreg_nokiq(struct amdgpu_device *adev, uint32_t
> reg,
> +			bool always_indirect);
> +void amdgpu_mm_wreg_nokiq(struct amdgpu_device *adev, uint32_t reg,
> uint32_t v,
> +		    bool always_indirect);
> +
>  uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>  			bool always_indirect);
>  void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg,
> uint32_t v,
> @@ -1523,6 +1528,11 @@ bool amdgpu_device_has_dc_support(struct
> amdgpu_device *adev);
>  /*
>   * Registers read & write functions.
>   */
> +#define RREG32_nokiq(reg) amdgpu_mm_rreg_nokiq(adev, (reg), false)
> +#define RREG32_IDX_nokiq(reg) amdgpu_mm_rreg(adev, (reg), true)
> +#define WREG32_nokiq(reg, v) amdgpu_mm_wreg_nokiq(adev, (reg), (v),
> false)
> +#define WREG32_IDX_nokiq(reg, v) amdgpu_mm_wreg_nokiq(adev, (reg),
> (v), true)
> +

Can we call the macros something like RREG32_VF or RREG32_NO_KIQ?

>  #define RREG32(reg) amdgpu_mm_rreg(adev, (reg), false)
>  #define RREG32_IDX(reg) amdgpu_mm_rreg(adev, (reg), true)
>  #define DREG32(reg) printk(KERN_INFO "REGISTER: " #reg " : 0x%08X\n",
> amdgpu_mm_rreg(adev, (reg), false))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2b404ca..d5870d0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -136,6 +136,42 @@ void amdgpu_mm_wreg(struct amdgpu_device
> *adev, uint32_t reg, uint32_t v,
>  	}
>  }
> 
> +uint32_t amdgpu_mm_rreg_nokiq(struct amdgpu_device *adev, uint32_t
> reg,
> +			bool always_indirect)
> +{
> +	uint32_t ret;
> +
> +	if ((reg * 4) < adev->rmmio_size && !always_indirect)
> +		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
> +	else {
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
> +		writel((reg * 4), ((void __iomem *)adev->rmmio) +
> (mmMM_INDEX * 4));
> +		ret = readl(((void __iomem *)adev->rmmio) +
> (mmMM_DATA * 4));
> +		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
> +	}
> +	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
> +	return ret;
> +}
> +
> +void amdgpu_mm_wreg_nokiq(struct amdgpu_device *adev, uint32_t reg,
> uint32_t v,
> +		    bool always_indirect)
> +{
> +	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
> +
> +	if ((reg * 4) < adev->rmmio_size && !always_indirect)
> +		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
> +	else {
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
> +		writel((reg * 4), ((void __iomem *)adev->rmmio) +
> (mmMM_INDEX * 4));
> +		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA
> * 4));
> +		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
> +	}
> +}

These duplicate the regular mm_rreg and mm_wreg functions.  Maybe add a new parameter to the existing functions to bypass kiq in the sr-iov case?

> +
>  u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
>  {
>  	if ((reg * 4) < adev->rio_mem_size)
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 14/20] drm/amdgpu:use nokiq version mm access
       [not found]     ` <1486447878-20521-14-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 15:54       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:54 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 14/20] drm/amdgpu:use nokiq version mm access
> 
> Change-Id: I383d7ce858a136d7b112180f86e3d632d37b4d1c
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Please add a better patch description.  With that fixed and the comments from patch 12 addressed:
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 32 ++++++++++++++++--------
> --------
>  1 file changed, 16 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index 5fe4aad..4e9e0bb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -321,12 +321,12 @@ static void xgpu_vi_mailbox_send_ack(struct
> amdgpu_device *adev)
>  	int timeout = VI_MAILBOX_TIMEDOUT;
>  	u32 mask = REG_FIELD_MASK(MAILBOX_CONTROL,
> RCV_MSG_VALID);
> 
> -	reg = RREG32(mmMAILBOX_CONTROL);
> +	reg = RREG32_nokiq(mmMAILBOX_CONTROL);
>  	reg = REG_SET_FIELD(reg, MAILBOX_CONTROL, RCV_MSG_ACK, 1);
> -	WREG32(mmMAILBOX_CONTROL, reg);
> +	WREG32_nokiq(mmMAILBOX_CONTROL, reg);
> 
>  	/*Wait for RCV_MSG_VALID to be 0*/
> -	reg = RREG32(mmMAILBOX_CONTROL);
> +	reg = RREG32_nokiq(mmMAILBOX_CONTROL);
>  	while (reg & mask) {
>  		if (timeout <= 0) {
>  			pr_err("RCV_MSG_VALID is not cleared\n");
> @@ -335,7 +335,7 @@ static void xgpu_vi_mailbox_send_ack(struct
> amdgpu_device *adev)
>  		mdelay(1);
>  		timeout -=1;
> 
> -		reg = RREG32(mmMAILBOX_CONTROL);
> +		reg = RREG32_nokiq(mmMAILBOX_CONTROL);
>  	}
>  }
> 
> @@ -343,10 +343,10 @@ static void xgpu_vi_mailbox_set_valid(struct
> amdgpu_device *adev, bool val)
>  {
>  	u32 reg;
> 
> -	reg = RREG32(mmMAILBOX_CONTROL);
> +	reg = RREG32_nokiq(mmMAILBOX_CONTROL);
>  	reg = REG_SET_FIELD(reg, MAILBOX_CONTROL,
>  			    TRN_MSG_VALID, val ? 1 : 0);
> -	WREG32(mmMAILBOX_CONTROL, reg);
> +	WREG32_nokiq(mmMAILBOX_CONTROL, reg);
>  }
> 
>  static void xgpu_vi_mailbox_trans_msg(struct amdgpu_device *adev,
> @@ -356,10 +356,10 @@ static void xgpu_vi_mailbox_trans_msg(struct
> amdgpu_device *adev,
> 
>  	xgpu_vi_mailbox_send_ack(adev);
> 
> -	reg = RREG32(mmMAILBOX_MSGBUF_TRN_DW0);
> +	reg = RREG32_nokiq(mmMAILBOX_MSGBUF_TRN_DW0);
>  	reg = REG_SET_FIELD(reg, MAILBOX_MSGBUF_TRN_DW0,
>  			    MSGBUF_DATA, event);
> -	WREG32(mmMAILBOX_MSGBUF_TRN_DW0, reg);
> +	WREG32_nokiq(mmMAILBOX_MSGBUF_TRN_DW0, reg);
> 
>  	xgpu_vi_mailbox_set_valid(adev, true);
>  }
> @@ -370,11 +370,11 @@ static int xgpu_vi_mailbox_rcv_msg(struct
> amdgpu_device *adev,
>  	u32 reg;
>  	u32 mask = REG_FIELD_MASK(MAILBOX_CONTROL,
> RCV_MSG_VALID);
> 
> -	reg = RREG32(mmMAILBOX_CONTROL);
> +	reg = RREG32_nokiq(mmMAILBOX_CONTROL);
>  	if (!(reg & mask))
>  		return -ENOENT;
> 
> -	reg = RREG32(mmMAILBOX_MSGBUF_RCV_DW0);
> +	reg = RREG32_nokiq(mmMAILBOX_MSGBUF_RCV_DW0);
>  	if (reg != event)
>  		return -ENOENT;
> 
> @@ -390,7 +390,7 @@ static int xgpu_vi_poll_ack(struct amdgpu_device
> *adev)
>  	u32 mask = REG_FIELD_MASK(MAILBOX_CONTROL,
> TRN_MSG_ACK);
>  	u32 reg;
> 
> -	reg = RREG32(mmMAILBOX_CONTROL);
> +	reg = RREG32_nokiq(mmMAILBOX_CONTROL);
>  	while (!(reg & mask)) {
>  		if (timeout <= 0) {
>  			pr_err("Doesn't get ack from pf.\n");
> @@ -400,7 +400,7 @@ static int xgpu_vi_poll_ack(struct amdgpu_device
> *adev)
>  		msleep(1);
>  		timeout -= 1;
> 
> -		reg = RREG32(mmMAILBOX_CONTROL);
> +		reg = RREG32_nokiq(mmMAILBOX_CONTROL);
>  	}
> 
>  	return r;
> @@ -492,11 +492,11 @@ static int xgpu_vi_set_mailbox_ack_irq(struct
> amdgpu_device *adev,
>  				       unsigned type,
>  				       enum amdgpu_interrupt_state state)
>  {
> -	u32 tmp = RREG32(mmMAILBOX_INT_CNTL);
> +	u32 tmp = RREG32_nokiq(mmMAILBOX_INT_CNTL);
> 
>  	tmp = REG_SET_FIELD(tmp, MAILBOX_INT_CNTL, ACK_INT_EN,
>  			    (state == AMDGPU_IRQ_STATE_ENABLE) ? 1 : 0);
> -	WREG32(mmMAILBOX_INT_CNTL, tmp);
> +	WREG32_nokiq(mmMAILBOX_INT_CNTL, tmp);
> 
>  	return 0;
>  }
> @@ -521,11 +521,11 @@ static int xgpu_vi_set_mailbox_rcv_irq(struct
> amdgpu_device *adev,
>  				       unsigned type,
>  				       enum amdgpu_interrupt_state state)
>  {
> -	u32 tmp = RREG32(mmMAILBOX_INT_CNTL);
> +	u32 tmp = RREG32_nokiq(mmMAILBOX_INT_CNTL);
> 
>  	tmp = REG_SET_FIELD(tmp, MAILBOX_INT_CNTL, VALID_INT_EN,
>  			    (state == AMDGPU_IRQ_STATE_ENABLE) ? 1 : 0);
> -	WREG32(mmMAILBOX_INT_CNTL, tmp);
> +	WREG32_nokiq(mmMAILBOX_INT_CNTL, tmp);
> 
>  	return 0;
>  }
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 16/20] drm/amdgpu:RUNTIME flag should clr later
       [not found]     ` <1486447878-20521-16-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 15:56       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 15:56 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 16/20] drm/amdgpu:RUNTIME flag should clr later
> 
> this flag will get cleared by request gpu access
> 
> Change-Id: Ie484bb0141420055370e019dcd8c110fb34f8a1b
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index 53fa590c..64d2fd0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -507,9 +507,8 @@ static void xgpu_vi_mailbox_flr_work(struct
> work_struct *work)
>  	struct amdgpu_device *adev = container_of(virt, struct
> amdgpu_device, virt);
> 
>  	/* wait until RCV_MSG become 3 */
> -	if (!xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL))
> -		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
> -	else {
> +	if (xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL))
> +	{

Coding style.  { should be on the same line as the if.

>  		pr_err("failed to recieve FLR_CMPL\n");
>  		return;
>  	}
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 17/20] drm/amdgpu:new field is_load_stage introduced
       [not found]     ` <1486447878-20521-17-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 16:08       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 16:08 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 17/20] drm/amdgpu:new field is_load_stage introduced
> 
> use it to seperate first driver load and later reset/resume
> 
> Change-Id: I991e0da52ccd197716d279bf9014de46d39acfea
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
>  2 files changed, 3 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 5dd0615..bdb47f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1493,6 +1493,7 @@ struct amdgpu_device {
>  	/* link all gtt */
>  	spinlock_t			gtt_list_lock;
>  	struct list_head                gtt_list;
> +	bool	is_load_stage;

Since this is only used by gfx, please put it in the gfx structure.  Also, I think it makes more sense to reverse the logic and call it in_reset, and set/clear it in the sriov reset function.

> 
>  };
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index d5870d0..5be0481 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1800,6 +1800,7 @@ int amdgpu_device_init(struct amdgpu_device
> *adev,
>  	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
>  	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
>  	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
> +	adev->is_load_stage = true;
> 
> 
>  	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X
> 0x%04X:0x%04X 0x%02X).\n",
> @@ -2010,6 +2011,7 @@ int amdgpu_device_init(struct amdgpu_device
> *adev,
>  		goto failed;
>  	}
> 
> +	adev->is_load_stage = false;
>  	return 0;
> 
>  failed:
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 20/20] drm/amdgpu:fix kiq_resume routine
       [not found]     ` <1486447878-20521-20-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 16:09       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 16:09 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 20/20] drm/amdgpu:fix kiq_resume routine
> 
> use is_load_stage to fix compute ring test failure issue
> which occured after FLR/gpu_reset.
> 
> we need backup a clean status of MQD which was created in drv load
> stage, and use it in resume stage, otherwise KCQ and KIQ all may
> faild in ring/ib test.
> 
> Change-Id: I41be940454a6638e9a8a05f096601eaa1fbebaab
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Please see my comments on patch 17.

> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 44
> ++++++++++++++++++++++++++---------
>  1 file changed, 33 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 0ce00ff..4a641d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -4877,24 +4877,46 @@ static int gfx_v8_0_kiq_init_queue(struct
> amdgpu_ring *ring,
>  	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>  	uint64_t eop_gpu_addr;
>  	bool is_kiq = (ring->funcs->type == AMDGPU_RING_TYPE_KIQ);
> +	int mqd_idx = AMDGPU_MAX_COMPUTE_RINGS;
> 
>  	if (is_kiq) {
>  		eop_gpu_addr = kiq->eop_gpu_addr;
>  		gfx_v8_0_kiq_setting(&kiq->ring);
> -	} else
> +	} else {
>  		eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr +
>  					ring->queue * MEC_HPD_SIZE;
> +		mqd_idx = ring - &adev->gfx.compute_ring[0];
> +	}
> 
> -	mutex_lock(&adev->srbm_mutex);
> -	vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
> +	if (adev->is_load_stage) {
> +		memset((void *)mqd, 0, sizeof(*mqd));
> +		mutex_lock(&adev->srbm_mutex);
> +		vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
> +		gfx_v8_0_mqd_init(adev, mqd, mqd_gpu_addr,
> eop_gpu_addr, ring);
> +		if (is_kiq)
> +			gfx_v8_0_kiq_init_register(adev, mqd, ring);
> +		vi_srbm_select(adev, 0, 0, 0, 0);
> +		mutex_unlock(&adev->srbm_mutex);
> 
> -	gfx_v8_0_mqd_init(adev, mqd, mqd_gpu_addr, eop_gpu_addr,
> ring);
> +		if (adev->gfx.mec.mqd_backup[mqd_idx])
> +			memcpy(adev->gfx.mec.mqd_backup[mqd_idx],
> mqd, sizeof(*mqd));
> +	} else { /* for GPU_RESET case */
> +		/* reset MQD to a clean status */
> +		if (adev->gfx.mec.mqd_backup[mqd_idx])
> +			memcpy(mqd, adev-
> >gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
> 
> -	if (is_kiq)
> -		gfx_v8_0_kiq_init_register(adev, mqd, ring);
> -
> -	vi_srbm_select(adev, 0, 0, 0, 0);
> -	mutex_unlock(&adev->srbm_mutex);
> +		/* reset ring buffer */
> +		ring->wptr = 0;
> +		amdgpu_ring_clear_ring(ring);
> +
> +		if (is_kiq) {
> +		    mutex_lock(&adev->srbm_mutex);
> +		    vi_srbm_select(adev, ring->me, ring->pipe, ring->queue,
> 0);
> +		    gfx_v8_0_kiq_init_register(adev, mqd, ring);
> +		    vi_srbm_select(adev, 0, 0, 0, 0);
> +		    mutex_unlock(&adev->srbm_mutex);
> +		}
> +	}
> 
>  	if (is_kiq)
>  		gfx_v8_0_kiq_enable(ring);
> @@ -4913,9 +4935,9 @@ static int gfx_v8_0_kiq_resume(struct
> amdgpu_device *adev)
> 
>  	ring = &adev->gfx.kiq.ring;
>  	if (!amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr)) {
> -		memset((void *)ring->mqd_ptr, 0, sizeof(struct vi_mqd));
>  		r = gfx_v8_0_kiq_init_queue(ring, ring->mqd_ptr, ring-
> >mqd_gpu_addr);
>  		amdgpu_bo_kunmap(ring->mqd_obj);
> +		ring->mqd_ptr = NULL;
>  		if (r)
>  			return r;
>  	} else {
> @@ -4925,9 +4947,9 @@ static int gfx_v8_0_kiq_resume(struct
> amdgpu_device *adev)
>  	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>  		ring = &adev->gfx.compute_ring[i];
>  		if (!amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring-
> >mqd_ptr)) {
> -			memset((void *)ring->mqd_ptr, 0, sizeof(struct
> vi_mqd));
>  			r = gfx_v8_0_kiq_init_queue(ring, ring->mqd_ptr,
> ring->mqd_gpu_addr);
>  			amdgpu_bo_kunmap(ring->mqd_obj);
> +			ring->mqd_ptr = NULL;
>  			if (r)
>  			return r;
>  		} else {
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 19/20] drm/amdgpu:use nop to clear ring buffer
       [not found]     ` <1486447878-20521-19-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 16:10       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 16:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 19/20] drm/amdgpu:use nop to clear ring buffer
> 
> this is for a fine GPU reset/resume, which should
> use nop clear ringbuffer prior to kickoff engine.
> 
> and also use the same clear macro in ring_init.
> 
> Change-Id: I7693891fd4431d64c025d052f1dd0ba797f2f0b7
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

I'd suggest breaking this into two patches, one to add the new ring function and another to convert the IPs to use it.  With that fixed:
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 7 +++++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 1 +
>  drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c   | 1 +
>  4 files changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 7bacf3c..37d8422 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -230,7 +230,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev,
> struct amdgpu_ring *ring,
>  			dev_err(adev->dev, "(%d) ring create failed\n", r);
>  			return r;
>  		}
> -		memset((void *)ring->ring, 0, ring->ring_size);
> +		amdgpu_ring_clear_ring(ring);
>  	}
>  	ring->ptr_mask = (ring->ring_size / 4) - 1;
>  	ring->max_dw = max_dw;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 0e57b04..3fd4ce8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -186,5 +186,12 @@ int amdgpu_ring_init(struct amdgpu_device *adev,
> struct amdgpu_ring *ring,
>  		     unsigned ring_size, struct amdgpu_irq_src *irq_src,
>  		     unsigned irq_type);
>  void amdgpu_ring_fini(struct amdgpu_ring *ring);
> +static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
> +{
> +	int i = 0;
> +	while (i <= ring->ptr_mask)
> +		ring->ring[i++] = ring->funcs->nop;
> +
> +}
> 
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 5f688d4..0ce00ff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -4509,6 +4509,7 @@ static int gfx_v8_0_cp_gfx_resume(struct
> amdgpu_device *adev)
>  	}
> 
>  	/* start the ring */
> +	amdgpu_ring_clear_ring(ring);
>  	gfx_v8_0_cp_gfx_start(adev);
>  	ring->ready = true;
>  	r = amdgpu_ring_test_ring(ring);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> index 9394ca6..d5206f5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
> @@ -615,6 +615,7 @@ static int sdma_v3_0_gfx_resume(struct
> amdgpu_device *adev)
> 
>  	for (i = 0; i < adev->sdma.num_instances; i++) {
>  		ring = &adev->sdma.instance[i].ring;
> +		amdgpu_ring_clear_ring(ring);
>  		wb_offset = (ring->rptr_offs * 4);
> 
>  		mutex_lock(&adev->srbm_mutex);
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 15/20] drm/amdgpu:use work instead of delay-work
       [not found]     ` <1486447878-20521-15-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 16:10       ` Deucher, Alexander
       [not found]         ` <BN6PR12MB1652AA568136314BD28303BFF7430-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 16:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 15/20] drm/amdgpu:use work instead of delay-work
> 
> Change-Id: I41b6336baa00b1fd299311349402a17951b585a2
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Please add a better patch description.  Why is this change needed?


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 +-
>  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c    | 36 +++++++++++++++--------
> ---------
>  2 files changed, 18 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 4b05568..846f29c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -50,7 +50,7 @@ struct amdgpu_virt {
>  	struct mutex                    lock_reset;
>  	struct amdgpu_irq_src		ack_irq;
>  	struct amdgpu_irq_src		rcv_irq;
> -	struct delayed_work		flr_work;
> +	struct work_struct		flr_work;
>  	const struct amdgpu_virt_ops	*ops;
>  };
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index 4e9e0bb..53fa590c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -503,17 +503,19 @@ static int xgpu_vi_set_mailbox_ack_irq(struct
> amdgpu_device *adev,
> 
>  static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>  {
> -	struct amdgpu_virt *virt = container_of(work,
> -					struct amdgpu_virt, flr_work.work);
> -	struct amdgpu_device *adev = container_of(virt,
> -					struct amdgpu_device, virt);
> -	int r = 0;
> -
> -	r = xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL);
> -	if (r)
> -		DRM_ERROR("failed to get flr cmpl msg from hypervior.\n");
> +	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt,
> flr_work);
> +	struct amdgpu_device *adev = container_of(virt, struct
> amdgpu_device, virt);
> +
> +	/* wait until RCV_MSG become 3 */
> +	if (!xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL))
> +		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
> +	else {
> +		pr_err("failed to recieve FLR_CMPL\n");
> +		return;
> +	}
> 
> -	/* TODO: need to restore gfx states */
> +	/* Trigger recovery due to world switch failure */
> +	amdgpu_sriov_gpu_reset(adev, false);
>  }
> 
>  static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
> @@ -536,15 +538,12 @@ static int xgpu_vi_mailbox_rcv_irq(struct
> amdgpu_device *adev,
>  {
>  	int r;
> 
> -	adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
> +	/* see what event we get */
>  	r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
> -	/* do nothing for other msg */
> -	if (r)
> -		return 0;
> 
> -	/* TODO: need to save gfx states */
> -	schedule_delayed_work(&adev->virt.flr_work,
> -			      msecs_to_jiffies(VI_MAILBOX_RESET_TIME));
> +	/* only handle FLR_NOTIFY now */
> +	if (!r)
> +		schedule_work(&adev->virt.flr_work);
> 
>  	return 0;
>  }
> @@ -597,14 +596,13 @@ int xgpu_vi_mailbox_get_irq(struct
> amdgpu_device *adev)
>  		return r;
>  	}
> 
> -	INIT_DELAYED_WORK(&adev->virt.flr_work,
> xgpu_vi_mailbox_flr_work);
> +	INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
> 
>  	return 0;
>  }
> 
>  void xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
>  {
> -	cancel_delayed_work_sync(&adev->virt.flr_work);
>  	amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>  	amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
>  }
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 18/20] drm/amdgpu:alloc mqd backup
       [not found]     ` <1486447878-20521-18-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-02-07 16:11       ` Deucher, Alexander
  0 siblings, 0 replies; 51+ messages in thread
From: Deucher, Alexander @ 2017-02-07 16:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 18/20] drm/amdgpu:alloc mqd backup
> 
> Change-Id: I84f821faa657a5d942c33d30f206eb66b579c2f8
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Please add a better patch description.  E.g., this is required for restoring the mqds after a GPU reset.  With that fixed,
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 10 ++++++++++
>  2 files changed, 11 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index bdb47f7..a801fde 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -781,6 +781,7 @@ struct amdgpu_mec {
>  	u32 num_pipe;
>  	u32 num_mec;
>  	u32 num_queue;
> +	struct vi_mqd	*mqd_backup[AMDGPU_MAX_COMPUTE_RINGS +
> 1];
>  };
> 
>  struct amdgpu_kiq {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 6734e55..5f688d4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -7309,6 +7309,11 @@ static int gfx_v8_0_compute_mqd_soft_init(struct
> amdgpu_device *adev)
>  			dev_warn(adev->dev, "failed to create ring mqd ob
> (%d)", r);
>  			return r;
>  		}
> +
> +		/* prepare MQD backup */
> +		adev-
> >gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS] =
> kmalloc(sizeof(struct vi_mqd), GFP_KERNEL);
> +		if (!adev-
> >gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS])
> +				dev_warn(adev->dev, "no memory to create
> MQD backup for ring %s\n", ring->name);
>  	}
> 
>  	/* create MQD for each KCQ */
> @@ -7323,6 +7328,11 @@ static int gfx_v8_0_compute_mqd_soft_init(struct
> amdgpu_device *adev)
>  				dev_warn(adev->dev, "failed to create ring
> mqd ob (%d)", r);
>  				return r;
>  			}
> +
> +			/* prepare MQD backup */
> +			adev->gfx.mec.mqd_backup[i] =
> kmalloc(sizeof(struct vi_mqd), GFP_KERNEL);
> +			if (!adev->gfx.mec.mqd_backup[i])
> +				dev_warn(adev->dev, "no memory to create
> MQD backup for ring %s\n", ring->name);
>  		}
>  	}
> 
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 15/20] drm/amdgpu:use work instead of delay-work
       [not found]         ` <BN6PR12MB1652AA568136314BD28303BFF7430-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-02-08  6:28           ` Yu, Xiangliang
  2017-02-08  7:45           ` Liu, Monk
  1 sibling, 0 replies; 51+ messages in thread
From: Yu, Xiangliang @ 2017-02-08  6:28 UTC (permalink / raw)
  To: Deucher, Alexander, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Deucher, Alexander
> Sent: Wednesday, February 08, 2017 12:11 AM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk <Monk.Liu@amd.com>
> Subject: RE: [PATCH 15/20] drm/amdgpu:use work instead of delay-work
> 
> > -----Original Message-----
> > From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> > Of Monk Liu
> > Sent: Tuesday, February 07, 2017 1:11 AM
> > To: amd-gfx@lists.freedesktop.org
> > Cc: Liu, Monk
> > Subject: [PATCH 15/20] drm/amdgpu:use work instead of delay-work
> >
> > Change-Id: I41b6336baa00b1fd299311349402a17951b585a2
> > Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> 
> Please add a better patch description.  Why is this change needed?

My initial version started two work items to handle FLR events: one for the FLR notify irq and the other for the FLR completion irq, the latter scheduled as delayed work.

In Monk's version, there is only one work item to handle all FLR events. If so, I think that also makes sense.


> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 +-
> >  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c    | 36 +++++++++++++++------
> --
> > ---------
> >  2 files changed, 18 insertions(+), 20 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> > index 4b05568..846f29c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> > @@ -50,7 +50,7 @@ struct amdgpu_virt {
> >  	struct mutex                    lock_reset;
> >  	struct amdgpu_irq_src		ack_irq;
> >  	struct amdgpu_irq_src		rcv_irq;
> > -	struct delayed_work		flr_work;
> > +	struct work_struct		flr_work;
> >  	const struct amdgpu_virt_ops	*ops;
> >  };
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> > b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> > index 4e9e0bb..53fa590c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> > @@ -503,17 +503,19 @@ static int xgpu_vi_set_mailbox_ack_irq(struct
> > amdgpu_device *adev,
> >
> >  static void xgpu_vi_mailbox_flr_work(struct work_struct *work)  {
> > -	struct amdgpu_virt *virt = container_of(work,
> > -					struct amdgpu_virt, flr_work.work);
> > -	struct amdgpu_device *adev = container_of(virt,
> > -					struct amdgpu_device, virt);
> > -	int r = 0;
> > -
> > -	r = xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL);
> > -	if (r)
> > -		DRM_ERROR("failed to get flr cmpl msg from hypervior.\n");
> > +	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt,
> > flr_work);
> > +	struct amdgpu_device *adev = container_of(virt, struct
> > amdgpu_device, virt);
> > +
> > +	/* wait until RCV_MSG become 3 */
> > +	if (!xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL))
> > +		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
> > +	else {
> > +		pr_err("failed to recieve FLR_CMPL\n");
> > +		return;
> > +	}
> >
> > -	/* TODO: need to restore gfx states */
> > +	/* Trigger recovery due to world switch failure */
> > +	amdgpu_sriov_gpu_reset(adev, false);
> >  }
> >
> >  static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@
> > -536,15 +538,12 @@ static int xgpu_vi_mailbox_rcv_irq(struct
> > amdgpu_device *adev,  {
> >  	int r;
> >
> > -	adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
> > +	/* see what event we get */
> >  	r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
> > -	/* do nothing for other msg */
> > -	if (r)
> > -		return 0;
> >
> > -	/* TODO: need to save gfx states */
> > -	schedule_delayed_work(&adev->virt.flr_work,
> > -			      msecs_to_jiffies(VI_MAILBOX_RESET_TIME));
> > +	/* only handle FLR_NOTIFY now */
> > +	if (!r)
> > +		schedule_work(&adev->virt.flr_work);
> >
> >  	return 0;
> >  }
> > @@ -597,14 +596,13 @@ int xgpu_vi_mailbox_get_irq(struct
> amdgpu_device
> > *adev)
> >  		return r;
> >  	}
> >
> > -	INIT_DELAYED_WORK(&adev->virt.flr_work,
> > xgpu_vi_mailbox_flr_work);
> > +	INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
> >
> >  	return 0;
> >  }
> >
> >  void xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)  {
> > -	cancel_delayed_work_sync(&adev->virt.flr_work);
> >  	amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
> >  	amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);  }
> > --
> > 2.7.4
> >
> > _______________________________________________
> > amd-gfx mailing list
> > amd-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 12/20] drm/amdgpu:impl mm_r/weg_nokiq
       [not found]         ` <BN6PR12MB1652FE39574BA6067AB64578F7430-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-02-08  7:30           ` Yu, Xiangliang
  0 siblings, 0 replies; 51+ messages in thread
From: Yu, Xiangliang @ 2017-02-08  7:30 UTC (permalink / raw)
  To: Deucher, Alexander, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> > +uint32_t amdgpu_mm_rreg_nokiq(struct amdgpu_device *adev, uint32_t
> > reg,
> > +			bool always_indirect)
> > +{
> > +	uint32_t ret;
> > +
> > +	if ((reg * 4) < adev->rmmio_size && !always_indirect)
> > +		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
> > +	else {
> > +		unsigned long flags;
> > +
> > +		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
> > +		writel((reg * 4), ((void __iomem *)adev->rmmio) +
> > (mmMM_INDEX * 4));
> > +		ret = readl(((void __iomem *)adev->rmmio) +
> > (mmMM_DATA * 4));
> > +		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
> > +	}
> > +	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
> > +	return ret;
> > +}
> > +
> > +void amdgpu_mm_wreg_nokiq(struct amdgpu_device *adev, uint32_t
> reg,
> > uint32_t v,
> > +		    bool always_indirect)
> > +{
> > +	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
> > +
> > +	if ((reg * 4) < adev->rmmio_size && !always_indirect)
> > +		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
> > +	else {
> > +		unsigned long flags;
> > +
> > +		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
> > +		writel((reg * 4), ((void __iomem *)adev->rmmio) +
> > (mmMM_INDEX * 4));
> > +		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA
> > * 4));
> > +		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
> > +	}
> > +}
> 
> These are duplicated with the regular mm_rreg and mm_wreg functions.
> Maybe add a new parameter to the existing functions to bypass kiq in the sr-
> iov case?

Why not re-use the always_indirect flag to check for the kiq path? I think reading a register through kiq is also an indirect way to get the register value.

> 
> > +
> >  u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)  {
> >  	if ((reg * 4) < adev->rio_mem_size)
> > --
> > 2.7.4
> >
> > _______________________________________________
> > amd-gfx mailing list
> > amd-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH 15/20] drm/amdgpu:use work instead of delay-work
       [not found]         ` <BN6PR12MB1652AA568136314BD28303BFF7430-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2017-02-08  6:28           ` Yu, Xiangliang
@ 2017-02-08  7:45           ` Liu, Monk
  1 sibling, 0 replies; 51+ messages in thread
From: Liu, Monk @ 2017-02-08  7:45 UTC (permalink / raw)
  To: Deucher, Alexander, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Because in the first place we have no reason to use delayed work.
We used delayed work in the staging branch because Xiangliang's TDR/RESET feature was separated into two steps: part A and part B.
Part B must begin after part A, so Xiangliang used delayed work to schedule part B.
(This design has a negative side: if part A has not finished yet, running part B may not be good, so I dropped this design and came up with the one you are reviewing now.)

But my TDR feature is one overall whole, so there is no need to use delayed work at all.

BR Monk

-----Original Message-----
From: Deucher, Alexander 
Sent: Wednesday, February 08, 2017 12:11 AM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Liu, Monk <Monk.Liu@amd.com>
Subject: RE: [PATCH 15/20] drm/amdgpu:use work instead of delay-work

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf 
> Of Monk Liu
> Sent: Tuesday, February 07, 2017 1:11 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 15/20] drm/amdgpu:use work instead of delay-work
> 
> Change-Id: I41b6336baa00b1fd299311349402a17951b585a2
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Please add a better patch description.  Why is this change needed?


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 +-
>  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c    | 36 +++++++++++++++--------
> ---------
>  2 files changed, 18 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 4b05568..846f29c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -50,7 +50,7 @@ struct amdgpu_virt {
>  	struct mutex                    lock_reset;
>  	struct amdgpu_irq_src		ack_irq;
>  	struct amdgpu_irq_src		rcv_irq;
> -	struct delayed_work		flr_work;
> +	struct work_struct		flr_work;
>  	const struct amdgpu_virt_ops	*ops;
>  };
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index 4e9e0bb..53fa590c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -503,17 +503,19 @@ static int xgpu_vi_set_mailbox_ack_irq(struct
> amdgpu_device *adev,
> 
>  static void xgpu_vi_mailbox_flr_work(struct work_struct *work)  {
> -	struct amdgpu_virt *virt = container_of(work,
> -					struct amdgpu_virt, flr_work.work);
> -	struct amdgpu_device *adev = container_of(virt,
> -					struct amdgpu_device, virt);
> -	int r = 0;
> -
> -	r = xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL);
> -	if (r)
> -		DRM_ERROR("failed to get flr cmpl msg from hypervior.\n");
> +	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt,
> flr_work);
> +	struct amdgpu_device *adev = container_of(virt, struct
> amdgpu_device, virt);
> +
> +	/* wait until RCV_MSG become 3 */
> +	if (!xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL))
> +		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
> +	else {
> +		pr_err("failed to recieve FLR_CMPL\n");
> +		return;
> +	}
> 
> -	/* TODO: need to restore gfx states */
> +	/* Trigger recovery due to world switch failure */
> +	amdgpu_sriov_gpu_reset(adev, false);
>  }
> 
>  static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@ 
> -536,15 +538,12 @@ static int xgpu_vi_mailbox_rcv_irq(struct 
> amdgpu_device *adev,  {
>  	int r;
> 
> -	adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
> +	/* see what event we get */
>  	r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
> -	/* do nothing for other msg */
> -	if (r)
> -		return 0;
> 
> -	/* TODO: need to save gfx states */
> -	schedule_delayed_work(&adev->virt.flr_work,
> -			      msecs_to_jiffies(VI_MAILBOX_RESET_TIME));
> +	/* only handle FLR_NOTIFY now */
> +	if (!r)
> +		schedule_work(&adev->virt.flr_work);
> 
>  	return 0;
>  }
> @@ -597,14 +596,13 @@ int xgpu_vi_mailbox_get_irq(struct amdgpu_device 
> *adev)
>  		return r;
>  	}
> 
> -	INIT_DELAYED_WORK(&adev->virt.flr_work,
> xgpu_vi_mailbox_flr_work);
> +	INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
> 
>  	return 0;
>  }
> 
>  void xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)  {
> -	cancel_delayed_work_sync(&adev->virt.flr_work);
>  	amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
>  	amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);  }
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
       [not found]         ` <DM5PR12MB16109EC5F03088C1CFB58FE484430-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-02-08 10:49           ` Christian König
       [not found]             ` <5dfe222d-9564-0835-f749-3ea5ef78c701-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 51+ messages in thread
From: Christian König @ 2017-02-08 10:49 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 8716 bytes --]

> +       /* now we are okay to resume SMC/CP/SDMA */
> +       amdgpu_resume_late(adev);
As I wrote in the other thread as well calling amdgpu_resume() without 
proper suspend will just mess up a whole bunch of internal structures.

So a clear NAK on that approach. If you don't need the hw stop which 
amdgpu_suspend() does for SRIOV then please try to just use the 
hw_init() callback and not the resume() callback.

Regards,
Christian.

Am 07.02.2017 um 07:26 schrieb Liu, Monk:
>
> patch 1-8 are some fixes for sriov gpu reset feature
>
> patch 9 -20 are for sriov gpu reset
>
>
> BR Monk
>
> ------------------------------------------------------------------------
> *发件人:* amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> 代表 Monk Liu 
> <Monk.Liu-5C7GfCeVMHo@public.gmane.org>
> *发送时间:* 2017年2月7日 14:11:07
> *收件人:* amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> *抄送:* Liu, Monk
> *主题:* [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
> Signed-off-by: Monk Liu <Monk.Liu-5C7GfCeVMHo@public.gmane.org>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 158 
> ++++++++++++++++++++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1 +
>  2 files changed, 158 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e926f84..2b404ca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1604,6 +1604,53 @@ int amdgpu_suspend(struct amdgpu_device *adev)
>          return 0;
>  }
>
> +static int amdgpu_resume_early(struct amdgpu_device *adev)
> +{
> +       int i, r;
> +
> +       for (i = 0; i < adev->num_ip_blocks; i++) {
> +               if (!adev->ip_blocks[i].status.valid)
> +                       continue;
> +
> +               if (adev->ip_blocks[i].version->type == 
> AMD_IP_BLOCK_TYPE_COMMON ||
> + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
> +                       r = 
> adev->ip_blocks[i].version->funcs->resume(adev);
> +
> +               if (r) {
> +                       DRM_ERROR("resume of IP block <%s> failed %d\n",
> + adev->ip_blocks[i].version->funcs->name, r);
> +                       return r;
> +               }
> +       }
> +
> +       return 0;
> +}
> +
> +static int amdgpu_resume_late(struct amdgpu_device *adev)
> +{
> +       int i, r;
> +
> +       for (i = 0; i < adev->num_ip_blocks; i++) {
> +               if (!adev->ip_blocks[i].status.valid)
> +                       continue;
> +
> +               if (adev->ip_blocks[i].version->type == 
> AMD_IP_BLOCK_TYPE_COMMON ||
> + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH )
> +                       continue;
> +
> +               r = adev->ip_blocks[i].version->funcs->resume(adev);
> +               if (r) {
> +                       DRM_ERROR("resume of IP block <%s> failed %d\n",
> + adev->ip_blocks[i].version->funcs->name, r);
> +                       return r;
> +               }
> +       }
> +
> +       return 0;
> +}
> +
>  static int amdgpu_resume(struct amdgpu_device *adev)
>  {
>          int i, r;
> @@ -2343,6 +2390,115 @@ static int 
> amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
>  }
>
>  /**
> + * amdgpu_sriov_gpu_reset - reset the asic
> + *
> + * @adev: amdgpu device pointer
> + * @voluntary: if this reset is requested by guest.
> + *             (true means by guest and false means by HYPERVISOR )
> + *
> + * Attempt the reset the GPU if it has hung (all asics).
> + * for SRIOV case.
> + * Returns 0 for success or an error on failure.
> + */
> +int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
> +{
> +       int i, r = 0;
> +       int resched;
> +       struct amdgpu_bo *bo, *tmp;
> +       struct amdgpu_ring *ring;
> +       struct fence *fence = NULL, *next = NULL;
> +
> +       mutex_lock(&adev->virt.lock_reset);
> +       atomic_inc(&adev->gpu_reset_counter);
> +
> +       /* block TTM */
> +       resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
> +
> +       /* block scheduler */
> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +               ring = adev->rings[i];
> +
> +               if (!ring || !ring->sched.thread)
> +                       continue;
> +
> +               kthread_park(ring->sched.thread);
> +               amd_sched_hw_job_reset(&ring->sched);
> +       }
> +
> +       /* after all hw jobs are reset, hw fence is meaningless, so 
> force_completion */
> +       amdgpu_fence_driver_force_completion(adev);
> +
> +       /* request to take full control of GPU before 
> re-initialization  */
> +       if (voluntary)
> +               amdgpu_virt_reset_gpu(adev);
> +       else
> +               amdgpu_virt_request_full_gpu(adev, true);
> +
> +
> +       /* Resume IP prior to SMC */
> +       amdgpu_resume_early(adev);
> +
> +       /* we need recover gart prior to run SMC/CP/SDMA resume */
> +       amdgpu_ttm_recover_gart(adev);
> +
> +       /* now we are okay to resume SMC/CP/SDMA */
> +       amdgpu_resume_late(adev);


> +
> +       amdgpu_irq_gpu_reset_resume_helper(adev);
> +
> +       if (amdgpu_ib_ring_tests(adev))
> +               dev_err(adev->dev, "[GPU_RESET] ib ring test failed 
> (%d).\n", r);
> +
> +       /* rellease full control of GPU after ib test */
> +       amdgpu_virt_release_full_gpu(adev, true);
> +
> +       DRM_INFO("recover vram bo from shadow\n");
> +
> +       ring = adev->mman.buffer_funcs_ring;
> +       mutex_lock(&adev->shadow_list_lock);
> +       list_for_each_entry_safe(bo, tmp, &adev->shadow_list, 
> shadow_list) {
> +               amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
> +               if (fence) {
> +                       r = fence_wait(fence, false);
> +                       if (r) {
> +                               WARN(r, "recovery from shadow isn't 
> completed\n");
> +                               break;
> +                       }
> +               }
> +
> +               fence_put(fence);
> +               fence = next;
> +       }
> +       mutex_unlock(&adev->shadow_list_lock);
> +
> +       if (fence) {
> +               r = fence_wait(fence, false);
> +               if (r)
> +                       WARN(r, "recovery from shadow isn't completed\n");
> +       }
> +       fence_put(fence);
> +
> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +               struct amdgpu_ring *ring = adev->rings[i];
> +               if (!ring || !ring->sched.thread)
> +                       continue;
> +
> +               amd_sched_job_recovery(&ring->sched);
> +               kthread_unpark(ring->sched.thread);
> +       }
> +
> +       drm_helper_resume_force_mode(adev->ddev);
> + ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
> +       if (r) {
> +               /* bad news, how to tell it to userspace ? */
> +               dev_info(adev->dev, "GPU reset failed\n");
> +       }
> +
> +       mutex_unlock(&adev->virt.lock_reset);
> +       return r;
> +}
> +
> +/**
>   * amdgpu_gpu_reset - reset the asic
>   *
>   * @adev: amdgpu device pointer
> @@ -2358,7 +2514,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>          bool need_full_reset;
>
>          if (amdgpu_sriov_vf(adev))
> -               return 0;
> +               return amdgpu_sriov_gpu_reset(adev, true);
>
>          if (!amdgpu_check_soft_reset(adev)) {
>                  DRM_INFO("No hardware hang detected. Did some blocks 
> stall?\n");
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 675e12c..73d24df 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -89,5 +89,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
> *adev, uint32_t reg, uint32_t v);
>  int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
> +int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary);
>
>  #endif
> -- 
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #1.2: Type: text/html, Size: 16986 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* 答复: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
       [not found]             ` <5dfe222d-9564-0835-f749-3ea5ef78c701-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-02-08 14:57               ` Liu, Monk
       [not found]                 ` <DM5PR12MB1610E1ADF3A9172B1A418C7184420-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 51+ messages in thread
From: Liu, Monk @ 2017-02-08 14:57 UTC (permalink / raw)
  To: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 9390 bytes --]

》As I wrote in the other thread as well calling amdgpu_resume() without proper suspend will just mess up a whole bunch of internal structures.

Please name at least one; I'll check and see how to improve it.

And like I said, this approach is correct and verified by the hang test; if the internal structures were messed up, I don't think the test would pass so easily. hw_init() just calls resume per engine.

You can take a deep look into sriov_gpu_reset and judge later.



________________________________
发件人: Christian König <deathsimple@vodafone.de>
发送时间: 2017年2月8日 18:49:57
收件人: Liu, Monk; amd-gfx@lists.freedesktop.org
主题: Re: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset

+       /* now we are okay to resume SMC/CP/SDMA */
+       amdgpu_resume_late(adev);
As I wrote in the other thread as well calling amdgpu_resume() without proper suspend will just mess up a whole bunch of internal structures.

So a clear NAK on that approach. If you don't need the hw stop which amdgpu_suspend() does for SRIOV then please try to just use the hw_init() callback and not the resume() callback.

Regards,
Christian.

Am 07.02.2017 um 07:26 schrieb Liu, Monk:

patch 1-8 are some fixes for sriov gpu reset feature

patch 9 -20 are for sriov gpu reset


BR Monk

________________________________
发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org><mailto:amd-gfx-bounces@lists.freedesktop.org> 代表 Monk Liu <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>
发送时间: 2017年2月7日 14:11:07
收件人: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
抄送: Liu, Monk
主题: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset

Signed-off-by: Monk Liu <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 158 ++++++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1 +
 2 files changed, 158 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e926f84..2b404ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1604,6 +1604,53 @@ int amdgpu_suspend(struct amdgpu_device *adev)
         return 0;
 }

+static int amdgpu_resume_early(struct amdgpu_device *adev)
+{
+       int i, r;
+
+       for (i = 0; i < adev->num_ip_blocks; i++) {
+               if (!adev->ip_blocks[i].status.valid)
+                       continue;
+
+               if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
+                       r = adev->ip_blocks[i].version->funcs->resume(adev);
+
+               if (r) {
+                       DRM_ERROR("resume of IP block <%s> failed %d\n",
+                                 adev->ip_blocks[i].version->funcs->name, r);
+                       return r;
+               }
+       }
+
+       return 0;
+}
+
+static int amdgpu_resume_late(struct amdgpu_device *adev)
+{
+       int i, r;
+
+       for (i = 0; i < adev->num_ip_blocks; i++) {
+               if (!adev->ip_blocks[i].status.valid)
+                       continue;
+
+               if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH )
+                       continue;
+
+               r = adev->ip_blocks[i].version->funcs->resume(adev);
+               if (r) {
+                       DRM_ERROR("resume of IP block <%s> failed %d\n",
+                                 adev->ip_blocks[i].version->funcs->name, r);
+                       return r;
+               }
+       }
+
+       return 0;
+}
+
 static int amdgpu_resume(struct amdgpu_device *adev)
 {
         int i, r;
@@ -2343,6 +2390,115 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
 }

 /**
+ * amdgpu_sriov_gpu_reset - reset the asic
+ *
+ * @adev: amdgpu device pointer
+ * @voluntary: if this reset is requested by guest.
+ *             (true means by guest and false means by HYPERVISOR )
+ *
+ * Attempt the reset the GPU if it has hung (all asics).
+ * for SRIOV case.
+ * Returns 0 for success or an error on failure.
+ */
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
+{
+       int i, r = 0;
+       int resched;
+       struct amdgpu_bo *bo, *tmp;
+       struct amdgpu_ring *ring;
+       struct fence *fence = NULL, *next = NULL;
+
+       mutex_lock(&adev->virt.lock_reset);
+       atomic_inc(&adev->gpu_reset_counter);
+
+       /* block TTM */
+       resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+
+       /* block scheduler */
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               ring = adev->rings[i];
+
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               kthread_park(ring->sched.thread);
+               amd_sched_hw_job_reset(&ring->sched);
+       }
+
+       /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
+       amdgpu_fence_driver_force_completion(adev);
+
+       /* request to take full control of GPU before re-initialization  */
+       if (voluntary)
+               amdgpu_virt_reset_gpu(adev);
+       else
+               amdgpu_virt_request_full_gpu(adev, true);
+
+
+       /* Resume IP prior to SMC */
+       amdgpu_resume_early(adev);
+
+       /* we need recover gart prior to run SMC/CP/SDMA resume */
+       amdgpu_ttm_recover_gart(adev);
+
+       /* now we are okay to resume SMC/CP/SDMA */
+       amdgpu_resume_late(adev);


+
+       amdgpu_irq_gpu_reset_resume_helper(adev);
+
+       if (amdgpu_ib_ring_tests(adev))
+               dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
+
+       /* rellease full control of GPU after ib test */
+       amdgpu_virt_release_full_gpu(adev, true);
+
+       DRM_INFO("recover vram bo from shadow\n");
+
+       ring = adev->mman.buffer_funcs_ring;
+       mutex_lock(&adev->shadow_list_lock);
+       list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+               amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
+               if (fence) {
+                       r = fence_wait(fence, false);
+                       if (r) {
+                               WARN(r, "recovery from shadow isn't completed\n");
+                               break;
+                       }
+               }
+
+               fence_put(fence);
+               fence = next;
+       }
+       mutex_unlock(&adev->shadow_list_lock);
+
+       if (fence) {
+               r = fence_wait(fence, false);
+               if (r)
+                       WARN(r, "recovery from shadow isn't completed\n");
+       }
+       fence_put(fence);
+
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               struct amdgpu_ring *ring = adev->rings[i];
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               amd_sched_job_recovery(&ring->sched);
+               kthread_unpark(ring->sched.thread);
+       }
+
+       drm_helper_resume_force_mode(adev->ddev);
+       ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
+       if (r) {
+               /* bad news, how to tell it to userspace ? */
+               dev_info(adev->dev, "GPU reset failed\n");
+       }
+
+       mutex_unlock(&adev->virt.lock_reset);
+       return r;
+}
+
+/**
  * amdgpu_gpu_reset - reset the asic
  *
  * @adev: amdgpu device pointer
@@ -2358,7 +2514,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
         bool need_full_reset;

         if (amdgpu_sriov_vf(adev))
-               return 0;
+               return amdgpu_sriov_gpu_reset(adev, true);

         if (!amdgpu_check_soft_reset(adev)) {
                 DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 675e12c..73d24df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -89,5 +89,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary);

 #endif
--
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx



_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #1.2: Type: text/html, Size: 21209 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: 答复: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
       [not found]                 ` <DM5PR12MB1610E1ADF3A9172B1A418C7184420-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-02-08 15:13                   ` Christian König
       [not found]                     ` <afe8b046-c0ba-586e-99c3-adef380cfd3f-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 51+ messages in thread
From: Christian König @ 2017-02-08 15:13 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 10559 bytes --]

> and like I said, this approach is correct and verified by hang test
Completely irrelevant.

Please try the following:
1. Trigger a hang
2. Reset the GPU
3. Suspend the VM with glxgears running
4. Resume the VM

I'm pretty sure that with your approach either suspending or 
resuming the VM with an application running will just hang.

Anyway, even if all the code paths you break here (UVD/VCE at minimum) are 
disabled in the SRIOV case, it's still not a good idea to completely 
break the design just to satisfy the SRIOV feature.

So that is still a clear NAK on that. Please do as I told you and use 
the hw_init() callback instead, it is especially made for this use case.

Regards,
Christian.

Am 08.02.2017 um 15:57 schrieb Liu, Monk:
>
> 》As I wrote in the other thread as well calling amdgpu_resume() 
> without proper suspend will just mess up a whole bunch of internal 
> structures.
>
>
> please name at least one, I'll check and see how to improve
>
> and like I said, this approach is correct and verified by hang test, 
> if internal structures messed up I don't think the test will easy 
> pass. hw_init() just call resume per engine.
>
> you can take a deep look into sriov_gpu_reset and judge later
>
>
> ------------------------------------------------------------------------
> *发件人:* Christian König <deathsimple-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
> *发送时间:* 2017年2月8日 18:49:57
> *收件人:* Liu, Monk; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> *主题:* Re: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV 
> gpu_reset
>> +       /* now we are okay to resume SMC/CP/SDMA */
>> +       amdgpu_resume_late(adev);
> As I wrote in the other thread as well calling amdgpu_resume() without 
> proper suspend will just mess up a whole bunch of internal structures.
>
> So a clear NAK on that approach. If you don't need the hw stop which 
> amdgpu_suspend() does for SRIOV then please try to just use the 
> hw_init() callback and not the resume() callback.
>
> Regards,
> Christian.
>
> Am 07.02.2017 um 07:26 schrieb Liu, Monk:
>>
>> patch 1-8 are some fixes for sriov gpu reset feature
>>
>> patch 9 -20 are for sriov gpu reset
>>
>>
>> BR Monk
>>
>> ------------------------------------------------------------------------
>> *发件人:* amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> 代表 Monk Liu 
>> <Monk.Liu-5C7GfCeVMHo@public.gmane.org>
>> *发送时间:* 2017年2月7日 14:11:07
>> *收件人:* amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> *抄送:* Liu, Monk
>> *主题:* [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
>> Signed-off-by: Monk Liu <Monk.Liu-5C7GfCeVMHo@public.gmane.org>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 158 
>> ++++++++++++++++++++++++++++-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1 +
>>  2 files changed, 158 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index e926f84..2b404ca 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1604,6 +1604,53 @@ int amdgpu_suspend(struct amdgpu_device *adev)
>>          return 0;
>>  }
>>
>> +static int amdgpu_resume_early(struct amdgpu_device *adev)
>> +{
>> +       int i, r;
>> +
>> +       for (i = 0; i < adev->num_ip_blocks; i++) {
>> +               if (!adev->ip_blocks[i].status.valid)
>> +                       continue;
>> +
>> +               if (adev->ip_blocks[i].version->type == 
>> AMD_IP_BLOCK_TYPE_COMMON ||
>> + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>> + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
>> +                       r = 
>> adev->ip_blocks[i].version->funcs->resume(adev);
>> +
>> +               if (r) {
>> +                       DRM_ERROR("resume of IP block <%s> failed %d\n",
>> + adev->ip_blocks[i].version->funcs->name, r);
>> +                       return r;
>> +               }
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static int amdgpu_resume_late(struct amdgpu_device *adev)
>> +{
>> +       int i, r;
>> +
>> +       for (i = 0; i < adev->num_ip_blocks; i++) {
>> +               if (!adev->ip_blocks[i].status.valid)
>> +                       continue;
>> +
>> +               if (adev->ip_blocks[i].version->type == 
>> AMD_IP_BLOCK_TYPE_COMMON ||
>> + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>> + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH )
>> +                       continue;
>> +
>> +               r = adev->ip_blocks[i].version->funcs->resume(adev);
>> +               if (r) {
>> +                       DRM_ERROR("resume of IP block <%s> failed %d\n",
>> + adev->ip_blocks[i].version->funcs->name, r);
>> +                       return r;
>> +               }
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>>  static int amdgpu_resume(struct amdgpu_device *adev)
>>  {
>>          int i, r;
>> @@ -2343,6 +2390,115 @@ static int 
>> amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
>>  }
>>
>>  /**
>> + * amdgpu_sriov_gpu_reset - reset the asic
>> + *
>> + * @adev: amdgpu device pointer
>> + * @voluntary: if this reset is requested by guest.
>> + *             (true means by guest and false means by HYPERVISOR )
>> + *
>> + * Attempt the reset the GPU if it has hung (all asics).
>> + * for SRIOV case.
>> + * Returns 0 for success or an error on failure.
>> + */
>> +int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>> +{
>> +       int i, r = 0;
>> +       int resched;
>> +       struct amdgpu_bo *bo, *tmp;
>> +       struct amdgpu_ring *ring;
>> +       struct fence *fence = NULL, *next = NULL;
>> +
>> +       mutex_lock(&adev->virt.lock_reset);
>> +       atomic_inc(&adev->gpu_reset_counter);
>> +
>> +       /* block TTM */
>> +       resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>> +
>> +       /* block scheduler */
>> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +               ring = adev->rings[i];
>> +
>> +               if (!ring || !ring->sched.thread)
>> +                       continue;
>> +
>> +               kthread_park(ring->sched.thread);
>> + amd_sched_hw_job_reset(&ring->sched);
>> +       }
>> +
>> +       /* after all hw jobs are reset, hw fence is meaningless, so 
>> force_completion */
>> +       amdgpu_fence_driver_force_completion(adev);
>> +
>> +       /* request to take full control of GPU before 
>> re-initialization  */
>> +       if (voluntary)
>> +               amdgpu_virt_reset_gpu(adev);
>> +       else
>> +               amdgpu_virt_request_full_gpu(adev, true);
>> +
>> +
>> +       /* Resume IP prior to SMC */
>> +       amdgpu_resume_early(adev);
>> +
>> +       /* we need recover gart prior to run SMC/CP/SDMA resume */
>> +       amdgpu_ttm_recover_gart(adev);
>> +
>> +       /* now we are okay to resume SMC/CP/SDMA */
>> +       amdgpu_resume_late(adev);
>
>
>> +
>> +       amdgpu_irq_gpu_reset_resume_helper(adev);
>> +
>> +       if (amdgpu_ib_ring_tests(adev))
>> +               dev_err(adev->dev, "[GPU_RESET] ib ring test failed 
>> (%d).\n", r);
>> +
>> +       /* rellease full control of GPU after ib test */
>> +       amdgpu_virt_release_full_gpu(adev, true);
>> +
>> +       DRM_INFO("recover vram bo from shadow\n");
>> +
>> +       ring = adev->mman.buffer_funcs_ring;
>> +       mutex_lock(&adev->shadow_list_lock);
>> +       list_for_each_entry_safe(bo, tmp, &adev->shadow_list, 
>> shadow_list) {
>> +               amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
>> +               if (fence) {
>> +                       r = fence_wait(fence, false);
>> +                       if (r) {
>> +                               WARN(r, "recovery from shadow isn't 
>> completed\n");
>> +                               break;
>> +                       }
>> +               }
>> +
>> +               fence_put(fence);
>> +               fence = next;
>> +       }
>> +       mutex_unlock(&adev->shadow_list_lock);
>> +
>> +       if (fence) {
>> +               r = fence_wait(fence, false);
>> +               if (r)
>> +                       WARN(r, "recovery from shadow isn't 
>> completed\n");
>> +       }
>> +       fence_put(fence);
>> +
>> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +               struct amdgpu_ring *ring = adev->rings[i];
>> +               if (!ring || !ring->sched.thread)
>> +                       continue;
>> +
>> + amd_sched_job_recovery(&ring->sched);
>> +               kthread_unpark(ring->sched.thread);
>> +       }
>> +
>> +       drm_helper_resume_force_mode(adev->ddev);
>> + ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
>> +       if (r) {
>> +               /* bad news, how to tell it to userspace ? */
>> +               dev_info(adev->dev, "GPU reset failed\n");
>> +       }
>> +
>> +       mutex_unlock(&adev->virt.lock_reset);
>> +       return r;
>> +}
>> +
>> +/**
>>   * amdgpu_gpu_reset - reset the asic
>>   *
>>   * @adev: amdgpu device pointer
>> @@ -2358,7 +2514,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>          bool need_full_reset;
>>
>>          if (amdgpu_sriov_vf(adev))
>> -               return 0;
>> +               return amdgpu_sriov_gpu_reset(adev, true);
>>
>>          if (!amdgpu_check_soft_reset(adev)) {
>>                  DRM_INFO("No hardware hang detected. Did some blocks 
>> stall?\n");
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index 675e12c..73d24df 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -89,5 +89,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>> *adev, uint32_t reg, uint32_t v);
>>  int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>>  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>>  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
>> +int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary);
>>
>>  #endif
>> -- 
>> 2.7.4
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>


[-- Attachment #1.2: Type: text/html, Size: 22093 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* 答复: 答复: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
       [not found]                     ` <afe8b046-c0ba-586e-99c3-adef380cfd3f-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-02-08 15:17                       ` Liu, Monk
       [not found]                         ` <DM5PR12MB161095B3EBE0DCC3CEFCBB8A84420-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 51+ messages in thread
From: Liu, Monk @ 2017-02-08 15:17 UTC (permalink / raw)
  To: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 10708 bytes --]

wait a minutes ...


》3. suspend the VM with glxgears running


do you mean turn VM into s3 mode ?

if so, we cannot do that step; the S3 suspend/resume function is not supported in the SRIOV VF case.





________________________________
发件人: Christian König <deathsimple@vodafone.de>
发送时间: 2017年2月8日 23:13:46
收件人: Liu, Monk; amd-gfx@lists.freedesktop.org
主题: Re: 答复: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset

and like I said, this approach is correct and verified by hang test
Completely irrelevant.

Please try the following:
1. Trigger a hang
2. Reset the GPU
3. Suspend the VM with glxgears running
4. Resume the VM

I'm pretty sure that with your approach either suspending or resuming the VM with an application running will just hang.

Anyway, even if all the code paths you break here (UVD/VCE at minimum) are disabled in the SRIOV case, it's still not a good idea to completely break the design just to satisfy the SRIOV feature.

So that is still a clear NAK on that. Please do as I told you and use the hw_init() callback instead, it is especially made for this use case.

Regards,
Christian.

Am 08.02.2017 um 15:57 schrieb Liu, Monk:

》As I wrote in the other thread as well calling amdgpu_resume() without proper suspend will just mess up a whole bunch of internal structures.

please name at least one, I'll check and see how to improve

and like I said, this approach is correct and verified by the hang test; if internal structures were messed up, I don't think the test would easily pass. hw_init() just calls resume per engine.

you can take a deep look into sriov_gpu_reset and judge later



________________________________
发件人: Christian König <deathsimple@vodafone.de><mailto:deathsimple@vodafone.de>
发送时间: 2017年2月8日 18:49:57
收件人: Liu, Monk; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
主题: Re: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset

+       /* now we are okay to resume SMC/CP/SDMA */
+       amdgpu_resume_late(adev);
As I wrote in the other thread as well calling amdgpu_resume() without proper suspend will just mess up a whole bunch of internal structures.

So a clear NAK on that approach. If you don't need the hw stop which amdgpu_suspend() does for SRIOV then please try to just use the hw_init() callback and not the resume() callback.

Regards,
Christian.

Am 07.02.2017 um 07:26 schrieb Liu, Monk:

patch 1-8 are some fixes for sriov gpu reset feature

patch 9 -20 are for sriov gpu reset


BR Monk

________________________________
发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org><mailto:amd-gfx-bounces@lists.freedesktop.org> 代表 Monk Liu <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>
发送时间: 2017年2月7日 14:11:07
收件人: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
抄送: Liu, Monk
主题: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset

Signed-off-by: Monk Liu <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 158 ++++++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1 +
 2 files changed, 158 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e926f84..2b404ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1604,6 +1604,53 @@ int amdgpu_suspend(struct amdgpu_device *adev)
         return 0;
 }

+static int amdgpu_resume_early(struct amdgpu_device *adev)
+{
+       int i, r;
+
+       for (i = 0; i < adev->num_ip_blocks; i++) {
+               if (!adev->ip_blocks[i].status.valid)
+                       continue;
+
+               if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
+                       r = adev->ip_blocks[i].version->funcs->resume(adev);
+
+               if (r) {
+                       DRM_ERROR("resume of IP block <%s> failed %d\n",
+                                 adev->ip_blocks[i].version->funcs->name, r);
+                       return r;
+               }
+       }
+
+       return 0;
+}
+
+static int amdgpu_resume_late(struct amdgpu_device *adev)
+{
+       int i, r;
+
+       for (i = 0; i < adev->num_ip_blocks; i++) {
+               if (!adev->ip_blocks[i].status.valid)
+                       continue;
+
+               if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH )
+                       continue;
+
+               r = adev->ip_blocks[i].version->funcs->resume(adev);
+               if (r) {
+                       DRM_ERROR("resume of IP block <%s> failed %d\n",
+                                 adev->ip_blocks[i].version->funcs->name, r);
+                       return r;
+               }
+       }
+
+       return 0;
+}
+
 static int amdgpu_resume(struct amdgpu_device *adev)
 {
         int i, r;
@@ -2343,6 +2390,115 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
 }

 /**
+ * amdgpu_sriov_gpu_reset - reset the asic
+ *
+ * @adev: amdgpu device pointer
+ * @voluntary: if this reset is requested by guest.
+ *             (true means by guest and false means by HYPERVISOR )
+ *
+ * Attempt the reset the GPU if it has hung (all asics).
+ * for SRIOV case.
+ * Returns 0 for success or an error on failure.
+ */
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
+{
+       int i, r = 0;
+       int resched;
+       struct amdgpu_bo *bo, *tmp;
+       struct amdgpu_ring *ring;
+       struct fence *fence = NULL, *next = NULL;
+
+       mutex_lock(&adev->virt.lock_reset);
+       atomic_inc(&adev->gpu_reset_counter);
+
+       /* block TTM */
+       resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+
+       /* block scheduler */
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               ring = adev->rings[i];
+
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               kthread_park(ring->sched.thread);
+               amd_sched_hw_job_reset(&ring->sched);
+       }
+
+       /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
+       amdgpu_fence_driver_force_completion(adev);
+
+       /* request to take full control of GPU before re-initialization  */
+       if (voluntary)
+               amdgpu_virt_reset_gpu(adev);
+       else
+               amdgpu_virt_request_full_gpu(adev, true);
+
+
+       /* Resume IP prior to SMC */
+       amdgpu_resume_early(adev);
+
+       /* we need recover gart prior to run SMC/CP/SDMA resume */
+       amdgpu_ttm_recover_gart(adev);
+
+       /* now we are okay to resume SMC/CP/SDMA */
+       amdgpu_resume_late(adev);


+
+       amdgpu_irq_gpu_reset_resume_helper(adev);
+
+       if (amdgpu_ib_ring_tests(adev))
+               dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
+
+       /* rellease full control of GPU after ib test */
+       amdgpu_virt_release_full_gpu(adev, true);
+
+       DRM_INFO("recover vram bo from shadow\n");
+
+       ring = adev->mman.buffer_funcs_ring;
+       mutex_lock(&adev->shadow_list_lock);
+       list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+               amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
+               if (fence) {
+                       r = fence_wait(fence, false);
+                       if (r) {
+                               WARN(r, "recovery from shadow isn't completed\n");
+                               break;
+                       }
+               }
+
+               fence_put(fence);
+               fence = next;
+       }
+       mutex_unlock(&adev->shadow_list_lock);
+
+       if (fence) {
+               r = fence_wait(fence, false);
+               if (r)
+                       WARN(r, "recovery from shadow isn't completed\n");
+       }
+       fence_put(fence);
+
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               struct amdgpu_ring *ring = adev->rings[i];
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               amd_sched_job_recovery(&ring->sched);
+               kthread_unpark(ring->sched.thread);
+       }
+
+       drm_helper_resume_force_mode(adev->ddev);
+       ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
+       if (r) {
+               /* bad news, how to tell it to userspace ? */
+               dev_info(adev->dev, "GPU reset failed\n");
+       }
+
+       mutex_unlock(&adev->virt.lock_reset);
+       return r;
+}
+
+/**
  * amdgpu_gpu_reset - reset the asic
  *
  * @adev: amdgpu device pointer
@@ -2358,7 +2514,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
         bool need_full_reset;

         if (amdgpu_sriov_vf(adev))
-               return 0;
+               return amdgpu_sriov_gpu_reset(adev, true);

         if (!amdgpu_check_soft_reset(adev)) {
                 DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 675e12c..73d24df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -89,5 +89,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary);

 #endif
--
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx



_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx




[-- Attachment #1.2: Type: text/html, Size: 23661 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: 答复: 答复: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
       [not found]                         ` <DM5PR12MB161095B3EBE0DCC3CEFCBB8A84420-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-02-08 15:27                           ` Christian König
       [not found]                             ` <aea90712-dc2f-5bd5-ea8b-52b8688a44c9-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 51+ messages in thread
From: Christian König @ 2017-02-08 15:27 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 12532 bytes --]

> do you mean turn VM into s3 mode ?
>
> if so we not get that step, the S3 suspend/resume function is not 
> supported by SRIOV vf case.
>
Then just try to unbind the fb and unload the module.

The idea of the suspend/resume callbacks are that they are only called 
in pairs.

See suspend is supposed to unpin all BOs and backup all resources from 
VRAM to GTT and preserve the general hardware state.

Now what resume does is to reload BOs and restore the state previously 
saved during the suspend backup.

If I understand you correctly, you really don't need all that reload and 
restore dance. Instead, what you need for the SRIOV case is just 
reinitializing the hardware, isn't it?

That this works is just pure coincidence, because we don't have 
backup/restore functions for the blocks enabled for SRIOV.

Regards,
Christian.

Am 08.02.2017 um 16:17 schrieb Liu, Monk:
>
> wait a minutes ...
>
>
> 》3. suspend the VM with glxgears running
>
>
> do you mean turn VM into s3 mode ?
>
> if so we not get that step, the S3 suspend/resume function is not 
> supported by SRIOV vf case.
>
>
>
>
>
> ------------------------------------------------------------------------
> *发件人:* Christian König <deathsimple-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
> *发送时间:* 2017年2月8日 23:13:46
> *收件人:* Liu, Monk; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> *主题:* Re: 答复: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV 
> gpu_reset
>> and like I said, this approach is correct and verified by hang test
> Completely irrelevant.
>
> Please try the following:
> 1. Trigger a hang
> 2. Reset the GPU
> 3. Suspend the VM with glxgears running
> 4. Resume the VM
>
> I'm pretty sure that with your approach that either suspending or 
> resuming the VM with an application running will just hang.
>
> Anyway even if all the code path you break here (UVD/VCE at minimum) 
> are disabled in the SRIOV case it's still not a good idea completely 
> breaking the design just to satisfy the SRIOV feature.
>
> So that is still a clear NAK on that. Please do as I told you and use 
> the hw_init() callback instead, it is especially made for this use case.
>
> Regards,
> Christian.
>
> Am 08.02.2017 um 15:57 schrieb Liu, Monk:
>>
>> 》As I wrote in the other thread as well calling amdgpu_resume() 
>> without proper suspend will just mess up a whole bunch of internal 
>> structures.
>>
>>
>> please name at least one, I'll check and see how to improve
>>
>> and like I said, this approach is correct and verified by hang test, 
>> if internal structures messed up I don't think the test will easy 
>> pass. hw_init() just call resume per engine.
>>
>> you can take a deep look into sriov_gpu_reset and judge later
>>
>>
>> ------------------------------------------------------------------------
>> *发件人:* Christian König <deathsimple-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
>> *发送时间:* 2017年2月8日 18:49:57
>> *收件人:* Liu, Monk; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> *主题:* Re: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV 
>> gpu_reset
>>> +       /* now we are okay to resume SMC/CP/SDMA */
>>> +       amdgpu_resume_late(adev);
>> As I wrote in the other thread as well calling amdgpu_resume() 
>> without proper suspend will just mess up a whole bunch of internal 
>> structures.
>>
>> So a clear NAK on that approach. If you don't need the hw stop which 
>> amdgpu_suspend() does for SRIOV then please try to just use the 
>> hw_init() callback and not the resume() callback.
>>
>> Regards,
>> Christian.
>>
>> Am 07.02.2017 um 07:26 schrieb Liu, Monk:
>>>
>>> patch 1-8 are some fixes for sriov gpu reset feature
>>>
>>> patch 9 -20 are for sriov gpu reset
>>>
>>>
>>> BR Monk
>>>
>>> ------------------------------------------------------------------------
>>> *发件人:* amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> 代表 Monk Liu 
>>> <Monk.Liu-5C7GfCeVMHo@public.gmane.org>
>>> *发送时间:* 2017年2月7日 14:11:07
>>> *收件人:* amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> *抄送:* Liu, Monk
>>> *主题:* [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
>>> Signed-off-by: Monk Liu <Monk.Liu-5C7GfCeVMHo@public.gmane.org>
>>> ---
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 158 
>>> ++++++++++++++++++++++++++++-
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1 +
>>>  2 files changed, 158 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index e926f84..2b404ca 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -1604,6 +1604,53 @@ int amdgpu_suspend(struct amdgpu_device *adev)
>>>          return 0;
>>>  }
>>>
>>> +static int amdgpu_resume_early(struct amdgpu_device *adev)
>>> +{
>>> +       int i, r;
>>> +
>>> +       for (i = 0; i < adev->num_ip_blocks; i++) {
>>> +               if (!adev->ip_blocks[i].status.valid)
>>> +                       continue;
>>> +
>>> +               if (adev->ip_blocks[i].version->type == 
>>> AMD_IP_BLOCK_TYPE_COMMON ||
>>> + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>>> + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
>>> +                       r = 
>>> adev->ip_blocks[i].version->funcs->resume(adev);
>>> +
>>> +               if (r) {
>>> +                       DRM_ERROR("resume of IP block <%s> failed %d\n",
>>> + adev->ip_blocks[i].version->funcs->name, r);
>>> +                       return r;
>>> +               }
>>> +       }
>>> +
>>> +       return 0;
>>> +}
>>> +
>>> +static int amdgpu_resume_late(struct amdgpu_device *adev)
>>> +{
>>> +       int i, r;
>>> +
>>> +       for (i = 0; i < adev->num_ip_blocks; i++) {
>>> +               if (!adev->ip_blocks[i].status.valid)
>>> +                       continue;
>>> +
>>> +               if (adev->ip_blocks[i].version->type == 
>>> AMD_IP_BLOCK_TYPE_COMMON ||
>>> + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
>>> + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH )
>>> +                       continue;
>>> +
>>> +               r = adev->ip_blocks[i].version->funcs->resume(adev);
>>> +               if (r) {
>>> +                       DRM_ERROR("resume of IP block <%s> failed %d\n",
>>> + adev->ip_blocks[i].version->funcs->name, r);
>>> +                       return r;
>>> +               }
>>> +       }
>>> +
>>> +       return 0;
>>> +}
>>> +
>>>  static int amdgpu_resume(struct amdgpu_device *adev)
>>>  {
>>>          int i, r;
>>> @@ -2343,6 +2390,115 @@ static int 
>>> amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
>>>  }
>>>
>>>  /**
>>> + * amdgpu_sriov_gpu_reset - reset the asic
>>> + *
>>> + * @adev: amdgpu device pointer
>>> + * @voluntary: if this reset is requested by guest.
>>> + *             (true means by guest and false means by HYPERVISOR )
>>> + *
>>> + * Attempt the reset the GPU if it has hung (all asics).
>>> + * for SRIOV case.
>>> + * Returns 0 for success or an error on failure.
>>> + */
>>> +int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>>> +{
>>> +       int i, r = 0;
>>> +       int resched;
>>> +       struct amdgpu_bo *bo, *tmp;
>>> +       struct amdgpu_ring *ring;
>>> +       struct fence *fence = NULL, *next = NULL;
>>> +
>>> +       mutex_lock(&adev->virt.lock_reset);
>>> +       atomic_inc(&adev->gpu_reset_counter);
>>> +
>>> +       /* block TTM */
>>> +       resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>> +
>>> +       /* block scheduler */
>>> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +               ring = adev->rings[i];
>>> +
>>> +               if (!ring || !ring->sched.thread)
>>> +                       continue;
>>> +
>>> +               kthread_park(ring->sched.thread);
>>> + amd_sched_hw_job_reset(&ring->sched);
>>> +       }
>>> +
>>> +       /* after all hw jobs are reset, hw fence is meaningless, so 
>>> force_completion */
>>> +       amdgpu_fence_driver_force_completion(adev);
>>> +
>>> +       /* request to take full control of GPU before 
>>> re-initialization  */
>>> +       if (voluntary)
>>> +               amdgpu_virt_reset_gpu(adev);
>>> +       else
>>> +               amdgpu_virt_request_full_gpu(adev, true);
>>> +
>>> +
>>> +       /* Resume IP prior to SMC */
>>> +       amdgpu_resume_early(adev);
>>> +
>>> +       /* we need recover gart prior to run SMC/CP/SDMA resume */
>>> +       amdgpu_ttm_recover_gart(adev);
>>> +
>>> +       /* now we are okay to resume SMC/CP/SDMA */
>>> +       amdgpu_resume_late(adev);
>>
>>
>>> +
>>> +       amdgpu_irq_gpu_reset_resume_helper(adev);
>>> +
>>> +       if (amdgpu_ib_ring_tests(adev))
>>> +               dev_err(adev->dev, "[GPU_RESET] ib ring test failed 
>>> (%d).\n", r);
>>> +
>>> +       /* rellease full control of GPU after ib test */
>>> +       amdgpu_virt_release_full_gpu(adev, true);
>>> +
>>> +       DRM_INFO("recover vram bo from shadow\n");
>>> +
>>> +       ring = adev->mman.buffer_funcs_ring;
>>> +       mutex_lock(&adev->shadow_list_lock);
>>> +       list_for_each_entry_safe(bo, tmp, &adev->shadow_list, 
>>> shadow_list) {
>>> + amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
>>> +               if (fence) {
>>> +                       r = fence_wait(fence, false);
>>> +                       if (r) {
>>> +                               WARN(r, "recovery from shadow isn't 
>>> completed\n");
>>> +                               break;
>>> +                       }
>>> +               }
>>> +
>>> +               fence_put(fence);
>>> +               fence = next;
>>> +       }
>>> + mutex_unlock(&adev->shadow_list_lock);
>>> +
>>> +       if (fence) {
>>> +               r = fence_wait(fence, false);
>>> +               if (r)
>>> +                       WARN(r, "recovery from shadow isn't 
>>> completed\n");
>>> +       }
>>> +       fence_put(fence);
>>> +
>>> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +               struct amdgpu_ring *ring = adev->rings[i];
>>> +               if (!ring || !ring->sched.thread)
>>> +                       continue;
>>> +
>>> + amd_sched_job_recovery(&ring->sched);
>>> + kthread_unpark(ring->sched.thread);
>>> +       }
>>> +
>>> +       drm_helper_resume_force_mode(adev->ddev);
>>> + ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
>>> +       if (r) {
>>> +               /* bad news, how to tell it to userspace ? */
>>> +               dev_info(adev->dev, "GPU reset failed\n");
>>> +       }
>>> +
>>> +       mutex_unlock(&adev->virt.lock_reset);
>>> +       return r;
>>> +}
>>> +
>>> +/**
>>>   * amdgpu_gpu_reset - reset the asic
>>>   *
>>>   * @adev: amdgpu device pointer
>>> @@ -2358,7 +2514,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>          bool need_full_reset;
>>>
>>>          if (amdgpu_sriov_vf(adev))
>>> -               return 0;
>>> +               return amdgpu_sriov_gpu_reset(adev, true);
>>>
>>>          if (!amdgpu_check_soft_reset(adev)) {
>>>                  DRM_INFO("No hardware hang detected. Did some 
>>> blocks stall?\n");
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> index 675e12c..73d24df 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> @@ -89,5 +89,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>>> *adev, uint32_t reg, uint32_t v);
>>>  int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool 
>>> init);
>>>  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool 
>>> init);
>>>  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
>>> +int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary);
>>>
>>>  #endif
>>> -- 
>>> 2.7.4
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>>
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #1.2: Type: text/html, Size: 27664 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 51+ messages in thread

* 答复: 答复: 答复: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
       [not found]                             ` <aea90712-dc2f-5bd5-ea8b-52b8688a44c9-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-02-08 15:40                               ` Liu, Monk
       [not found]                                 ` <DM5PR12MB1610E5F6359C78E0740F428384420-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 51+ messages in thread
From: Liu, Monk @ 2017-02-08 15:40 UTC (permalink / raw)
  To: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 12844 bytes --]

If I understand you correct you really don't need all that reload and restore dance. Instead what you need for the SRIOV case is just reinitializing the hardware, isn't it?

That this works is just pure coincidence, because we don't have a backup/restore function for the blocks enabled for SRIOV.


ML:

Yeah, I use amdgpu_sriov_resume_early/late because, according to the current code sequence, nothing bad is introduced without invoking suspend first. Calling hw_init on each IP should behave the same as my patch.


I misunderstood you — I thought you insisted on invoking suspend() and resume() in pairs.

I agree use hw_init without suspend is more reasonable


thanks



________________________________
发件人: Christian König <deathsimple@vodafone.de>
发送时间: 2017年2月8日 下午 11:27
收件人: Liu, Monk; amd-gfx@lists.freedesktop.org
主题: Re: 答复: 答复: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset


do you mean turn VM into s3 mode ?

if so we not get that step, the S3 suspend/resume function is not supported by SRIOV vf case.

Then just try to unbind the fb and unload the module.

The idea of the suspend/resume callbacks are that they are only called in pairs.

See suspend is supposed to unpin all BOs and backup all resources from VRAM to GTT and preserve the general hardware state.

Now what resume does is to reload BOs and restore the state previously saved during the suspend backup.

If I understand you correct you really don't need all that reload and restore dance. Instead what you need for the SRIOV case is just reinitializing the hardware, isn't it?

That this works is just pure coincidence, because we don't have a backup/restore function for the blocks enabled for SRIOV.

Regards,
Christian.

Am 08.02.2017 um 16:17 schrieb Liu, Monk:

wait a minutes ...


》3. suspend the VM with glxgears running


do you mean turn VM into s3 mode ?

if so we not get that step, the S3 suspend/resume function is not supported by SRIOV vf case.





________________________________
发件人: Christian König <deathsimple@vodafone.de><mailto:deathsimple@vodafone.de>
发送时间: 2017年2月8日 23:13:46
收件人: Liu, Monk; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
主题: Re: 答复: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset

and like I said, this approach is correct and verified by hang test
Completely irrelevant.

Please try the following:
1. Trigger a hang
2. Reset the GPU
3. Suspend the VM with glxgears running
4. Resume the VM

I'm pretty sure that with your approach that either suspending or resuming the VM with an application running will just hang.

Anyway even if all the code path you break here (UVD/VCE at minimum) are disabled in the SRIOV case it's still not a good idea completely breaking the design just to satisfy the SRIOV feature.

So that is still a clear NAK on that. Please do as I told you and use the hw_init() callback instead, it is especially made for this use case.

Regards,
Christian.

Am 08.02.2017 um 15:57 schrieb Liu, Monk:

》As I wrote in the other thread as well calling amdgpu_resume() without proper suspend will just mess up a whole bunch of internal structures.

please name at least one, I'll check and see how to improve

and like I said, this approach is correct and verified by the hang test; if internal structures were messed up I don't think the test would pass easily. hw_init() just calls resume per engine.

you can take a deep look into sriov_gpu_reset and judge later



________________________________
发件人: Christian König <deathsimple@vodafone.de><mailto:deathsimple@vodafone.de>
发送时间: 2017年2月8日 18:49:57
收件人: Liu, Monk; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
主题: Re: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset

+       /* now we are okay to resume SMC/CP/SDMA */
+       amdgpu_resume_late(adev);
As I wrote in the other thread as well calling amdgpu_resume() without proper suspend will just mess up a whole bunch of internal structures.

So a clear NAK on that approach. If you don't need the hw stop which amdgpu_suspend() does for SRIOV then please try to just use the hw_init() callback and not the resume() callback.

Regards,
Christian.

Am 07.02.2017 um 07:26 schrieb Liu, Monk:

patch 1-8 are some fixes for sriov gpu reset feature

patch 9 -20 are for sriov gpu reset


BR Monk

________________________________
发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org><mailto:amd-gfx-bounces@lists.freedesktop.org> 代表 Monk Liu <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>
发送时间: 2017年2月7日 14:11:07
收件人: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
抄送: Liu, Monk
主题: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset

Signed-off-by: Monk Liu <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 158 ++++++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1 +
 2 files changed, 158 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e926f84..2b404ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1604,6 +1604,53 @@ int amdgpu_suspend(struct amdgpu_device *adev)
         return 0;
 }

+static int amdgpu_resume_early(struct amdgpu_device *adev)
+{
+       int i, r;
+
+       for (i = 0; i < adev->num_ip_blocks; i++) {
+               if (!adev->ip_blocks[i].status.valid)
+                       continue;
+
+               if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
+                       r = adev->ip_blocks[i].version->funcs->resume(adev);
+
+               if (r) {
+                       DRM_ERROR("resume of IP block <%s> failed %d\n",
+                                 adev->ip_blocks[i].version->funcs->name, r);
+                       return r;
+               }
+       }
+
+       return 0;
+}
+
+static int amdgpu_resume_late(struct amdgpu_device *adev)
+{
+       int i, r;
+
+       for (i = 0; i < adev->num_ip_blocks; i++) {
+               if (!adev->ip_blocks[i].status.valid)
+                       continue;
+
+               if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH )
+                       continue;
+
+               r = adev->ip_blocks[i].version->funcs->resume(adev);
+               if (r) {
+                       DRM_ERROR("resume of IP block <%s> failed %d\n",
+                                 adev->ip_blocks[i].version->funcs->name, r);
+                       return r;
+               }
+       }
+
+       return 0;
+}
+
 static int amdgpu_resume(struct amdgpu_device *adev)
 {
         int i, r;
@@ -2343,6 +2390,115 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
 }

 /**
+ * amdgpu_sriov_gpu_reset - reset the asic
+ *
+ * @adev: amdgpu device pointer
+ * @voluntary: if this reset is requested by guest.
+ *             (true means by guest and false means by HYPERVISOR )
+ *
+ * Attempt to reset the GPU if it has hung (all asics).
+ * for SRIOV case.
+ * Returns 0 for success or an error on failure.
+ */
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
+{
+       int i, r = 0;
+       int resched;
+       struct amdgpu_bo *bo, *tmp;
+       struct amdgpu_ring *ring;
+       struct fence *fence = NULL, *next = NULL;
+
+       mutex_lock(&adev->virt.lock_reset);
+       atomic_inc(&adev->gpu_reset_counter);
+
+       /* block TTM */
+       resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+
+       /* block scheduler */
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               ring = adev->rings[i];
+
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               kthread_park(ring->sched.thread);
+               amd_sched_hw_job_reset(&ring->sched);
+       }
+
+       /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
+       amdgpu_fence_driver_force_completion(adev);
+
+       /* request to take full control of GPU before re-initialization  */
+       if (voluntary)
+               amdgpu_virt_reset_gpu(adev);
+       else
+               amdgpu_virt_request_full_gpu(adev, true);
+
+
+       /* Resume IP prior to SMC */
+       amdgpu_resume_early(adev);
+
+       /* we need to recover gart prior to running SMC/CP/SDMA resume */
+       amdgpu_ttm_recover_gart(adev);
+
+       /* now we are okay to resume SMC/CP/SDMA */
+       amdgpu_resume_late(adev);


+
+       amdgpu_irq_gpu_reset_resume_helper(adev);
+
+       if (amdgpu_ib_ring_tests(adev))
+               dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
+
+       /* release full control of GPU after ib test */
+       amdgpu_virt_release_full_gpu(adev, true);
+
+       DRM_INFO("recover vram bo from shadow\n");
+
+       ring = adev->mman.buffer_funcs_ring;
+       mutex_lock(&adev->shadow_list_lock);
+       list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+               amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
+               if (fence) {
+                       r = fence_wait(fence, false);
+                       if (r) {
+                               WARN(r, "recovery from shadow isn't completed\n");
+                               break;
+                       }
+               }
+
+               fence_put(fence);
+               fence = next;
+       }
+       mutex_unlock(&adev->shadow_list_lock);
+
+       if (fence) {
+               r = fence_wait(fence, false);
+               if (r)
+                       WARN(r, "recovery from shadow isn't completed\n");
+       }
+       fence_put(fence);
+
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               struct amdgpu_ring *ring = adev->rings[i];
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               amd_sched_job_recovery(&ring->sched);
+               kthread_unpark(ring->sched.thread);
+       }
+
+       drm_helper_resume_force_mode(adev->ddev);
+       ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
+       if (r) {
+               /* bad news, how to tell it to userspace ? */
+               dev_info(adev->dev, "GPU reset failed\n");
+       }
+
+       mutex_unlock(&adev->virt.lock_reset);
+       return r;
+}
+
+/**
  * amdgpu_gpu_reset - reset the asic
  *
  * @adev: amdgpu device pointer
@@ -2358,7 +2514,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
         bool need_full_reset;

         if (amdgpu_sriov_vf(adev))
-               return 0;
+               return amdgpu_sriov_gpu_reset(adev, true);

         if (!amdgpu_check_soft_reset(adev)) {
                 DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 675e12c..73d24df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -89,5 +89,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary);

 #endif
--
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx



_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx






_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #1.2: Type: text/html, Size: 26253 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* 答复: 答复: 答复: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset
       [not found]                                 ` <DM5PR12MB1610E5F6359C78E0740F428384420-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-02-08 15:45                                   ` Liu, Monk
  0 siblings, 0 replies; 51+ messages in thread
From: Liu, Monk @ 2017-02-08 15:45 UTC (permalink / raw)
  To: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 13279 bytes --]

we will send another patch to fix it later that using ->hw_init instead of ->resume .


BR Monk

________________________________
发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Liu, Monk <Monk.Liu@amd.com>
发送时间: 2017年2月8日 23:40:35
收件人: Christian König; amd-gfx@lists.freedesktop.org
主题: 答复: 答复: 答复: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset


If I understand you correct you really don't need all that reload and restore dance. Instead what you need for the SRIOV case is just reinitializing the hardware, isn't it?

That this works is just pure coincidence, because we don't have a backup/restore function for the blocks enabled for SRIOV.


ML:

Yeah, I use amdgpu_sriov_resume_early/late because, according to the current code sequence, nothing bad is introduced without invoking suspend first. Calling hw_init on each IP should behave the same as my patch.


I misunderstood you — I thought you insisted on invoking suspend() and resume() in pairs.

I agree use hw_init without suspend is more reasonable


thanks



________________________________
发件人: Christian König <deathsimple@vodafone.de>
发送时间: 2017年2月8日 下午 11:27
收件人: Liu, Monk; amd-gfx@lists.freedesktop.org
主题: Re: 答复: 答复: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset


do you mean turn VM into s3 mode ?

if so we not get that step, the S3 suspend/resume function is not supported by SRIOV vf case.

Then just try to unbind the fb and unload the module.

The idea of the suspend/resume callbacks are that they are only called in pairs.

See suspend is supposed to unpin all BOs and backup all resources from VRAM to GTT and preserve the general hardware state.

Now what resume does is to reload BOs and restore the state previously saved during the suspend backup.

If I understand you correct you really don't need all that reload and restore dance. Instead what you need for the SRIOV case is just reinitializing the hardware, isn't it?

That this works is just pure coincidence, because we don't have a backup/restore function for the blocks enabled for SRIOV.

Regards,
Christian.

Am 08.02.2017 um 16:17 schrieb Liu, Monk:

wait a minutes ...


》3. suspend the VM with glxgears running


do you mean turn VM into s3 mode ?

if so we not get that step, the S3 suspend/resume function is not supported by SRIOV vf case.





________________________________
发件人: Christian König <deathsimple@vodafone.de><mailto:deathsimple@vodafone.de>
发送时间: 2017年2月8日 23:13:46
收件人: Liu, Monk; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
主题: Re: 答复: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset

and like I said, this approach is correct and verified by hang test
Completely irrelevant.

Please try the following:
1. Trigger a hang
2. Reset the GPU
3. Suspend the VM with glxgears running
4. Resume the VM

I'm pretty sure that with your approach that either suspending or resuming the VM with an application running will just hang.

Anyway even if all the code path you break here (UVD/VCE at minimum) are disabled in the SRIOV case it's still not a good idea completely breaking the design just to satisfy the SRIOV feature.

So that is still a clear NAK on that. Please do as I told you and use the hw_init() callback instead, it is especially made for this use case.

Regards,
Christian.

Am 08.02.2017 um 15:57 schrieb Liu, Monk:

》As I wrote in the other thread as well calling amdgpu_resume() without proper suspend will just mess up a whole bunch of internal structures.

please name at least one, I'll check and see how to improve

and like I said, this approach is correct and verified by the hang test; if internal structures were messed up I don't think the test would pass easily. hw_init() just calls resume per engine.

you can take a deep look into sriov_gpu_reset and judge later



________________________________
发件人: Christian König <deathsimple@vodafone.de><mailto:deathsimple@vodafone.de>
发送时间: 2017年2月8日 18:49:57
收件人: Liu, Monk; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
主题: Re: SPAM //答复: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset

+       /* now we are okay to resume SMC/CP/SDMA */
+       amdgpu_resume_late(adev);
As I wrote in the other thread as well calling amdgpu_resume() without proper suspend will just mess up a whole bunch of internal structures.

So a clear NAK on that approach. If you don't need the hw stop which amdgpu_suspend() does for SRIOV then please try to just use the hw_init() callback and not the resume() callback.

Regards,
Christian.

Am 07.02.2017 um 07:26 schrieb Liu, Monk:

patch 1-8 are some fixes for sriov gpu reset feature

patch 9 -20 are for sriov gpu reset


BR Monk

________________________________
发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org><mailto:amd-gfx-bounces@lists.freedesktop.org> 代表 Monk Liu <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>
发送时间: 2017年2月7日 14:11:07
收件人: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
抄送: Liu, Monk
主题: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset

Signed-off-by: Monk Liu <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 158 ++++++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1 +
 2 files changed, 158 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e926f84..2b404ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1604,6 +1604,53 @@ int amdgpu_suspend(struct amdgpu_device *adev)
         return 0;
 }

+static int amdgpu_resume_early(struct amdgpu_device *adev)
+{
+       int i, r;
+
+       for (i = 0; i < adev->num_ip_blocks; i++) {
+               if (!adev->ip_blocks[i].status.valid)
+                       continue;
+
+               if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
+                       r = adev->ip_blocks[i].version->funcs->resume(adev);
+
+               if (r) {
+                       DRM_ERROR("resume of IP block <%s> failed %d\n",
+                                 adev->ip_blocks[i].version->funcs->name, r);
+                       return r;
+               }
+       }
+
+       return 0;
+}
+
+static int amdgpu_resume_late(struct amdgpu_device *adev)
+{
+       int i, r;
+
+       for (i = 0; i < adev->num_ip_blocks; i++) {
+               if (!adev->ip_blocks[i].status.valid)
+                       continue;
+
+               if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                               adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH )
+                       continue;
+
+               r = adev->ip_blocks[i].version->funcs->resume(adev);
+               if (r) {
+                       DRM_ERROR("resume of IP block <%s> failed %d\n",
+                                 adev->ip_blocks[i].version->funcs->name, r);
+                       return r;
+               }
+       }
+
+       return 0;
+}
+
 static int amdgpu_resume(struct amdgpu_device *adev)
 {
         int i, r;
@@ -2343,6 +2390,115 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
 }

 /**
+ * amdgpu_sriov_gpu_reset - reset the asic
+ *
+ * @adev: amdgpu device pointer
+ * @voluntary: if this reset is requested by guest.
+ *             (true means by guest and false means by HYPERVISOR )
+ *
+ * Attempt to reset the GPU if it has hung (all asics).
+ * for SRIOV case.
+ * Returns 0 for success or an error on failure.
+ */
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
+{
+       int i, r = 0;
+       int resched;
+       struct amdgpu_bo *bo, *tmp;
+       struct amdgpu_ring *ring;
+       struct fence *fence = NULL, *next = NULL;
+
+       mutex_lock(&adev->virt.lock_reset);
+       atomic_inc(&adev->gpu_reset_counter);
+
+       /* block TTM */
+       resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+
+       /* block scheduler */
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               ring = adev->rings[i];
+
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               kthread_park(ring->sched.thread);
+               amd_sched_hw_job_reset(&ring->sched);
+       }
+
+       /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
+       amdgpu_fence_driver_force_completion(adev);
+
+       /* request to take full control of GPU before re-initialization  */
+       if (voluntary)
+               amdgpu_virt_reset_gpu(adev);
+       else
+               amdgpu_virt_request_full_gpu(adev, true);
+
+
+       /* Resume IP prior to SMC */
+       amdgpu_resume_early(adev);
+
+       /* we need to recover gart prior to running SMC/CP/SDMA resume */
+       amdgpu_ttm_recover_gart(adev);
+
+       /* now we are okay to resume SMC/CP/SDMA */
+       amdgpu_resume_late(adev);


+
+       amdgpu_irq_gpu_reset_resume_helper(adev);
+
+       if (amdgpu_ib_ring_tests(adev))
+               dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
+
+       /* release full control of GPU after ib test */
+       amdgpu_virt_release_full_gpu(adev, true);
+
+       DRM_INFO("recover vram bo from shadow\n");
+
+       ring = adev->mman.buffer_funcs_ring;
+       mutex_lock(&adev->shadow_list_lock);
+       list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+               amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
+               if (fence) {
+                       r = fence_wait(fence, false);
+                       if (r) {
+                               WARN(r, "recovery from shadow isn't completed\n");
+                               break;
+                       }
+               }
+
+               fence_put(fence);
+               fence = next;
+       }
+       mutex_unlock(&adev->shadow_list_lock);
+
+       if (fence) {
+               r = fence_wait(fence, false);
+               if (r)
+                       WARN(r, "recovery from shadow isn't completed\n");
+       }
+       fence_put(fence);
+
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               struct amdgpu_ring *ring = adev->rings[i];
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               amd_sched_job_recovery(&ring->sched);
+               kthread_unpark(ring->sched.thread);
+       }
+
+       drm_helper_resume_force_mode(adev->ddev);
+       ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
+       if (r) {
+               /* bad news, how to tell it to userspace ? */
+               dev_info(adev->dev, "GPU reset failed\n");
+       }
+
+       mutex_unlock(&adev->virt.lock_reset);
+       return r;
+}
+
+/**
  * amdgpu_gpu_reset - reset the asic
  *
  * @adev: amdgpu device pointer
@@ -2358,7 +2514,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
         bool need_full_reset;

         if (amdgpu_sriov_vf(adev))
-               return 0;
+               return amdgpu_sriov_gpu_reset(adev, true);

         if (!amdgpu_check_soft_reset(adev)) {
                 DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 675e12c..73d24df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -89,5 +89,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary);

 #endif
--
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx



_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx






_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #1.2: Type: text/html, Size: 27184 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 51+ messages in thread

end of thread, other threads:[~2017-02-08 15:45 UTC | newest]

Thread overview: 51+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-02-07  6:10 [PATCH 01/20] drm/amdgpu:fix powerplay logic Monk Liu
     [not found] ` <1486447878-20521-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07  6:11   ` [PATCH 02/20] drm/amdgpu:cg & pg are not applied on VF Monk Liu
     [not found]     ` <1486447878-20521-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 15:27       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 03/20] drm/damdgpu:add new mqd member in ring Monk Liu
     [not found]     ` <1486447878-20521-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 15:29       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 04/20] drm/amdgpu:imple mqd soft ini/fini Monk Liu
     [not found]     ` <1486447878-20521-4-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 15:30       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 05/20] drm/amdgpu:bo_free_kernel will set ptr to NULL if freed Monk Liu
     [not found]     ` <1486447878-20521-5-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 15:31       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 06/20] drm/amdgpu:no need use sriov judge Monk Liu
     [not found]     ` <1486447878-20521-6-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 15:33       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 07/20] drm/amdgpu:minor cleanup Monk Liu
     [not found]     ` <1486447878-20521-7-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 15:34       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 08/20] drm/amdgpu:divide KCQ mqd init to sw and hw Monk Liu
     [not found]     ` <1486447878-20521-8-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 15:36       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset Monk Liu
     [not found]     ` <1486447878-20521-9-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07  6:26       ` 答复: " Liu, Monk
     [not found]         ` <DM5PR12MB16109EC5F03088C1CFB58FE484430-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-02-08 10:49           ` SPAM //答复: " Christian König
     [not found]             ` <5dfe222d-9564-0835-f749-3ea5ef78c701-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-02-08 14:57               ` 答复: " Liu, Monk
     [not found]                 ` <DM5PR12MB1610E1ADF3A9172B1A418C7184420-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-02-08 15:13                   ` Christian König
     [not found]                     ` <afe8b046-c0ba-586e-99c3-adef380cfd3f-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-02-08 15:17                       ` 答复: " Liu, Monk
     [not found]                         ` <DM5PR12MB161095B3EBE0DCC3CEFCBB8A84420-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-02-08 15:27                           ` Christian König
     [not found]                             ` <aea90712-dc2f-5bd5-ea8b-52b8688a44c9-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-02-08 15:40                               ` 答复: " Liu, Monk
     [not found]                                 ` <DM5PR12MB1610E5F6359C78E0740F428384420-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-02-08 15:45                                   ` Liu, Monk
2017-02-07 15:45       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 10/20] drm/amdgpu:change kiq lock name Monk Liu
     [not found]     ` <1486447878-20521-10-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 15:38       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 11/20] drm/amdgpu:add lock_reset for SRIOV Monk Liu
     [not found]     ` <1486447878-20521-11-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 15:47       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 12/20] drm/amdgpu:impl mm_r/weg_nokiq Monk Liu
     [not found]     ` <1486447878-20521-12-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 15:52       ` Deucher, Alexander
     [not found]         ` <BN6PR12MB1652FE39574BA6067AB64578F7430-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-02-08  7:30           ` Yu, Xiangliang
2017-02-07  6:11   ` [PATCH 13/20] Refine handshake between guest and host by mailbox Monk Liu
     [not found]     ` <1486447878-20521-13-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 15:40       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 14/20] drm/amdgpu:use nokiq version mm access Monk Liu
     [not found]     ` <1486447878-20521-14-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 15:54       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 15/20] drm/amdgpu:use work instead of delay-work Monk Liu
     [not found]     ` <1486447878-20521-15-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 16:10       ` Deucher, Alexander
     [not found]         ` <BN6PR12MB1652AA568136314BD28303BFF7430-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-02-08  6:28           ` Yu, Xiangliang
2017-02-08  7:45           ` Liu, Monk
2017-02-07  6:11   ` [PATCH 16/20] drm/amdgpu:RUNTIME flag should clr later Monk Liu
     [not found]     ` <1486447878-20521-16-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 15:56       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 17/20] drm/amdgpu:new field is_load_stage introduced Monk Liu
     [not found]     ` <1486447878-20521-17-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 16:08       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 18/20] drm/amdgpu:alloc mqd backup Monk Liu
     [not found]     ` <1486447878-20521-18-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 16:11       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 19/20] drm/amdgpu:use nop to clear ring buffer Monk Liu
     [not found]     ` <1486447878-20521-19-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 16:10       ` Deucher, Alexander
2017-02-07  6:11   ` [PATCH 20/20] drm/amdgpu:fix kiq_resume routine Monk Liu
     [not found]     ` <1486447878-20521-20-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-02-07 16:09       ` Deucher, Alexander
2017-02-07 15:27   ` [PATCH 01/20] drm/amdgpu:fix powerplay logic Deucher, Alexander

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.