[PATCH 1/3] drm/amdgpu: add gpu reset control for umc page retirement

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/3] drm/amdgpu: add gpu reset control for umc page retirement
@ 2021-12-10 11:15 Tao Zhou
  2021-12-10 11:15 ` [PATCH 2/3] drm/amdkfd: add reset parameter for unmap queues Tao Zhou
  2021-12-10 11:15 ` [PATCH 3/3] drm/amdkfd: reset queue which consumes RAS poison Tao Zhou
  0 siblings, 2 replies; 5+ messages in thread
From: Tao Zhou @ 2021-12-10 11:15 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, thomas.chai, Felix.Kuehling
  Cc: Tao Zhou

Add a reset parameter to umc page retirement so that the caller can decide
whether to trigger a gpu reset during umc page retirement.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 15 ++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  5 +++--
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 6e4bea012ea4..0c33f367a4e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -23,6 +23,13 @@
 
 #include "amdgpu_ras.h"
 
+static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
+		void *ras_error_status,
+		struct amdgpu_iv_entry *entry)
+{
+	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
+}
+
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
 {
 	int r;
@@ -88,9 +95,10 @@ void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
 	}
 }
 
-int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
+int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 		void *ras_error_status,
-		struct amdgpu_iv_entry *entry)
+		struct amdgpu_iv_entry *entry,
+		bool reset)
 {
 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -164,7 +172,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 				adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
 		}
 
-		amdgpu_ras_reset_gpu(adev);
+		if (reset)
+			amdgpu_ras_reset_gpu(adev);
 	}
 
 	kfree(err_data->err_addr);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 9e40bade0a68..8d18d5121f66 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -78,9 +78,10 @@ struct amdgpu_umc {
 
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev);
 void amdgpu_umc_ras_fini(struct amdgpu_device *adev);
-int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
+int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 		void *ras_error_status,
-		struct amdgpu_iv_entry *entry);
+		struct amdgpu_iv_entry *entry,
+		bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 		struct amdgpu_irq_src *source,
 		struct amdgpu_iv_entry *entry);
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/3] drm/amdkfd: add reset parameter for unmap queues
  2021-12-10 11:15 [PATCH 1/3] drm/amdgpu: add gpu reset control for umc page retirement Tao Zhou
@ 2021-12-10 11:15 ` Tao Zhou
  2021-12-10 11:15 ` [PATCH 3/3] drm/amdkfd: reset queue which consumes RAS poison Tao Zhou
  1 sibling, 0 replies; 5+ messages in thread
From: Tao Zhou @ 2021-12-10 11:15 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, thomas.chai, Felix.Kuehling
  Cc: Tao Zhou

Add a reset parameter to unmap_queues_cpsch so that the reset mode can be set for the unmap operation. No functional change.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index dd0b952f0173..01a2cc3928ac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -47,7 +47,7 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
 				uint32_t filter_param);
 static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param);
+				uint32_t filter_param, bool reset);
 
 static int map_queues_cpsch(struct device_queue_manager *dqm);
 
@@ -570,7 +570,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
 	/* Make sure the queue is unmapped before updating the MQD */
 	if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) {
 		retval = unmap_queues_cpsch(dqm,
-				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false);
 		if (retval) {
 			pr_err("unmap queue failed\n");
 			goto out_unlock;
@@ -1223,7 +1223,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
 	}
 
 	if (!dqm->is_hws_hang)
-		unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+		unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false);
 	hanging = dqm->is_hws_hang || dqm->is_resetting;
 	dqm->sched_running = false;
 
@@ -1419,7 +1419,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
 /* dqm->lock mutex has to be locked before calling this function */
 static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param)
+				uint32_t filter_param, bool reset)
 {
 	int retval = 0;
 	struct mqd_manager *mqd_mgr;
@@ -1432,7 +1432,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 		return retval;
 
 	retval = pm_send_unmap_queue(&dqm->packet_mgr, KFD_QUEUE_TYPE_COMPUTE,
-			filter, filter_param, false, 0);
+			filter, filter_param, reset, 0);
 	if (retval)
 		return retval;
 
@@ -1485,7 +1485,7 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
 
 	if (dqm->is_hws_hang)
 		return -EIO;
-	retval = unmap_queues_cpsch(dqm, filter, filter_param);
+	retval = unmap_queues_cpsch(dqm, filter, filter_param, false);
 	if (retval)
 		return retval;
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/3] drm/amdkfd: reset queue which consumes RAS poison
  2021-12-10 11:15 [PATCH 1/3] drm/amdgpu: add gpu reset control for umc page retirement Tao Zhou
  2021-12-10 11:15 ` [PATCH 2/3] drm/amdkfd: add reset parameter for unmap queues Tao Zhou
@ 2021-12-10 11:15 ` Tao Zhou
  2021-12-10 23:30   ` Felix Kuehling
  2021-12-13 10:43   ` Zhang, Hawking
  1 sibling, 2 replies; 5+ messages in thread
From: Tao Zhou @ 2021-12-10 11:15 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, thomas.chai, Felix.Kuehling
  Cc: Tao Zhou

CP supports unmapping queues in reset mode, which destroys only the specified
queues without affecting others. Replacing a whole gpu reset with a queue reset
for RAS poison consumption saves much time, and we can still fall back to the
gpu reset solution if the queue reset fails.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  6 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  3 ++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 14 ++++++++++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 27 ++++++++++++++++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 ++
 6 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 46cf48b3904a..0bf09a94d944 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -721,13 +721,13 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
 	return adev->have_atomics_support;
 }
 
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev)
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
 {
 	struct ras_err_data err_data = {0, 0, 0, NULL};
 
 	/* CPU MCA will handle page retirement if connected_to_cpu is 1 */
 	if (!adev->gmc.xgmi.connected_to_cpu)
-		amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
-	else
+		amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+	else if (reset)
 		amdgpu_amdkfd_gpu_reset(adev);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index fcbc8a9c9e06..61f899e54fd5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -296,7 +296,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
 				      uint64_t *mmap_offset);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
 				struct tile_config *config);
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev);
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
+				bool reset);
 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
 void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 01a2cc3928ac..095b2e0822aa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1476,6 +1476,20 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 	return retval;
 }
 
+int unmap_queues_cpsch_poison(struct device_queue_manager *dqm, uint32_t pasid)
+{
+	int ret;
+
+	dqm_lock(dqm);
+
+	ret = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
+			pasid, true);
+
+	dqm_unlock(dqm);
+
+	return ret;
+}
+
 /* dqm->lock mutex has to be locked before calling this function */
 static int execute_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 499fc0ea387f..c52869133159 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -219,6 +219,7 @@ unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
 unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
 unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);
 unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm);
+int unmap_queues_cpsch_poison(struct device_queue_manager *dqm, uint32_t pasid);
 
 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index deb64168c9e8..2863bb9e5bca 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -89,6 +89,27 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 #define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
 #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
 
+static void event_interrupt_poison_consumption(struct kfd_dev *dev,
+				uint16_t pasid)
+{
+	int ret;
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+	/* all queues of a process will be unmapped in one time */
+	if (p && atomic_read(&p->poison))
+		return;
+
+	atomic_set(&p->poison, 1);
+	ret = unmap_queues_cpsch_poison(dev->dqm, pasid);
+	kfd_signal_poison_consumed_event(dev, pasid);
+	/* resetting queue passes, do page retirement without gpu reset
+	   resetting queue fails, fallback to gpu reset solution */
+	if (!ret)
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+	else
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+}
+
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry,
 					uint32_t *patched_ihre,
@@ -230,8 +251,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 					sq_intr_err);
 				if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
 					sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
-					kfd_signal_poison_consumed_event(dev, pasid);
-					amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+					event_interrupt_poison_consumption(dev, pasid);
 					return;
 				}
 				break;
@@ -252,8 +272,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		if (source_id == SOC15_INTSRC_SDMA_TRAP) {
 			kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
 		} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
-			kfd_signal_poison_consumed_event(dev, pasid);
-			amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+			event_interrupt_poison_consumption(dev, pasid);
 			return;
 		}
 	} else if (client_id == SOC15_IH_CLIENTID_VMC ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 0c3f911e3bf4..ea68f3b3a4e9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -856,6 +856,8 @@ struct kfd_process {
 	struct svm_range_list svms;
 
 	bool xnack_enabled;
+
+	atomic_t poison;
 };
 
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 3/3] drm/amdkfd: reset queue which consumes RAS poison
  2021-12-10 11:15 ` [PATCH 3/3] drm/amdkfd: reset queue which consumes RAS poison Tao Zhou
@ 2021-12-10 23:30   ` Felix Kuehling
  2021-12-13 10:43   ` Zhang, Hawking
  1 sibling, 0 replies; 5+ messages in thread
From: Felix Kuehling @ 2021-12-10 23:30 UTC (permalink / raw)
  To: Tao Zhou, amd-gfx, hawking.zhang, stanley.yang, thomas.chai


On 2021-12-10 6:15 a.m., Tao Zhou wrote:
> CP supports unmap queue with reset mode which only destroys specific queue without affecting others.
> Replacing whole gpu reset with reset queue mode for RAS poison consumption
> saves much time, and we can also fallback to gpu reset solution if reset
> queue fails.
>
> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  6 ++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  3 ++-
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 14 ++++++++++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
>   .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 27 ++++++++++++++++---
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 ++
>   6 files changed, 45 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 46cf48b3904a..0bf09a94d944 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -721,13 +721,13 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
>   	return adev->have_atomics_support;
>   }
>   
> -void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev)
> +void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
>   {
>   	struct ras_err_data err_data = {0, 0, 0, NULL};
>   
>   	/* CPU MCA will handle page retirement if connected_to_cpu is 1 */
>   	if (!adev->gmc.xgmi.connected_to_cpu)
> -		amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
> -	else
> +		amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
> +	else if (reset)
>   		amdgpu_amdkfd_gpu_reset(adev);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index fcbc8a9c9e06..61f899e54fd5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -296,7 +296,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
>   				      uint64_t *mmap_offset);
>   int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
>   				struct tile_config *config);
> -void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev);
> +void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
> +				bool reset);
>   #if IS_ENABLED(CONFIG_HSA_AMD)
>   void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
>   void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 01a2cc3928ac..095b2e0822aa 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1476,6 +1476,20 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   	return retval;
>   }
>   
> +int unmap_queues_cpsch_poison(struct device_queue_manager *dqm, uint32_t pasid)
> +{
> +	int ret;
> +
> +	dqm_lock(dqm);
> +
> +	ret = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
> +			pasid, true);
> +
> +	dqm_unlock(dqm);
> +
> +	return ret;
> +}
> +
>   /* dqm->lock mutex has to be locked before calling this function */
>   static int execute_queues_cpsch(struct device_queue_manager *dqm,
>   				enum kfd_unmap_queues_filter filter,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 499fc0ea387f..c52869133159 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -219,6 +219,7 @@ unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
>   unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
>   unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);
>   unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm);
> +int unmap_queues_cpsch_poison(struct device_queue_manager *dqm, uint32_t pasid);
>   
>   static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index deb64168c9e8..2863bb9e5bca 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -89,6 +89,27 @@ enum SQ_INTERRUPT_ERROR_TYPE {
>   #define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
>   #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
>   
> +static void event_interrupt_poison_consumption(struct kfd_dev *dev,
> +				uint16_t pasid)
> +{
> +	int ret;
> +	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
> +
> +	/* all queues of a process will be unmapped in one time */
> +	if (p && atomic_read(&p->poison))
> +		return;
> +
> +	atomic_set(&p->poison, 1);

You're not checking p != NULL here.

You also need to release the process refcount before this function
returns. Otherwise the process resources will be leaked. You can see
leaked processes in /sys/class/kfd/kfd/proc. That directory should be
empty after all KFD processes have terminated.

Other than that, the series is

Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>


> +	ret = unmap_queues_cpsch_poison(dev->dqm, pasid);
> +	kfd_signal_poison_consumed_event(dev, pasid);
> +	/* resetting queue passes, do page retirement without gpu reset
> +	   resetting queue fails, fallback to gpu reset solution */
> +	if (!ret)
> +		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
> +	else
> +		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
> +}
> +
>   static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>   					const uint32_t *ih_ring_entry,
>   					uint32_t *patched_ihre,
> @@ -230,8 +251,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>   					sq_intr_err);
>   				if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
>   					sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
> -					kfd_signal_poison_consumed_event(dev, pasid);
> -					amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
> +					event_interrupt_poison_consumption(dev, pasid);
>   					return;
>   				}
>   				break;
> @@ -252,8 +272,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>   		if (source_id == SOC15_INTSRC_SDMA_TRAP) {
>   			kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
>   		} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
> -			kfd_signal_poison_consumed_event(dev, pasid);
> -			amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
> +			event_interrupt_poison_consumption(dev, pasid);
>   			return;
>   		}
>   	} else if (client_id == SOC15_IH_CLIENTID_VMC ||
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 0c3f911e3bf4..ea68f3b3a4e9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -856,6 +856,8 @@ struct kfd_process {
>   	struct svm_range_list svms;
>   
>   	bool xnack_enabled;
> +
> +	atomic_t poison;
>   };
>   
>   #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH 3/3] drm/amdkfd: reset queue which consumes RAS poison
  2021-12-10 11:15 ` [PATCH 3/3] drm/amdkfd: reset queue which consumes RAS poison Tao Zhou
  2021-12-10 23:30   ` Felix Kuehling
@ 2021-12-13 10:43   ` Zhang, Hawking
  1 sibling, 0 replies; 5+ messages in thread
From: Zhang, Hawking @ 2021-12-13 10:43 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Yang, Stanley, thomas.chai, Kuehling, Felix

[AMD Official Use Only]

                } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
-                       kfd_signal_poison_consumed_event(dev, pasid);
-                       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+                       event_interrupt_poison_consumption(dev, pasid);
                        return;
                }

SDMA shouldn't go through the same handler; it needs a separate one (i.e. re-initialize the RB).

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Friday, December 10, 2021 19:15
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; thomas.chai@amd.com; Kuehling, Felix <Felix.Kuehling@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 3/3] drm/amdkfd: reset queue which consumes RAS poison

CP supports unmap queue with reset mode which only destroys specific queue without affecting others.
Replacing whole gpu reset with reset queue mode for RAS poison consumption saves much time, and we can also fallback to gpu reset solution if reset queue fails.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  6 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  3 ++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 14 ++++++++++  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 27 ++++++++++++++++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 ++
 6 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 46cf48b3904a..0bf09a94d944 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -721,13 +721,13 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
        return adev->have_atomics_support;
 }

-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev)
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device
+*adev, bool reset)
 {
        struct ras_err_data err_data = {0, 0, 0, NULL};

        /* CPU MCA will handle page retirement if connected_to_cpu is 1 */
        if (!adev->gmc.xgmi.connected_to_cpu)
-               amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
-       else
+               amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+       else if (reset)
                amdgpu_amdkfd_gpu_reset(adev);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index fcbc8a9c9e06..61f899e54fd5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -296,7 +296,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
                                      uint64_t *mmap_offset);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
                                struct tile_config *config);
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev);
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
+                               bool reset);
 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
 void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 01a2cc3928ac..095b2e0822aa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1476,6 +1476,20 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
        return retval;
 }

+int unmap_queues_cpsch_poison(struct device_queue_manager *dqm,
+uint32_t pasid) {
+       int ret;
+
+       dqm_lock(dqm);
+
+       ret = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
+                       pasid, true);
+
+       dqm_unlock(dqm);
+
+       return ret;
+}
+
 /* dqm->lock mutex has to be locked before calling this function */  static int execute_queues_cpsch(struct device_queue_manager *dqm,
                                enum kfd_unmap_queues_filter filter, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 499fc0ea387f..c52869133159 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -219,6 +219,7 @@ unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);  unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);  unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);  unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm);
+int unmap_queues_cpsch_poison(struct device_queue_manager *dqm,
+uint32_t pasid);

 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)  { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index deb64168c9e8..2863bb9e5bca 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -89,6 +89,27 @@ enum SQ_INTERRUPT_ERROR_TYPE {  #define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000  #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20

+static void event_interrupt_poison_consumption(struct kfd_dev *dev,
+                               uint16_t pasid)
+{
+       int ret;
+       struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+       /* all queues of a process will be unmapped in one time */
+       if (p && atomic_read(&p->poison))
+               return;
+
+       atomic_set(&p->poison, 1);
+       ret = unmap_queues_cpsch_poison(dev->dqm, pasid);
+       kfd_signal_poison_consumed_event(dev, pasid);
+       /* resetting queue passes, do page retirement without gpu reset
+          resetting queue fails, fallback to gpu reset solution */
+       if (!ret)
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+       else
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true); }
+
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
                                        const uint32_t *ih_ring_entry,
                                        uint32_t *patched_ihre,
@@ -230,8 +251,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                                        sq_intr_err);
                                if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
                                        sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
-                                       kfd_signal_poison_consumed_event(dev, pasid);
-                                       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+                                       event_interrupt_poison_consumption(dev, pasid);
                                        return;
                                }
                                break;
@@ -252,8 +272,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                if (source_id == SOC15_INTSRC_SDMA_TRAP) {
                        kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
                } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
-                       kfd_signal_poison_consumed_event(dev, pasid);
-                       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+                       event_interrupt_poison_consumption(dev, pasid);
                        return;
                }
        } else if (client_id == SOC15_IH_CLIENTID_VMC || diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 0c3f911e3bf4..ea68f3b3a4e9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -856,6 +856,8 @@ struct kfd_process {
        struct svm_range_list svms;

        bool xnack_enabled;
+
+       atomic_t poison;
 };

 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
--
2.17.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-12-13 10:43 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-10 11:15 [PATCH 1/3] drm/amdgpu: add gpu reset control for umc page retirement Tao Zhou
2021-12-10 11:15 ` [PATCH 2/3] drm/amdkfd: add reset parameter for unmap queues Tao Zhou
2021-12-10 11:15 ` [PATCH 3/3] drm/amdkfd: reset queue which consumes RAS poison Tao Zhou
2021-12-10 23:30   ` Felix Kuehling
2021-12-13 10:43   ` Zhang, Hawking

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.