All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] drm/amdkfd: Only initialize sdma vm for sdma queues
@ 2019-06-03 17:50 Zeng, Oak
       [not found] ` <1559584248-12115-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Zeng, Oak @ 2019-06-03 17:50 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Kuehling, Felix, Zeng, Oak

Don't do the same for compute queues

Change-Id: Id5f743ca10c2b761590bfe18cab2f802d3c04d2d
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index ece35c7..e5cbf21 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1206,7 +1206,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 		q->properties.is_evicted = (q->properties.queue_size > 0 &&
 					    q->properties.queue_percent > 0 &&
 					    q->properties.queue_address != 0);
-	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+		dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
 	q->properties.tba_addr = qpd->tba_addr;
 	q->properties.tma_addr = qpd->tma_addr;
 	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/5] drm/amdkfd: Only load sdma mqd when queue is active
       [not found] ` <1559584248-12115-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
@ 2019-06-03 17:51   ` Zeng, Oak
  2019-06-03 17:51   ` [PATCH 3/5] drm/amdkfd: Refactor create_queue_nocpsch Zeng, Oak
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 8+ messages in thread
From: Zeng, Oak @ 2019-06-03 17:51 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Kuehling, Felix, Zeng, Oak

Also call load_mqd with the current->mm struct. The mm
struct is used to read back the user wptr of the queue.

Change-Id: I0f6d085878358dcd3a413054dbe61d1ca0fdf66d
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index e5cbf21..dc1a70b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -990,8 +990,11 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
 	if (retval)
 		goto out_deallocate_doorbell;
 
+	if (!q->properties.is_active)
+		return 0;
+
 	retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties,
-				NULL);
+				current->mm);
 	if (retval)
 		goto out_uninit_mqd;
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 3/5] drm/amdkfd: Refactor create_queue_nocpsch
       [not found] ` <1559584248-12115-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
  2019-06-03 17:51   ` [PATCH 2/5] drm/amdkfd: Only load sdma mqd when queue is active Zeng, Oak
@ 2019-06-03 17:51   ` Zeng, Oak
       [not found]     ` <1559584248-12115-3-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
  2019-06-03 17:51   ` [PATCH 4/5] drm/amdkfd: Fix a circular lock dependency Zeng, Oak
  2019-06-03 17:51   ` [PATCH 5/5] drm/amdkfd: Fix sdma queue allocate race condition Zeng, Oak
  3 siblings, 1 reply; 8+ messages in thread
From: Zeng, Oak @ 2019-06-03 17:51 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Kuehling, Felix, Zeng, Oak

This is preparation work to fix a circular lock dependency.
No logic change.

Change-Id: I4e0ee918260e7780de972dd71f4ce787b4f6dde9
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 171 +++++++--------------
 1 file changed, 58 insertions(+), 113 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index dc1a70b..06654de 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -42,10 +42,6 @@
 static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
 					unsigned int pasid, unsigned int vmid);
 
-static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
-					struct queue *q,
-					struct qcm_process_device *qpd);
-
 static int execute_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
 				uint32_t filter_param);
@@ -55,13 +51,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 
 static int map_queues_cpsch(struct device_queue_manager *dqm);
 
-static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
-					struct queue *q,
-					struct qcm_process_device *qpd);
-
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
 				struct queue *q);
 
+static inline void deallocate_hqd(struct device_queue_manager *dqm,
+				struct queue *q);
+static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
+static int allocate_sdma_queue(struct device_queue_manager *dqm,
+				struct queue *q);
 static void kfd_process_hw_exception(struct work_struct *work);
 
 static inline
@@ -269,6 +266,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 				struct queue *q,
 				struct qcm_process_device *qpd)
 {
+	struct mqd_manager *mqd_mgr;
 	int retval;
 
 	print_queue(q);
@@ -300,18 +298,46 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 	q->properties.tba_addr = qpd->tba_addr;
 	q->properties.tma_addr = qpd->tma_addr;
 
+	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+			q->properties.type)];
+	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
+		retval = allocate_hqd(dqm, q);
+		if (retval)
+			goto deallocate_vmid;
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		retval = allocate_sdma_queue(dqm, q);
+		if (retval)
+			goto deallocate_vmid;
+		dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+	}
+
+	retval = allocate_doorbell(qpd, q);
+	if (retval)
+		goto out_deallocate_hqd;
+
+	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
+				&q->gart_mqd_addr, &q->properties);
+	if (retval)
+		goto out_deallocate_doorbell;
+
+	pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
+			q->pipe, q->queue);
+
 	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
-		retval = create_compute_queue_nocpsch(dqm, q, qpd);
-	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
-			q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
-		retval = create_sdma_queue_nocpsch(dqm, q, qpd);
-	else
-		retval = -EINVAL;
+		dqm->dev->kfd2kgd->set_scratch_backing_va(
+			dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
 
-	if (retval) {
-		if (list_empty(&qpd->queues_list))
-			deallocate_vmid(dqm, qpd, q);
-		goto out_unlock;
+	if (q->properties.is_active) {
+
+		if (WARN(q->process->mm != current->mm,
+					"should only run in user thread"))
+			retval = -EFAULT;
+		else
+			retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
+					q->queue, &q->properties, current->mm);
+		if (retval)
+			goto out_uninit_mqd;
 	}
 
 	list_add(&q->list, &qpd->queues_list);
@@ -331,7 +357,21 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 	dqm->total_queue_count++;
 	pr_debug("Total of %d queues are accountable so far\n",
 			dqm->total_queue_count);
+	goto out_unlock;
 
+out_uninit_mqd:
+	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+out_deallocate_doorbell:
+	deallocate_doorbell(qpd, q);
+out_deallocate_hqd:
+	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
+		deallocate_hqd(dqm, q);
+	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+		deallocate_sdma_queue(dqm, q);
+deallocate_vmid:
+	if (list_empty(&qpd->queues_list))
+		deallocate_vmid(dqm, qpd, q);
 out_unlock:
 	dqm_unlock(dqm);
 	return retval;
@@ -377,58 +417,6 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm,
 	dqm->allocated_queues[q->pipe] |= (1 << q->queue);
 }
 
-static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
-					struct queue *q,
-					struct qcm_process_device *qpd)
-{
-	struct mqd_manager *mqd_mgr;
-	int retval;
-
-	mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE];
-
-	retval = allocate_hqd(dqm, q);
-	if (retval)
-		return retval;
-
-	retval = allocate_doorbell(qpd, q);
-	if (retval)
-		goto out_deallocate_hqd;
-
-	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
-				&q->gart_mqd_addr, &q->properties);
-	if (retval)
-		goto out_deallocate_doorbell;
-
-	pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
-			q->pipe, q->queue);
-
-	dqm->dev->kfd2kgd->set_scratch_backing_va(
-			dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
-
-	if (!q->properties.is_active)
-		return 0;
-
-	if (WARN(q->process->mm != current->mm,
-		 "should only run in user thread"))
-		retval = -EFAULT;
-	else
-		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
-					   &q->properties, current->mm);
-	if (retval)
-		goto out_uninit_mqd;
-
-	return 0;
-
-out_uninit_mqd:
-	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-out_deallocate_doorbell:
-	deallocate_doorbell(qpd, q);
-out_deallocate_hqd:
-	deallocate_hqd(dqm, q);
-
-	return retval;
-}
-
 /* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked
  * to avoid asynchronized access
  */
@@ -967,49 +955,6 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
 	}
 }
 
-static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
-					struct queue *q,
-					struct qcm_process_device *qpd)
-{
-	struct mqd_manager *mqd_mgr;
-	int retval;
-
-	mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA];
-
-	retval = allocate_sdma_queue(dqm, q);
-	if (retval)
-		return retval;
-
-	retval = allocate_doorbell(qpd, q);
-	if (retval)
-		goto out_deallocate_sdma_queue;
-
-	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
-	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
-				&q->gart_mqd_addr, &q->properties);
-	if (retval)
-		goto out_deallocate_doorbell;
-
-	if (!q->properties.is_active)
-		return 0;
-
-	retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties,
-				current->mm);
-	if (retval)
-		goto out_uninit_mqd;
-
-	return 0;
-
-out_uninit_mqd:
-	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-out_deallocate_doorbell:
-	deallocate_doorbell(qpd, q);
-out_deallocate_sdma_queue:
-	deallocate_sdma_queue(dqm, q);
-
-	return retval;
-}
-
 /*
  * Device Queue Manager implementation for cp scheduler
  */
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 4/5] drm/amdkfd: Fix a circular lock dependency
       [not found] ` <1559584248-12115-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
  2019-06-03 17:51   ` [PATCH 2/5] drm/amdkfd: Only load sdma mqd when queue is active Zeng, Oak
  2019-06-03 17:51   ` [PATCH 3/5] drm/amdkfd: Refactor create_queue_nocpsch Zeng, Oak
@ 2019-06-03 17:51   ` Zeng, Oak
       [not found]     ` <1559584248-12115-4-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
  2019-06-03 17:51   ` [PATCH 5/5] drm/amdkfd: Fix sdma queue allocate race condition Zeng, Oak
  3 siblings, 1 reply; 8+ messages in thread
From: Zeng, Oak @ 2019-06-03 17:51 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Kuehling, Felix, Zeng, Oak

The idea to break the circular lock dependency is
to move init_mqd out of the protection of the dqm lock
in callstack #1 below. There is no need to hold the dqm
lock while initializing the MQD.

[   59.510149] [drm] Initialized amdgpu 3.30.0 20150101 for 0000:04:00.0 on minor 0

[  513.604034] ======================================================
[  513.604205] WARNING: possible circular locking dependency detected
[  513.604375] 4.18.0-kfd-root #2 Tainted: G        W
[  513.604530] ------------------------------------------------------
[  513.604699] kswapd0/611 is trying to acquire lock:
[  513.604840] 00000000d254022e (&dqm->lock_hidden){+.+.}, at: evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.605150]
               but task is already holding lock:
[  513.605307] 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250
[  513.605540]
               which lock already depends on the new lock.

[  513.605747]
               the existing dependency chain (in reverse order) is:
[  513.605944]
               -> #4 (&anon_vma->rwsem){++++}:
[  513.606106]        __vma_adjust+0x147/0x7f0
[  513.606231]        __split_vma+0x179/0x190
[  513.606353]        mprotect_fixup+0x217/0x260
[  513.606553]        do_mprotect_pkey+0x211/0x380
[  513.606752]        __x64_sys_mprotect+0x1b/0x20
[  513.606954]        do_syscall_64+0x50/0x1a0
[  513.607149]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  513.607380]
               -> #3 (&mapping->i_mmap_rwsem){++++}:
[  513.607678]        rmap_walk_file+0x1f0/0x280
[  513.607887]        page_referenced+0xdd/0x180
[  513.608081]        shrink_page_list+0x853/0xcb0
[  513.608279]        shrink_inactive_list+0x33b/0x700
[  513.608483]        shrink_node_memcg+0x37a/0x7f0
[  513.608682]        shrink_node+0xd8/0x490
[  513.608869]        balance_pgdat+0x18b/0x3b0
[  513.609062]        kswapd+0x203/0x5c0
[  513.609241]        kthread+0x100/0x140
[  513.609420]        ret_from_fork+0x24/0x30
[  513.609607]
               -> #2 (fs_reclaim){+.+.}:
[  513.609883]        kmem_cache_alloc_trace+0x34/0x2e0
[  513.610093]        reservation_object_reserve_shared+0x139/0x300
[  513.610326]        ttm_bo_init_reserved+0x291/0x480 [ttm]
[  513.610567]        amdgpu_bo_do_create+0x1d2/0x650 [amdgpu]
[  513.610811]        amdgpu_bo_create+0x40/0x1f0 [amdgpu]
[  513.611041]        amdgpu_bo_create_reserved+0x249/0x2d0 [amdgpu]
[  513.611290]        amdgpu_bo_create_kernel+0x12/0x70 [amdgpu]
[  513.611584]        amdgpu_ttm_init+0x2cb/0x560 [amdgpu]
[  513.611823]        gmc_v9_0_sw_init+0x400/0x750 [amdgpu]
[  513.612491]        amdgpu_device_init+0x14eb/0x1990 [amdgpu]
[  513.612730]        amdgpu_driver_load_kms+0x78/0x290 [amdgpu]
[  513.612958]        drm_dev_register+0x111/0x1a0
[  513.613171]        amdgpu_pci_probe+0x11c/0x1e0 [amdgpu]
[  513.613389]        local_pci_probe+0x3f/0x90
[  513.613581]        pci_device_probe+0x102/0x1c0
[  513.613779]        driver_probe_device+0x2a7/0x480
[  513.613984]        __driver_attach+0x10a/0x110
[  513.614179]        bus_for_each_dev+0x67/0xc0
[  513.614372]        bus_add_driver+0x1eb/0x260
[  513.614565]        driver_register+0x5b/0xe0
[  513.614756]        do_one_initcall+0xac/0x357
[  513.614952]        do_init_module+0x5b/0x213
[  513.615145]        load_module+0x2542/0x2d30
[  513.615337]        __do_sys_finit_module+0xd2/0x100
[  513.615541]        do_syscall_64+0x50/0x1a0
[  513.615731]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  513.615963]
               -> #1 (reservation_ww_class_mutex){+.+.}:
[  513.616293]        amdgpu_amdkfd_alloc_gtt_mem+0xcf/0x2c0 [amdgpu]
[  513.616554]        init_mqd+0x223/0x260 [amdgpu]
[  513.616779]        create_queue_nocpsch+0x4d9/0x600 [amdgpu]
[  513.617031]        pqm_create_queue+0x37c/0x520 [amdgpu]
[  513.617270]        kfd_ioctl_create_queue+0x2f9/0x650 [amdgpu]
[  513.617522]        kfd_ioctl+0x202/0x350 [amdgpu]
[  513.617724]        do_vfs_ioctl+0x9f/0x6c0
[  513.617914]        ksys_ioctl+0x66/0x70
[  513.618095]        __x64_sys_ioctl+0x16/0x20
[  513.618286]        do_syscall_64+0x50/0x1a0
[  513.618476]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  513.618695]
               -> #0 (&dqm->lock_hidden){+.+.}:
[  513.618984]        __mutex_lock+0x98/0x970
[  513.619197]        evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.619459]        kfd_process_evict_queues+0x3b/0xb0 [amdgpu]
[  513.619710]        kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu]
[  513.620103]        amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu]
[  513.620363]        amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu]
[  513.620614]        __mmu_notifier_invalidate_range_start+0x70/0xb0
[  513.620851]        try_to_unmap_one+0x7fc/0x8f0
[  513.621049]        rmap_walk_anon+0x121/0x290
[  513.621242]        try_to_unmap+0x93/0xf0
[  513.621428]        shrink_page_list+0x606/0xcb0
[  513.621625]        shrink_inactive_list+0x33b/0x700
[  513.621835]        shrink_node_memcg+0x37a/0x7f0
[  513.622034]        shrink_node+0xd8/0x490
[  513.622219]        balance_pgdat+0x18b/0x3b0
[  513.622410]        kswapd+0x203/0x5c0
[  513.622589]        kthread+0x100/0x140
[  513.622769]        ret_from_fork+0x24/0x30
[  513.622957]
               other info that might help us debug this:

[  513.623354] Chain exists of:
                 &dqm->lock_hidden --> &mapping->i_mmap_rwsem --> &anon_vma->rwsem

[  513.623900]  Possible unsafe locking scenario:

[  513.624189]        CPU0                    CPU1
[  513.624397]        ----                    ----
[  513.624594]   lock(&anon_vma->rwsem);
[  513.624771]                                lock(&mapping->i_mmap_rwsem);
[  513.625020]                                lock(&anon_vma->rwsem);
[  513.625253]   lock(&dqm->lock_hidden);
[  513.625433]
                *** DEADLOCK ***

[  513.625783] 3 locks held by kswapd0/611:
[  513.625967]  #0: 00000000f14edf84 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30
[  513.626309]  #1: 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250
[  513.626671]  #2: 0000000067b5cd12 (srcu){....}, at: __mmu_notifier_invalidate_range_start+0x5/0xb0
[  513.627037]
               stack backtrace:
[  513.627292] CPU: 0 PID: 611 Comm: kswapd0 Tainted: G        W         4.18.0-kfd-root #2
[  513.627632] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
[  513.627990] Call Trace:
[  513.628143]  dump_stack+0x7c/0xbb
[  513.628315]  print_circular_bug.isra.37+0x21b/0x228
[  513.628581]  __lock_acquire+0xf7d/0x1470
[  513.628782]  ? unwind_next_frame+0x6c/0x4f0
[  513.628974]  ? lock_acquire+0xec/0x1e0
[  513.629154]  lock_acquire+0xec/0x1e0
[  513.629357]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.629587]  __mutex_lock+0x98/0x970
[  513.629790]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.630047]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.630309]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.630562]  evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.630816]  kfd_process_evict_queues+0x3b/0xb0 [amdgpu]
[  513.631057]  kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu]
[  513.631288]  amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu]
[  513.631536]  amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu]
[  513.632076]  __mmu_notifier_invalidate_range_start+0x70/0xb0
[  513.632299]  try_to_unmap_one+0x7fc/0x8f0
[  513.632487]  ? page_lock_anon_vma_read+0x68/0x250
[  513.632690]  rmap_walk_anon+0x121/0x290
[  513.632875]  try_to_unmap+0x93/0xf0
[  513.633050]  ? page_remove_rmap+0x330/0x330
[  513.633239]  ? rcu_read_unlock+0x60/0x60
[  513.633422]  ? page_get_anon_vma+0x160/0x160
[  513.633613]  shrink_page_list+0x606/0xcb0
[  513.633800]  shrink_inactive_list+0x33b/0x700
[  513.633997]  shrink_node_memcg+0x37a/0x7f0
[  513.634186]  ? shrink_node+0xd8/0x490
[  513.634363]  shrink_node+0xd8/0x490
[  513.634537]  balance_pgdat+0x18b/0x3b0
[  513.634718]  kswapd+0x203/0x5c0
[  513.634887]  ? wait_woken+0xb0/0xb0
[  513.635062]  kthread+0x100/0x140
[  513.635231]  ? balance_pgdat+0x3b0/0x3b0
[  513.635414]  ? kthread_delayed_work_timer_fn+0x80/0x80
[  513.635626]  ret_from_fork+0x24/0x30
[  513.636042] Evicting PASID 32768 queues
[  513.936236] Restoring PASID 32768 queues
[  524.708912] Evicting PASID 32768 queues
[  524.999875] Restoring PASID 32768 queues

Change-Id: Ib40b5fb8d7c6572888a0c879a315b382eb948647
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 06654de..39e2d6d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -271,19 +271,17 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 
 	print_queue(q);
 
-	dqm_lock(dqm);
-
 	if (dqm->total_queue_count >= max_num_of_queues_per_device) {
 		pr_warn("Can't create new usermode queue because %d queues were already created\n",
 				dqm->total_queue_count);
 		retval = -EPERM;
-		goto out_unlock;
+		goto out;
 	}
 
 	if (list_empty(&qpd->queues_list)) {
 		retval = allocate_vmid(dqm, qpd, q);
 		if (retval)
-			goto out_unlock;
+			goto out;
 	}
 	q->properties.vmid = qpd->vmid;
 	/*
@@ -340,6 +338,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 			goto out_uninit_mqd;
 	}
 
+	dqm_lock(dqm);
 	list_add(&q->list, &qpd->queues_list);
 	qpd->queue_count++;
 	if (q->properties.is_active)
@@ -357,7 +356,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 	dqm->total_queue_count++;
 	pr_debug("Total of %d queues are accountable so far\n",
 			dqm->total_queue_count);
-	goto out_unlock;
+	dqm_unlock(dqm);
+	return retval;
 
 out_uninit_mqd:
 	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
@@ -372,8 +372,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 deallocate_vmid:
 	if (list_empty(&qpd->queues_list))
 		deallocate_vmid(dqm, qpd, q);
-out_unlock:
-	dqm_unlock(dqm);
+out:
 	return retval;
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 5/5] drm/amdkfd: Fix sdma queue allocate race condition
       [not found] ` <1559584248-12115-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
                     ` (2 preceding siblings ...)
  2019-06-03 17:51   ` [PATCH 4/5] drm/amdkfd: Fix a circular lock dependency Zeng, Oak
@ 2019-06-03 17:51   ` Zeng, Oak
  3 siblings, 0 replies; 8+ messages in thread
From: Zeng, Oak @ 2019-06-03 17:51 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Kuehling, Felix, Zeng, Oak

SDMA queue allocation requires the dqm lock as it modifies
the global dqm members. Introduce functions to allocate/deallocate
in locked/unlocked circumstances.

Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 34 ++++++++++++++++------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 39e2d6d..7071e0b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -53,6 +53,8 @@ static int map_queues_cpsch(struct device_queue_manager *dqm);
 
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
 				struct queue *q);
+static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
+				struct queue *q);
 
 static inline void deallocate_hqd(struct device_queue_manager *dqm,
 				struct queue *q);
@@ -433,10 +435,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
 		deallocate_hqd(dqm, q);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 		dqm->sdma_queue_count--;
-		deallocate_sdma_queue(dqm, q);
+		deallocate_sdma_queue_locked(dqm, q);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		dqm->xgmi_sdma_queue_count--;
-		deallocate_sdma_queue(dqm, q);
+		deallocate_sdma_queue_locked(dqm, q);
 	} else {
 		pr_debug("q->properties.type %d is invalid\n",
 				q->properties.type);
@@ -906,9 +908,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 {
 	int bit;
 
+	dqm_lock(dqm);
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
-		if (dqm->sdma_bitmap == 0)
+		if (dqm->sdma_bitmap == 0) {
+			dqm_unlock(dqm);
 			return -ENOMEM;
+		}
 		bit = __ffs64(dqm->sdma_bitmap);
 		dqm->sdma_bitmap &= ~(1ULL << bit);
 		q->sdma_id = bit;
@@ -917,8 +922,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 		q->properties.sdma_queue_id = q->sdma_id /
 				get_num_sdma_engines(dqm);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
-		if (dqm->xgmi_sdma_bitmap == 0)
+		if (dqm->xgmi_sdma_bitmap == 0) {
+			dqm_unlock(dqm);
 			return -ENOMEM;
+		}
 		bit = __ffs64(dqm->xgmi_sdma_bitmap);
 		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
 		q->sdma_id = bit;
@@ -934,13 +941,14 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 				get_num_xgmi_sdma_engines(dqm);
 	}
 
+	dqm_unlock(dqm);
 	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
 	pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
 
 	return 0;
 }
 
-static void deallocate_sdma_queue(struct device_queue_manager *dqm,
+static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
 				struct queue *q)
 {
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
@@ -954,6 +962,14 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
 	}
 }
 
+static void deallocate_sdma_queue(struct device_queue_manager *dqm,
+				struct queue *q)
+{
+	dqm_lock(dqm);
+	deallocate_sdma_queue_locked(dqm, q);
+	dqm_unlock(dqm);
+}
+
 /*
  * Device Queue Manager implementation for cp scheduler
  */
@@ -1345,10 +1361,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 		dqm->sdma_queue_count--;
-		deallocate_sdma_queue(dqm, q);
+		deallocate_sdma_queue_locked(dqm, q);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		dqm->xgmi_sdma_queue_count--;
-		deallocate_sdma_queue(dqm, q);
+		deallocate_sdma_queue_locked(dqm, q);
 	}
 
 	list_del(&q->list);
@@ -1574,10 +1590,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	list_for_each_entry(q, &qpd->queues_list, list) {
 		if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 			dqm->sdma_queue_count--;
-			deallocate_sdma_queue(dqm, q);
+			deallocate_sdma_queue_locked(dqm, q);
 		} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 			dqm->xgmi_sdma_queue_count--;
-			deallocate_sdma_queue(dqm, q);
+			deallocate_sdma_queue_locked(dqm, q);
 		}
 
 		if (q->properties.is_active)
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 3/5] drm/amdkfd: Refactor create_queue_nocpsch
       [not found]     ` <1559584248-12115-3-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
@ 2019-06-03 22:04       ` Kuehling, Felix
  0 siblings, 0 replies; 8+ messages in thread
From: Kuehling, Felix @ 2019-06-03 22:04 UTC (permalink / raw)
  To: Zeng, Oak, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 2019-06-03 1:51 p.m., Zeng, Oak wrote:
> This is preparation work to fix a circular lock dependency.
> No logic change
>
> Change-Id: I4e0ee918260e7780de972dd71f4ce787b4f6dde9
> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 171 +++++++--------------
>   1 file changed, 58 insertions(+), 113 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index dc1a70b..06654de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -42,10 +42,6 @@
>   static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
>   					unsigned int pasid, unsigned int vmid);
>   
> -static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
> -					struct queue *q,
> -					struct qcm_process_device *qpd);
> -
>   static int execute_queues_cpsch(struct device_queue_manager *dqm,
>   				enum kfd_unmap_queues_filter filter,
>   				uint32_t filter_param);
> @@ -55,13 +51,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   
>   static int map_queues_cpsch(struct device_queue_manager *dqm);
>   
> -static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
> -					struct queue *q,
> -					struct qcm_process_device *qpd);
> -
>   static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>   				struct queue *q);
>   
> +static inline void deallocate_hqd(struct device_queue_manager *dqm,
> +				struct queue *q);
> +static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
> +static int allocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct queue *q);
>   static void kfd_process_hw_exception(struct work_struct *work);
>   
>   static inline
> @@ -269,6 +266,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   				struct queue *q,
>   				struct qcm_process_device *qpd)
>   {
> +	struct mqd_manager *mqd_mgr;
>   	int retval;
>   
>   	print_queue(q);
> @@ -300,18 +298,46 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   	q->properties.tba_addr = qpd->tba_addr;
>   	q->properties.tma_addr = qpd->tma_addr;
>   
> +	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> +			q->properties.type)];
> +	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
> +		retval = allocate_hqd(dqm, q);
> +		if (retval)
> +			goto deallocate_vmid;
> +	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> +		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> +		retval = allocate_sdma_queue(dqm, q);
> +		if (retval)
> +			goto deallocate_vmid;
> +		dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> +	}
> +
> +	retval = allocate_doorbell(qpd, q);
> +	if (retval)
> +		goto out_deallocate_hqd;
> +
> +	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> +				&q->gart_mqd_addr, &q->properties);
> +	if (retval)
> +		goto out_deallocate_doorbell;
> +
> +	pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
> +			q->pipe, q->queue);

This message only makes sense for compute queues. Maybe move this up to 
just after allocate_hqd, because that's what assigns q->pipe and q->queue.


> +
>   	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
> -		retval = create_compute_queue_nocpsch(dqm, q, qpd);
> -	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> -			q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -		retval = create_sdma_queue_nocpsch(dqm, q, qpd);
> -	else
> -		retval = -EINVAL;
> +		dqm->dev->kfd2kgd->set_scratch_backing_va(
> +			dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);

Not sure why this is here. I think it would make more sense in 
allocate_vmid. Normally this is called when the scratch-backing VA is 
set in the ioctl function. Unless no VMID has been assigned to the 
process yet. So setting this when the VMID is assigned in allocate_vmid 
seems to make the most sense.


>   
> -	if (retval) {
> -		if (list_empty(&qpd->queues_list))
> -			deallocate_vmid(dqm, qpd, q);
> -		goto out_unlock;
> +	if (q->properties.is_active) {
> +
> +		if (WARN(q->process->mm != current->mm,
> +					"should only run in user thread"))
> +			retval = -EFAULT;
> +		else
> +			retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
> +					q->queue, &q->properties, current->mm);
> +		if (retval)
> +			goto out_uninit_mqd;
>   	}
>   
>   	list_add(&q->list, &qpd->queues_list);
> @@ -331,7 +357,21 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   	dqm->total_queue_count++;
>   	pr_debug("Total of %d queues are accountable so far\n",
>   			dqm->total_queue_count);
> +	goto out_unlock;
>   
> +out_uninit_mqd:
> +	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> +out_deallocate_doorbell:
> +	deallocate_doorbell(qpd, q);
> +out_deallocate_hqd:
> +	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
> +		deallocate_hqd(dqm, q);
> +	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> +		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> +		deallocate_sdma_queue(dqm, q);
> +deallocate_vmid:
> +	if (list_empty(&qpd->queues_list))
> +		deallocate_vmid(dqm, qpd, q);
>   out_unlock:
>   	dqm_unlock(dqm);
>   	return retval;
> @@ -377,58 +417,6 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm,
>   	dqm->allocated_queues[q->pipe] |= (1 << q->queue);
>   }
>   
> -static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
> -					struct queue *q,
> -					struct qcm_process_device *qpd)
> -{
> -	struct mqd_manager *mqd_mgr;
> -	int retval;
> -
> -	mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE];
> -
> -	retval = allocate_hqd(dqm, q);
> -	if (retval)
> -		return retval;
> -
> -	retval = allocate_doorbell(qpd, q);
> -	if (retval)
> -		goto out_deallocate_hqd;
> -
> -	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> -				&q->gart_mqd_addr, &q->properties);
> -	if (retval)
> -		goto out_deallocate_doorbell;
> -
> -	pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
> -			q->pipe, q->queue);
> -
> -	dqm->dev->kfd2kgd->set_scratch_backing_va(
> -			dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
> -
> -	if (!q->properties.is_active)
> -		return 0;
> -
> -	if (WARN(q->process->mm != current->mm,
> -		 "should only run in user thread"))
> -		retval = -EFAULT;
> -	else
> -		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
> -					   &q->properties, current->mm);
> -	if (retval)
> -		goto out_uninit_mqd;
> -
> -	return 0;
> -
> -out_uninit_mqd:
> -	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> -out_deallocate_doorbell:
> -	deallocate_doorbell(qpd, q);
> -out_deallocate_hqd:
> -	deallocate_hqd(dqm, q);
> -
> -	return retval;
> -}
> -
>   /* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked
>    * to avoid asynchronized access
>    */
> @@ -967,49 +955,6 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>   	}
>   }
>   
> -static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
> -					struct queue *q,
> -					struct qcm_process_device *qpd)
> -{
> -	struct mqd_manager *mqd_mgr;
> -	int retval;
> -
> -	mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA];
> -
> -	retval = allocate_sdma_queue(dqm, q);
> -	if (retval)
> -		return retval;
> -
> -	retval = allocate_doorbell(qpd, q);
> -	if (retval)
> -		goto out_deallocate_sdma_queue;
> -
> -	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> -	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> -				&q->gart_mqd_addr, &q->properties);
> -	if (retval)
> -		goto out_deallocate_doorbell;
> -
> -	if (!q->properties.is_active)
> -		return 0;
> -
> -	retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties,
> -				current->mm);
> -	if (retval)
> -		goto out_uninit_mqd;
> -
> -	return 0;
> -
> -out_uninit_mqd:
> -	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> -out_deallocate_doorbell:
> -	deallocate_doorbell(qpd, q);
> -out_deallocate_sdma_queue:
> -	deallocate_sdma_queue(dqm, q);
> -
> -	return retval;
> -}
> -
>   /*
>    * Device Queue Manager implementation for cp scheduler
>    */
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 4/5] drm/amdkfd: Fix a circular lock dependency
       [not found]     ` <1559584248-12115-4-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
@ 2019-06-03 22:27       ` Kuehling, Felix
       [not found]         ` <5e896419-e14b-41b2-17fe-10eaee1f314c-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Kuehling, Felix @ 2019-06-03 22:27 UTC (permalink / raw)
  To: Zeng, Oak, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

allocate_vmid, allocate_hqd and allocate_sdma_queue all work on data in 
the DQM structure. So it seems these need to be protected by the DQM 
lock. allocate_doorbell doesn't need the DQM lock because its data 
structures are in the process_device structure, which is protected by 
the process lock.

What's different in the cpsch case is, that we don't need allocate_vmid 
and allocate_hqd. The only thing that was broken there accidentally by 
moving the DQM lock to later, is allocate_sdma_queue, which you're 
addressing in the next patch.

If you move the DQM lock in create_queue_nocpsch, you'll need to fix up 
DQM locking in allocate_vmid, allocate_hqd and allocate_sdma_queue in 
the next change. I think the easier solution is to do 
create_queue_nocpsch with two critical sections protected by the DQM 
lock and do the init_mqd between the two critical sections.

Alternatively you could separate allocate_mqd from init_mqd, so that you 
can do the MQD allocation before you take the DQM lock and the MQD 
initialization safely under the DQM lock. That way the create queue 
functions would each have only one critical section. I think that would 
be the cleanest solution.

Regards,
   Felix

On 2019-06-03 1:51 p.m., Zeng, Oak wrote:
> The idea to break the circular lock dependency is
> to move init_mqd out of lock protection of dqm lock
> in callstack #1 below. There is no need to.
>
> [   59.510149] [drm] Initialized amdgpu 3.30.0 20150101 for 0000:04:00.0 on minor 0
>
> [  513.604034] ======================================================
> [  513.604205] WARNING: possible circular locking dependency detected
> [  513.604375] 4.18.0-kfd-root #2 Tainted: G        W
> [  513.604530] ------------------------------------------------------
> [  513.604699] kswapd0/611 is trying to acquire lock:
> [  513.604840] 00000000d254022e (&dqm->lock_hidden){+.+.}, at: evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [  513.605150]
>                 but task is already holding lock:
> [  513.605307] 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250
> [  513.605540]
>                 which lock already depends on the new lock.
>
> [  513.605747]
>                 the existing dependency chain (in reverse order) is:
> [  513.605944]
>                 -> #4 (&anon_vma->rwsem){++++}:
> [  513.606106]        __vma_adjust+0x147/0x7f0
> [  513.606231]        __split_vma+0x179/0x190
> [  513.606353]        mprotect_fixup+0x217/0x260
> [  513.606553]        do_mprotect_pkey+0x211/0x380
> [  513.606752]        __x64_sys_mprotect+0x1b/0x20
> [  513.606954]        do_syscall_64+0x50/0x1a0
> [  513.607149]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [  513.607380]
>                 -> #3 (&mapping->i_mmap_rwsem){++++}:
> [  513.607678]        rmap_walk_file+0x1f0/0x280
> [  513.607887]        page_referenced+0xdd/0x180
> [  513.608081]        shrink_page_list+0x853/0xcb0
> [  513.608279]        shrink_inactive_list+0x33b/0x700
> [  513.608483]        shrink_node_memcg+0x37a/0x7f0
> [  513.608682]        shrink_node+0xd8/0x490
> [  513.608869]        balance_pgdat+0x18b/0x3b0
> [  513.609062]        kswapd+0x203/0x5c0
> [  513.609241]        kthread+0x100/0x140
> [  513.609420]        ret_from_fork+0x24/0x30
> [  513.609607]
>                 -> #2 (fs_reclaim){+.+.}:
> [  513.609883]        kmem_cache_alloc_trace+0x34/0x2e0
> [  513.610093]        reservation_object_reserve_shared+0x139/0x300
> [  513.610326]        ttm_bo_init_reserved+0x291/0x480 [ttm]
> [  513.610567]        amdgpu_bo_do_create+0x1d2/0x650 [amdgpu]
> [  513.610811]        amdgpu_bo_create+0x40/0x1f0 [amdgpu]
> [  513.611041]        amdgpu_bo_create_reserved+0x249/0x2d0 [amdgpu]
> [  513.611290]        amdgpu_bo_create_kernel+0x12/0x70 [amdgpu]
> [  513.611584]        amdgpu_ttm_init+0x2cb/0x560 [amdgpu]
> [  513.611823]        gmc_v9_0_sw_init+0x400/0x750 [amdgpu]
> [  513.612491]        amdgpu_device_init+0x14eb/0x1990 [amdgpu]
> [  513.612730]        amdgpu_driver_load_kms+0x78/0x290 [amdgpu]
> [  513.612958]        drm_dev_register+0x111/0x1a0
> [  513.613171]        amdgpu_pci_probe+0x11c/0x1e0 [amdgpu]
> [  513.613389]        local_pci_probe+0x3f/0x90
> [  513.613581]        pci_device_probe+0x102/0x1c0
> [  513.613779]        driver_probe_device+0x2a7/0x480
> [  513.613984]        __driver_attach+0x10a/0x110
> [  513.614179]        bus_for_each_dev+0x67/0xc0
> [  513.614372]        bus_add_driver+0x1eb/0x260
> [  513.614565]        driver_register+0x5b/0xe0
> [  513.614756]        do_one_initcall+0xac/0x357
> [  513.614952]        do_init_module+0x5b/0x213
> [  513.615145]        load_module+0x2542/0x2d30
> [  513.615337]        __do_sys_finit_module+0xd2/0x100
> [  513.615541]        do_syscall_64+0x50/0x1a0
> [  513.615731]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [  513.615963]
>                 -> #1 (reservation_ww_class_mutex){+.+.}:
> [  513.616293]        amdgpu_amdkfd_alloc_gtt_mem+0xcf/0x2c0 [amdgpu]
> [  513.616554]        init_mqd+0x223/0x260 [amdgpu]
> [  513.616779]        create_queue_nocpsch+0x4d9/0x600 [amdgpu]
> [  513.617031]        pqm_create_queue+0x37c/0x520 [amdgpu]
> [  513.617270]        kfd_ioctl_create_queue+0x2f9/0x650 [amdgpu]
> [  513.617522]        kfd_ioctl+0x202/0x350 [amdgpu]
> [  513.617724]        do_vfs_ioctl+0x9f/0x6c0
> [  513.617914]        ksys_ioctl+0x66/0x70
> [  513.618095]        __x64_sys_ioctl+0x16/0x20
> [  513.618286]        do_syscall_64+0x50/0x1a0
> [  513.618476]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [  513.618695]
>                 -> #0 (&dqm->lock_hidden){+.+.}:
> [  513.618984]        __mutex_lock+0x98/0x970
> [  513.619197]        evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [  513.619459]        kfd_process_evict_queues+0x3b/0xb0 [amdgpu]
> [  513.619710]        kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu]
> [  513.620103]        amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu]
> [  513.620363]        amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu]
> [  513.620614]        __mmu_notifier_invalidate_range_start+0x70/0xb0
> [  513.620851]        try_to_unmap_one+0x7fc/0x8f0
> [  513.621049]        rmap_walk_anon+0x121/0x290
> [  513.621242]        try_to_unmap+0x93/0xf0
> [  513.621428]        shrink_page_list+0x606/0xcb0
> [  513.621625]        shrink_inactive_list+0x33b/0x700
> [  513.621835]        shrink_node_memcg+0x37a/0x7f0
> [  513.622034]        shrink_node+0xd8/0x490
> [  513.622219]        balance_pgdat+0x18b/0x3b0
> [  513.622410]        kswapd+0x203/0x5c0
> [  513.622589]        kthread+0x100/0x140
> [  513.622769]        ret_from_fork+0x24/0x30
> [  513.622957]
>                 other info that might help us debug this:
>
> [  513.623354] Chain exists of:
>                   &dqm->lock_hidden --> &mapping->i_mmap_rwsem --> &anon_vma->rwsem
>
> [  513.623900]  Possible unsafe locking scenario:
>
> [  513.624189]        CPU0                    CPU1
> [  513.624397]        ----                    ----
> [  513.624594]   lock(&anon_vma->rwsem);
> [  513.624771]                                lock(&mapping->i_mmap_rwsem);
> [  513.625020]                                lock(&anon_vma->rwsem);
> [  513.625253]   lock(&dqm->lock_hidden);
> [  513.625433]
>                  *** DEADLOCK ***
>
> [  513.625783] 3 locks held by kswapd0/611:
> [  513.625967]  #0: 00000000f14edf84 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30
> [  513.626309]  #1: 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250
> [  513.626671]  #2: 0000000067b5cd12 (srcu){....}, at: __mmu_notifier_invalidate_range_start+0x5/0xb0
> [  513.627037]
>                 stack backtrace:
> [  513.627292] CPU: 0 PID: 611 Comm: kswapd0 Tainted: G        W         4.18.0-kfd-root #2
> [  513.627632] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
> [  513.627990] Call Trace:
> [  513.628143]  dump_stack+0x7c/0xbb
> [  513.628315]  print_circular_bug.isra.37+0x21b/0x228
> [  513.628581]  __lock_acquire+0xf7d/0x1470
> [  513.628782]  ? unwind_next_frame+0x6c/0x4f0
> [  513.628974]  ? lock_acquire+0xec/0x1e0
> [  513.629154]  lock_acquire+0xec/0x1e0
> [  513.629357]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [  513.629587]  __mutex_lock+0x98/0x970
> [  513.629790]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [  513.630047]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [  513.630309]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [  513.630562]  evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [  513.630816]  kfd_process_evict_queues+0x3b/0xb0 [amdgpu]
> [  513.631057]  kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu]
> [  513.631288]  amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu]
> [  513.631536]  amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu]
> [  513.632076]  __mmu_notifier_invalidate_range_start+0x70/0xb0
> [  513.632299]  try_to_unmap_one+0x7fc/0x8f0
> [  513.632487]  ? page_lock_anon_vma_read+0x68/0x250
> [  513.632690]  rmap_walk_anon+0x121/0x290
> [  513.632875]  try_to_unmap+0x93/0xf0
> [  513.633050]  ? page_remove_rmap+0x330/0x330
> [  513.633239]  ? rcu_read_unlock+0x60/0x60
> [  513.633422]  ? page_get_anon_vma+0x160/0x160
> [  513.633613]  shrink_page_list+0x606/0xcb0
> [  513.633800]  shrink_inactive_list+0x33b/0x700
> [  513.633997]  shrink_node_memcg+0x37a/0x7f0
> [  513.634186]  ? shrink_node+0xd8/0x490
> [  513.634363]  shrink_node+0xd8/0x490
> [  513.634537]  balance_pgdat+0x18b/0x3b0
> [  513.634718]  kswapd+0x203/0x5c0
> [  513.634887]  ? wait_woken+0xb0/0xb0
> [  513.635062]  kthread+0x100/0x140
> [  513.635231]  ? balance_pgdat+0x3b0/0x3b0
> [  513.635414]  ? kthread_delayed_work_timer_fn+0x80/0x80
> [  513.635626]  ret_from_fork+0x24/0x30
> [  513.636042] Evicting PASID 32768 queues
> [  513.936236] Restoring PASID 32768 queues
> [  524.708912] Evicting PASID 32768 queues
> [  524.999875] Restoring PASID 32768 queues
>
> Change-Id: Ib40b5fb8d7c6572888a0c879a315b382eb948647
> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 13 ++++++-------
>   1 file changed, 6 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 06654de..39e2d6d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -271,19 +271,17 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   
>   	print_queue(q);
>   
> -	dqm_lock(dqm);
> -
>   	if (dqm->total_queue_count >= max_num_of_queues_per_device) {
>   		pr_warn("Can't create new usermode queue because %d queues were already created\n",
>   				dqm->total_queue_count);
>   		retval = -EPERM;
> -		goto out_unlock;
> +		goto out;
>   	}
>   
>   	if (list_empty(&qpd->queues_list)) {
>   		retval = allocate_vmid(dqm, qpd, q);
>   		if (retval)
> -			goto out_unlock;
> +			goto out;
>   	}
>   	q->properties.vmid = qpd->vmid;
>   	/*
> @@ -340,6 +338,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   			goto out_uninit_mqd;
>   	}
>   
> +	dqm_lock(dqm);
>   	list_add(&q->list, &qpd->queues_list);
>   	qpd->queue_count++;
>   	if (q->properties.is_active)
> @@ -357,7 +356,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   	dqm->total_queue_count++;
>   	pr_debug("Total of %d queues are accountable so far\n",
>   			dqm->total_queue_count);
> -	goto out_unlock;
> +	dqm_unlock(dqm);
> +	return retval;
>   
>   out_uninit_mqd:
>   	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> @@ -372,8 +372,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   deallocate_vmid:
>   	if (list_empty(&qpd->queues_list))
>   		deallocate_vmid(dqm, qpd, q);
> -out_unlock:
> -	dqm_unlock(dqm);
> +out:
>   	return retval;
>   }
>   
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH 4/5] drm/amdkfd: Fix a circular lock dependency
       [not found]         ` <5e896419-e14b-41b2-17fe-10eaee1f314c-5C7GfCeVMHo@public.gmane.org>
@ 2019-06-04  1:08           ` Zeng, Oak
  0 siblings, 0 replies; 8+ messages in thread
From: Zeng, Oak @ 2019-06-04  1:08 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Thank you Felix for the catch. Let's do it the cleanest way. Another reason to separate allocate_mqd from init_mqd is, the current init_mqd is actually allocate_and_init_mqd. If we separate it into two functions, the function name makes more sense. 

Regards,
Oak

-----Original Message-----
From: Kuehling, Felix <Felix.Kuehling@amd.com> 
Sent: Monday, June 3, 2019 6:27 PM
To: Zeng, Oak <Oak.Zeng@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 4/5] drm/amdkfd: Fix a circular lock dependency

allocate_vmid, allocate_hqd and allocate_sdma_queue all work on data in the DQM structure. So it seems these need to be protected by the DQM lock. allocate_doorbell doesn't need the DQM lock because its data structures are in the process_device structure, which is protected by the process lock.

What's different in the cpsch case is, that we don't need allocate_vmid and allocate_hqd. The only thing that was broken there accidentally by moving the DQM lock to later, is allocate_sdma_queue, which you're addressing in the next patch.

If you move the DQM lock in create_queue_nocpsch, you'll need to fix up DQM locking in allocate_vmid, allocate_hqd and allocate_sdma_queue in the next change. I think the easier solution is to do create_queue_nocpsch with two critical sections protected by the DQM lock and do the init_mqd between the two critical sections.

Alternatively you could separate allocate_mqd from init_mqd, so that you can do the MQD allocation before you take the DQM lock and the MQD initialization safely under the DQM lock. That way the create queue functions would each have only one critical section. I think that would be the cleanest solution.

Regards,
   Felix

On 2019-06-03 1:51 p.m., Zeng, Oak wrote:
> The idea to break the circular lock dependency is to move init_mqd out 
> of lock protection of dqm lock in callstack #1 below. There is no need
> to hold the dqm lock while initializing the MQD.
>
> [   59.510149] [drm] Initialized amdgpu 3.30.0 20150101 for 0000:04:00.0 on minor 0
>
> [  513.604034] ======================================================
> [  513.604205] WARNING: possible circular locking dependency detected
> [  513.604375] 4.18.0-kfd-root #2 Tainted: G        W
> [  513.604530] ------------------------------------------------------
> [  513.604699] kswapd0/611 is trying to acquire lock:
> [  513.604840] 00000000d254022e (&dqm->lock_hidden){+.+.}, at: 
> evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [  513.605150]
>                 but task is already holding lock:
> [  513.605307] 00000000961547fc (&anon_vma->rwsem){++++}, at: 
> page_lock_anon_vma_read+0xe4/0x250
> [  513.605540]
>                 which lock already depends on the new lock.
>
> [  513.605747]
>                 the existing dependency chain (in reverse order) is:
> [  513.605944]
>                 -> #4 (&anon_vma->rwsem){++++}:
> [  513.606106]        __vma_adjust+0x147/0x7f0
> [  513.606231]        __split_vma+0x179/0x190
> [  513.606353]        mprotect_fixup+0x217/0x260
> [  513.606553]        do_mprotect_pkey+0x211/0x380
> [  513.606752]        __x64_sys_mprotect+0x1b/0x20
> [  513.606954]        do_syscall_64+0x50/0x1a0
> [  513.607149]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [  513.607380]
>                 -> #3 (&mapping->i_mmap_rwsem){++++}:
> [  513.607678]        rmap_walk_file+0x1f0/0x280
> [  513.607887]        page_referenced+0xdd/0x180
> [  513.608081]        shrink_page_list+0x853/0xcb0
> [  513.608279]        shrink_inactive_list+0x33b/0x700
> [  513.608483]        shrink_node_memcg+0x37a/0x7f0
> [  513.608682]        shrink_node+0xd8/0x490
> [  513.608869]        balance_pgdat+0x18b/0x3b0
> [  513.609062]        kswapd+0x203/0x5c0
> [  513.609241]        kthread+0x100/0x140
> [  513.609420]        ret_from_fork+0x24/0x30
> [  513.609607]
>                 -> #2 (fs_reclaim){+.+.}:
> [  513.609883]        kmem_cache_alloc_trace+0x34/0x2e0
> [  513.610093]        reservation_object_reserve_shared+0x139/0x300
> [  513.610326]        ttm_bo_init_reserved+0x291/0x480 [ttm]
> [  513.610567]        amdgpu_bo_do_create+0x1d2/0x650 [amdgpu]
> [  513.610811]        amdgpu_bo_create+0x40/0x1f0 [amdgpu]
> [  513.611041]        amdgpu_bo_create_reserved+0x249/0x2d0 [amdgpu]
> [  513.611290]        amdgpu_bo_create_kernel+0x12/0x70 [amdgpu]
> [  513.611584]        amdgpu_ttm_init+0x2cb/0x560 [amdgpu]
> [  513.611823]        gmc_v9_0_sw_init+0x400/0x750 [amdgpu]
> [  513.612491]        amdgpu_device_init+0x14eb/0x1990 [amdgpu]
> [  513.612730]        amdgpu_driver_load_kms+0x78/0x290 [amdgpu]
> [  513.612958]        drm_dev_register+0x111/0x1a0
> [  513.613171]        amdgpu_pci_probe+0x11c/0x1e0 [amdgpu]
> [  513.613389]        local_pci_probe+0x3f/0x90
> [  513.613581]        pci_device_probe+0x102/0x1c0
> [  513.613779]        driver_probe_device+0x2a7/0x480
> [  513.613984]        __driver_attach+0x10a/0x110
> [  513.614179]        bus_for_each_dev+0x67/0xc0
> [  513.614372]        bus_add_driver+0x1eb/0x260
> [  513.614565]        driver_register+0x5b/0xe0
> [  513.614756]        do_one_initcall+0xac/0x357
> [  513.614952]        do_init_module+0x5b/0x213
> [  513.615145]        load_module+0x2542/0x2d30
> [  513.615337]        __do_sys_finit_module+0xd2/0x100
> [  513.615541]        do_syscall_64+0x50/0x1a0
> [  513.615731]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [  513.615963]
>                 -> #1 (reservation_ww_class_mutex){+.+.}:
> [  513.616293]        amdgpu_amdkfd_alloc_gtt_mem+0xcf/0x2c0 [amdgpu]
> [  513.616554]        init_mqd+0x223/0x260 [amdgpu]
> [  513.616779]        create_queue_nocpsch+0x4d9/0x600 [amdgpu]
> [  513.617031]        pqm_create_queue+0x37c/0x520 [amdgpu]
> [  513.617270]        kfd_ioctl_create_queue+0x2f9/0x650 [amdgpu]
> [  513.617522]        kfd_ioctl+0x202/0x350 [amdgpu]
> [  513.617724]        do_vfs_ioctl+0x9f/0x6c0
> [  513.617914]        ksys_ioctl+0x66/0x70
> [  513.618095]        __x64_sys_ioctl+0x16/0x20
> [  513.618286]        do_syscall_64+0x50/0x1a0
> [  513.618476]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [  513.618695]
>                 -> #0 (&dqm->lock_hidden){+.+.}:
> [  513.618984]        __mutex_lock+0x98/0x970
> [  513.619197]        evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [  513.619459]        kfd_process_evict_queues+0x3b/0xb0 [amdgpu]
> [  513.619710]        kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu]
> [  513.620103]        amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu]
> [  513.620363]        amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu]
> [  513.620614]        __mmu_notifier_invalidate_range_start+0x70/0xb0
> [  513.620851]        try_to_unmap_one+0x7fc/0x8f0
> [  513.621049]        rmap_walk_anon+0x121/0x290
> [  513.621242]        try_to_unmap+0x93/0xf0
> [  513.621428]        shrink_page_list+0x606/0xcb0
> [  513.621625]        shrink_inactive_list+0x33b/0x700
> [  513.621835]        shrink_node_memcg+0x37a/0x7f0
> [  513.622034]        shrink_node+0xd8/0x490
> [  513.622219]        balance_pgdat+0x18b/0x3b0
> [  513.622410]        kswapd+0x203/0x5c0
> [  513.622589]        kthread+0x100/0x140
> [  513.622769]        ret_from_fork+0x24/0x30
> [  513.622957]
>                 other info that might help us debug this:
>
> [  513.623354] Chain exists of:
>                   &dqm->lock_hidden --> &mapping->i_mmap_rwsem --> 
> &anon_vma->rwsem
>
> [  513.623900]  Possible unsafe locking scenario:
>
> [  513.624189]        CPU0                    CPU1
> [  513.624397]        ----                    ----
> [  513.624594]   lock(&anon_vma->rwsem);
> [  513.624771]                                lock(&mapping->i_mmap_rwsem);
> [  513.625020]                                lock(&anon_vma->rwsem);
> [  513.625253]   lock(&dqm->lock_hidden);
> [  513.625433]
>                  *** DEADLOCK ***
>
> [  513.625783] 3 locks held by kswapd0/611:
> [  513.625967]  #0: 00000000f14edf84 (fs_reclaim){+.+.}, at: 
> __fs_reclaim_acquire+0x5/0x30 [  513.626309]  #1: 00000000961547fc 
> (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250
> [  513.626671]  #2: 0000000067b5cd12 (srcu){....}, at: 
> __mmu_notifier_invalidate_range_start+0x5/0xb0
> [  513.627037]
>                 stack backtrace:
> [  513.627292] CPU: 0 PID: 611 Comm: kswapd0 Tainted: G        W         4.18.0-kfd-root #2
> [  513.627632] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS 
> VirtualBox 12/01/2006 [  513.627990] Call Trace:
> [  513.628143]  dump_stack+0x7c/0xbb
> [  513.628315]  print_circular_bug.isra.37+0x21b/0x228
> [  513.628581]  __lock_acquire+0xf7d/0x1470 [  513.628782]  ? 
> unwind_next_frame+0x6c/0x4f0 [  513.628974]  ? lock_acquire+0xec/0x1e0 
> [  513.629154]  lock_acquire+0xec/0x1e0 [  513.629357]  ? 
> evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [  513.629587]  
> __mutex_lock+0x98/0x970 [  513.629790]  ? 
> evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [  513.630047]  ? 
> evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [  513.630309]  ? 
> evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [  513.630562]  
> evict_process_queues_nocpsch+0x26/0x140 [amdgpu] [  513.630816]  
> kfd_process_evict_queues+0x3b/0xb0 [amdgpu] [  513.631057]  
> kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu] [  513.631288]  
> amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu] [  513.631536]  
> amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu] [  513.632076]  
> __mmu_notifier_invalidate_range_start+0x70/0xb0
> [  513.632299]  try_to_unmap_one+0x7fc/0x8f0 [  513.632487]  ? 
> page_lock_anon_vma_read+0x68/0x250
> [  513.632690]  rmap_walk_anon+0x121/0x290 [  513.632875]  
> try_to_unmap+0x93/0xf0 [  513.633050]  ? page_remove_rmap+0x330/0x330 
> [  513.633239]  ? rcu_read_unlock+0x60/0x60 [  513.633422]  ? 
> page_get_anon_vma+0x160/0x160 [  513.633613]  
> shrink_page_list+0x606/0xcb0 [  513.633800]  
> shrink_inactive_list+0x33b/0x700 [  513.633997]  
> shrink_node_memcg+0x37a/0x7f0 [  513.634186]  ? shrink_node+0xd8/0x490 
> [  513.634363]  shrink_node+0xd8/0x490 [  513.634537]  
> balance_pgdat+0x18b/0x3b0 [  513.634718]  kswapd+0x203/0x5c0 [  
> 513.634887]  ? wait_woken+0xb0/0xb0 [  513.635062]  
> kthread+0x100/0x140 [  513.635231]  ? balance_pgdat+0x3b0/0x3b0 [  
> 513.635414]  ? kthread_delayed_work_timer_fn+0x80/0x80
> [  513.635626]  ret_from_fork+0x24/0x30 [  513.636042] Evicting PASID 
> 32768 queues [  513.936236] Restoring PASID 32768 queues [  
> 524.708912] Evicting PASID 32768 queues [  524.999875] Restoring PASID 
> 32768 queues
>
> Change-Id: Ib40b5fb8d7c6572888a0c879a315b382eb948647
> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 13 ++++++-------
>   1 file changed, 6 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 06654de..39e2d6d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -271,19 +271,17 @@ static int create_queue_nocpsch(struct 
> device_queue_manager *dqm,
>   
>   	print_queue(q);
>   
> -	dqm_lock(dqm);
> -
>   	if (dqm->total_queue_count >= max_num_of_queues_per_device) {
>   		pr_warn("Can't create new usermode queue because %d queues were already created\n",
>   				dqm->total_queue_count);
>   		retval = -EPERM;
> -		goto out_unlock;
> +		goto out;
>   	}
>   
>   	if (list_empty(&qpd->queues_list)) {
>   		retval = allocate_vmid(dqm, qpd, q);
>   		if (retval)
> -			goto out_unlock;
> +			goto out;
>   	}
>   	q->properties.vmid = qpd->vmid;
>   	/*
> @@ -340,6 +338,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   			goto out_uninit_mqd;
>   	}
>   
> +	dqm_lock(dqm);
>   	list_add(&q->list, &qpd->queues_list);
>   	qpd->queue_count++;
>   	if (q->properties.is_active)
> @@ -357,7 +356,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   	dqm->total_queue_count++;
>   	pr_debug("Total of %d queues are accountable so far\n",
>   			dqm->total_queue_count);
> -	goto out_unlock;
> +	dqm_unlock(dqm);
> +	return retval;
>   
>   out_uninit_mqd:
>   	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); @@ -372,8 
> +372,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   deallocate_vmid:
>   	if (list_empty(&qpd->queues_list))
>   		deallocate_vmid(dqm, qpd, q);
> -out_unlock:
> -	dqm_unlock(dqm);
> +out:
>   	return retval;
>   }
>   
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2019-06-04  1:08 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-06-03 17:50 [PATCH 1/5] drm/amdkfd: Only initialize sdma vm for sdma queues Zeng, Oak
     [not found] ` <1559584248-12115-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-06-03 17:51   ` [PATCH 2/5] drm/amdkfd: Only load sdma mqd when queue is active Zeng, Oak
2019-06-03 17:51   ` [PATCH 3/5] drm/amdkfd: Refactor create_queue_nocpsch Zeng, Oak
     [not found]     ` <1559584248-12115-3-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-06-03 22:04       ` Kuehling, Felix
2019-06-03 17:51   ` [PATCH 4/5] drm/amdkfd: Fix a circular lock dependency Zeng, Oak
     [not found]     ` <1559584248-12115-4-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-06-03 22:27       ` Kuehling, Felix
     [not found]         ` <5e896419-e14b-41b2-17fe-10eaee1f314c-5C7GfCeVMHo@public.gmane.org>
2019-06-04  1:08           ` Zeng, Oak
2019-06-03 17:51   ` [PATCH 5/5] drm/amdkfd: Fix sdma queue allocate race condition Zeng, Oak

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.