All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
@ 2018-09-07  3:28 Oak Zeng
       [not found] ` <1536290910-6476-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Oak Zeng @ 2018-09-07  3:28 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Oak Zeng

Instead of sharing one fault hash table per device, make it
per vm. This can avoid an inter-process lock issue when the
fault hash table is full.

Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
Suggested-by: Christian König <Christian.Koenig@amd.com>
Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 ------------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 ++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
 drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------
 5 files changed, 127 insertions(+), 110 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
index 06373d4..4ed8621 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
@@ -197,78 +197,3 @@ int amdgpu_ih_process(struct amdgpu_device *adev)
 	return IRQ_HANDLED;
 }
 
-/**
- * amdgpu_ih_add_fault - Add a page fault record
- *
- * @adev: amdgpu device pointer
- * @key: 64-bit encoding of PASID and address
- *
- * This should be called when a retry page fault interrupt is
- * received. If this is a new page fault, it will be added to a hash
- * table. The return value indicates whether this is a new fault, or
- * a fault that was already known and is already being handled.
- *
- * If there are too many pending page faults, this will fail. Retry
- * interrupts should be ignored in this case until there is enough
- * free space.
- *
- * Returns 0 if the fault was added, 1 if the fault was already known,
- * -ENOSPC if there are too many pending faults.
- */
-int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)
-{
-	unsigned long flags;
-	int r = -ENOSPC;
-
-	if (WARN_ON_ONCE(!adev->irq.ih.faults))
-		/* Should be allocated in <IP>_ih_sw_init on GPUs that
-		 * support retry faults and require retry filtering.
-		 */
-		return r;
-
-	spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
-
-	/* Only let the hash table fill up to 50% for best performance */
-	if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
-		goto unlock_out;
-
-	r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);
-	if (!r)
-		adev->irq.ih.faults->count++;
-
-	/* chash_table_copy_in should never fail unless we're losing count */
-	WARN_ON_ONCE(r < 0);
-
-unlock_out:
-	spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
-	return r;
-}
-
-/**
- * amdgpu_ih_clear_fault - Remove a page fault record
- *
- * @adev: amdgpu device pointer
- * @key: 64-bit encoding of PASID and address
- *
- * This should be called when a page fault has been handled. Any
- * future interrupt with this key will be processed as a new
- * page fault.
- */
-void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)
-{
-	unsigned long flags;
-	int r;
-
-	if (!adev->irq.ih.faults)
-		return;
-
-	spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
-
-	r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);
-	if (!WARN_ON_ONCE(r < 0)) {
-		adev->irq.ih.faults->count--;
-		WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
-	}
-
-	spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
-}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
index a23e1c0..f411ffb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
@@ -32,13 +32,6 @@ struct amdgpu_device;
 #define AMDGPU_IH_CLIENTID_LEGACY 0
 #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
 
-#define AMDGPU_PAGEFAULT_HASH_BITS 8
-struct amdgpu_retryfault_hashtable {
-	DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
-	spinlock_t	lock;
-	int		count;
-};
-
 /*
  * R6xx+ IH ring
  */
@@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
 	bool			use_doorbell;
 	bool			use_bus_addr;
 	dma_addr_t		rb_dma_addr; /* only used when use_bus_addr = true */
-	struct amdgpu_retryfault_hashtable *faults;
 };
 
 #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
@@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, unsigned ring_size,
 			bool use_bus_addr);
 void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
 int amdgpu_ih_process(struct amdgpu_device *adev);
-int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key);
-void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 1d7e3c1..8b220e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,
 		 adev->vm_manager.fragment_size);
 }
 
+static struct amdgpu_retryfault_hashtable *init_fault_hash(void)
+{
+	struct amdgpu_retryfault_hashtable *fault_hash;
+
+	fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
+	if (!fault_hash)
+		return fault_hash;
+
+	INIT_CHASH_TABLE(fault_hash->hash,
+			AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
+	spin_lock_init(&fault_hash->lock);
+	fault_hash->count = 0;
+
+	return fault_hash;
+}
+
 /**
  * amdgpu_vm_init - initialize a vm instance
  *
@@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 		vm->pasid = pasid;
 	}
 
+	vm->fault_hash = init_fault_hash();
+	if (!vm->fault_hash) {
+		r = -ENOMEM;
+		goto error_free_root;
+	}
+
 	INIT_KFIFO(vm->faults);
 	vm->fault_credit = 16;
 
@@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 
 	/* Clear pending page faults from IH when the VM is destroyed */
 	while (kfifo_get(&vm->faults, &fault))
-		amdgpu_ih_clear_fault(adev, fault);
+		amdgpu_vm_clear_fault(vm->fault_hash, fault);
 
 	if (vm->pasid) {
 		unsigned long flags;
@@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
 	}
 
+	kfree(vm->fault_hash);
+	vm->fault_hash = NULL;
+
 	drm_sched_entity_destroy(&vm->entity);
 
 	if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
@@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
 		}
 	}
 }
+
+/**
+ * amdgpu_vm_add_fault - Add a page fault record to fault hash table
+ *
+ * @fault_hash: fault hash table
+ * @key: 64-bit encoding of PASID and address
+ *
+ * This should be called when a retry page fault interrupt is
+ * received. If this is a new page fault, it will be added to a hash
+ * table. The return value indicates whether this is a new fault, or
+ * a fault that was already known and is already being handled.
+ *
+ * If there are too many pending page faults, this will fail. Retry
+ * interrupts should be ignored in this case until there is enough
+ * free space.
+ *
+ * Returns 0 if the fault was added, 1 if the fault was already known,
+ * -ENOSPC if there are too many pending faults.
+ */
+int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
+{
+	unsigned long flags;
+	int r = -ENOSPC;
+
+	if (WARN_ON_ONCE(!fault_hash))
+		/* Should be allocated in amdgpu_vm_init
+		 */
+		return r;
+
+	spin_lock_irqsave(&fault_hash->lock, flags);
+
+	/* Only let the hash table fill up to 50% for best performance */
+	if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
+		goto unlock_out;
+
+	r = chash_table_copy_in(&fault_hash->hash, key, NULL);
+	if (!r)
+		fault_hash->count++;
+
+	/* chash_table_copy_in should never fail unless we're losing count */
+	WARN_ON_ONCE(r < 0);
+
+unlock_out:
+	spin_unlock_irqrestore(&fault_hash->lock, flags);
+	return r;
+}
+
+/**
+ * amdgpu_vm_clear_fault - Remove a page fault record
+ *
+ * @fault_hash: fault hash table
+ * @key: 64-bit encoding of PASID and address
+ *
+ * This should be called when a page fault has been handled. Any
+ * future interrupt with this key will be processed as a new
+ * page fault.
+ */
+void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
+{
+	unsigned long flags;
+	int r;
+
+	if (!fault_hash)
+		return;
+
+	spin_lock_irqsave(&fault_hash->lock, flags);
+
+	r = chash_table_remove(&fault_hash->hash, key, NULL);
+	if (!WARN_ON_ONCE(r < 0)) {
+		fault_hash->count--;
+		WARN_ON_ONCE(fault_hash->count < 0);
+	}
+
+	spin_unlock_irqrestore(&fault_hash->lock, flags);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index e275ee7..6eb1da1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -178,6 +178,13 @@ struct amdgpu_task_info {
 	pid_t	tgid;
 };
 
+#define AMDGPU_PAGEFAULT_HASH_BITS 8
+struct amdgpu_retryfault_hashtable {
+	DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
+	spinlock_t	lock;
+	int		count;
+};
+
 struct amdgpu_vm {
 	/* tree of virtual addresses mapped */
 	struct rb_root_cached	va;
@@ -240,6 +247,7 @@ struct amdgpu_vm {
 	struct ttm_lru_bulk_move lru_bulk_move;
 	/* mark whether can do the bulk move */
 	bool			bulk_moveable;
+	struct amdgpu_retryfault_hashtable *fault_hash;
 };
 
 struct amdgpu_vm_manager {
@@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
 void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
 				struct amdgpu_vm *vm);
 
+int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
+
+void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
index 5ae5ed2..acbe5a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
@@ -265,35 +265,36 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)
 		return true;
 	}
 
-	addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
-	key = AMDGPU_VM_FAULT(pasid, addr);
-	r = amdgpu_ih_add_fault(adev, key);
-
-	/* Hash table is full or the fault is already being processed,
-	 * ignore further page faults
-	 */
-	if (r != 0)
-		goto ignore_iv;
-
 	/* Track retry faults in per-VM fault FIFO. */
 	spin_lock(&adev->vm_manager.pasid_lock);
 	vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
+	addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
+	key = AMDGPU_VM_FAULT(pasid, addr);
 	if (!vm) {
 		/* VM not found, process it normally */
 		spin_unlock(&adev->vm_manager.pasid_lock);
-		amdgpu_ih_clear_fault(adev, key);
 		return true;
+	} else {
+		r = amdgpu_vm_add_fault(vm->fault_hash, key);
+
+		/* Hash table is full or the fault is already being processed,
+		 * ignore further page faults
+		 */
+		if (r != 0) {
+			spin_unlock(&adev->vm_manager.pasid_lock);
+			goto ignore_iv;
+		}
 	}
 	/* No locking required with single writer and single reader */
 	r = kfifo_put(&vm->faults, key);
 	if (!r) {
 		/* FIFO is full. Ignore it until there is space */
+		amdgpu_vm_clear_fault(vm->fault_hash, key);
 		spin_unlock(&adev->vm_manager.pasid_lock);
-		amdgpu_ih_clear_fault(adev, key);
 		goto ignore_iv;
 	}
-	spin_unlock(&adev->vm_manager.pasid_lock);
 
+	spin_unlock(&adev->vm_manager.pasid_lock);
 	/* It's the first fault for this address, process it normally */
 	return true;
 
@@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void *handle)
 	adev->irq.ih.use_doorbell = true;
 	adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;
 
-	adev->irq.ih.faults = kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
-	if (!adev->irq.ih.faults)
-		return -ENOMEM;
-	INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
-			 AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
-	spin_lock_init(&adev->irq.ih.faults->lock);
-	adev->irq.ih.faults->count = 0;
-
 	r = amdgpu_irq_init(adev);
 
 	return r;
@@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void *handle)
 	amdgpu_irq_fini(adev);
 	amdgpu_ih_ring_fini(adev);
 
-	kfree(adev->irq.ih.faults);
-	adev->irq.ih.faults = NULL;
-
 	return 0;
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 2/2] drm/amdgpu: Deleted faults kfifo
       [not found] ` <1536290910-6476-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
@ 2018-09-07  3:28   ` Oak Zeng
  2018-09-07  4:16   ` [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm ozeng
  2018-09-12  9:03   ` Christian König
  2 siblings, 0 replies; 27+ messages in thread
From: Oak Zeng @ 2018-09-07  3:28 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Oak Zeng

With the change making the fault hash per vm, the faults kfifo is
not needed anymore.

Change-Id: I9427ff1786deaf1f5b7d121a6f7f75f00acfc9b7
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 5 -----
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 3 ---
 drivers/gpu/drm/amd/amdgpu/vega10_ih.c | 9 ---------
 3 files changed, 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 8b220e9..5767a95 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2802,7 +2802,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 		goto error_free_root;
 	}
 
-	INIT_KFIFO(vm->faults);
 	vm->fault_credit = 16;
 
 	return 0;
@@ -2993,10 +2992,6 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 
 	amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
 
-	/* Clear pending page faults from IH when the VM is destroyed */
-	while (kfifo_get(&vm->faults, &fault))
-		amdgpu_vm_clear_fault(vm->fault_hash, fault);
-
 	if (vm->pasid) {
 		unsigned long flags;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 6eb1da1..75842f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -225,9 +225,6 @@ struct amdgpu_vm {
 	/* Flag to indicate ATS support from PTE for GFX9 */
 	bool			pte_support_ats;
 
-	/* Up to 128 pending retry page faults */
-	DECLARE_KFIFO(faults, u64, 128);
-
 	/* Limit non-retry fault storms */
 	unsigned int		fault_credit;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
index acbe5a7..7f0ed91 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
@@ -265,7 +265,6 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)
 		return true;
 	}
 
-	/* Track retry faults in per-VM fault FIFO. */
 	spin_lock(&adev->vm_manager.pasid_lock);
 	vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
 	addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
@@ -285,14 +284,6 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)
 			goto ignore_iv;
 		}
 	}
-	/* No locking required with single writer and single reader */
-	r = kfifo_put(&vm->faults, key);
-	if (!r) {
-		/* FIFO is full. Ignore it until there is space */
-		amdgpu_vm_clear_fault(vm->fault_hash, key);
-		spin_unlock(&adev->vm_manager.pasid_lock);
-		goto ignore_iv;
-	}
 
 	spin_unlock(&adev->vm_manager.pasid_lock);
 	/* It's the first fault for this address, process it normally */
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found] ` <1536290910-6476-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
  2018-09-07  3:28   ` [PATCH 2/2] drm/amdgpu: Deleted faults kfifo Oak Zeng
@ 2018-09-07  4:16   ` ozeng
       [not found]     ` <12913b70-925d-2e17-2dc2-491ed8592610-5C7GfCeVMHo@public.gmane.org>
  2018-09-12  9:03   ` Christian König
  2 siblings, 1 reply; 27+ messages in thread
From: ozeng @ 2018-09-07  4:16 UTC (permalink / raw)
  To: Oak Zeng, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: oak.zeng-5C7GfCeVMHo


[-- Attachment #1.1: Type: text/plain, Size: 14267 bytes --]

Hi Christian,

In this implementation, fault hash is made per vm, not per pasid as 
suggested, based on below considerations:

  * Delaying the destruction of the hash introduces more effort, like how
    to set the proper grace period after which no retry interrupt will be
    generated. We want to avoid that complication if we can.
  * The  problem of the per vm implementation is that it is hard to
    delay the destroy of fault hash, because when vm is destroyed,
    prescreen function won't be able to retrieve the fault hash. But in
    this case, the prescreen function can either let the interrupt go
    through (to gmc) or ignore it. From this perspective, it is not very
    necessary to delay the destroy of hash table.
  * On the other hand, ASICs after vega10 have the HW CAM feature,
    so the SW IV prescreen is only useful for vega10. For all the
    HW-implemented CAMs, we can consider delayed CAM acknowledgment.
  * Making it per pasid need to introduce another idr to correlate pasid
    and hash table - where to put the idr? You will have to make it a
    global variable which is not very desirable.

The main purpose of this patch is to solve the inter-process lock issue 
(when the hash table is full), while still keeping the code simple.

Also with this patch, the faults kfifo is not necessary any more. See 
patch 2.

Regards,

Oak


On 09/06/2018 11:28 PM, Oak Zeng wrote:
> In stead of share one fault hash table per device, make it
> per vm. This can avoid inter-process lock issue when fault
> hash table is full.
>
> Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
> Signed-off-by: Oak Zeng <Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
> Suggested-by: Christian Konig <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
> Suggested-by: Felix Kuehling <Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 ------------------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 ++++++++++++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------
>   5 files changed, 127 insertions(+), 110 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> index 06373d4..4ed8621 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct amdgpu_device *adev)
>   	return IRQ_HANDLED;
>   }
>   
> -/**
> - * amdgpu_ih_add_fault - Add a page fault record
> - *
> - * @adev: amdgpu device pointer
> - * @key: 64-bit encoding of PASID and address
> - *
> - * This should be called when a retry page fault interrupt is
> - * received. If this is a new page fault, it will be added to a hash
> - * table. The return value indicates whether this is a new fault, or
> - * a fault that was already known and is already being handled.
> - *
> - * If there are too many pending page faults, this will fail. Retry
> - * interrupts should be ignored in this case until there is enough
> - * free space.
> - *
> - * Returns 0 if the fault was added, 1 if the fault was already known,
> - * -ENOSPC if there are too many pending faults.
> - */
> -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)
> -{
> -	unsigned long flags;
> -	int r = -ENOSPC;
> -
> -	if (WARN_ON_ONCE(!adev->irq.ih.faults))
> -		/* Should be allocated in <IP>_ih_sw_init on GPUs that
> -		 * support retry faults and require retry filtering.
> -		 */
> -		return r;
> -
> -	spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
> -
> -	/* Only let the hash table fill up to 50% for best performance */
> -	if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
> -		goto unlock_out;
> -
> -	r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);
> -	if (!r)
> -		adev->irq.ih.faults->count++;
> -
> -	/* chash_table_copy_in should never fail unless we're losing count */
> -	WARN_ON_ONCE(r < 0);
> -
> -unlock_out:
> -	spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
> -	return r;
> -}
> -
> -/**
> - * amdgpu_ih_clear_fault - Remove a page fault record
> - *
> - * @adev: amdgpu device pointer
> - * @key: 64-bit encoding of PASID and address
> - *
> - * This should be called when a page fault has been handled. Any
> - * future interrupt with this key will be processed as a new
> - * page fault.
> - */
> -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)
> -{
> -	unsigned long flags;
> -	int r;
> -
> -	if (!adev->irq.ih.faults)
> -		return;
> -
> -	spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
> -
> -	r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);
> -	if (!WARN_ON_ONCE(r < 0)) {
> -		adev->irq.ih.faults->count--;
> -		WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
> -	}
> -
> -	spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
> -}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> index a23e1c0..f411ffb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> @@ -32,13 +32,6 @@ struct amdgpu_device;
>   #define AMDGPU_IH_CLIENTID_LEGACY 0
>   #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>   
> -#define AMDGPU_PAGEFAULT_HASH_BITS 8
> -struct amdgpu_retryfault_hashtable {
> -	DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
> -	spinlock_t	lock;
> -	int		count;
> -};
> -
>   /*
>    * R6xx+ IH ring
>    */
> @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>   	bool			use_doorbell;
>   	bool			use_bus_addr;
>   	dma_addr_t		rb_dma_addr; /* only used when use_bus_addr = true */
> -	struct amdgpu_retryfault_hashtable *faults;
>   };
>   
>   #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
> @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, unsigned ring_size,
>   			bool use_bus_addr);
>   void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>   int amdgpu_ih_process(struct amdgpu_device *adev);
> -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key);
> -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key);
>   
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 1d7e3c1..8b220e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,
>   		 adev->vm_manager.fragment_size);
>   }
>   
> +static struct amdgpu_retryfault_hashtable *init_fault_hash(void)
> +{
> +	struct amdgpu_retryfault_hashtable *fault_hash;
> +
> +	fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
> +	if (!fault_hash)
> +		return fault_hash;
> +
> +	INIT_CHASH_TABLE(fault_hash->hash,
> +			AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
> +	spin_lock_init(&fault_hash->lock);
> +	fault_hash->count = 0;
> +
> +	return fault_hash;
> +}
> +
>   /**
>    * amdgpu_vm_init - initialize a vm instance
>    *
> @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   		vm->pasid = pasid;
>   	}
>   
> +	vm->fault_hash = init_fault_hash();
> +	if (!vm->fault_hash) {
> +		r = -ENOMEM;
> +		goto error_free_root;
> +	}
> +
>   	INIT_KFIFO(vm->faults);
>   	vm->fault_credit = 16;
>   
> @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   
>   	/* Clear pending page faults from IH when the VM is destroyed */
>   	while (kfifo_get(&vm->faults, &fault))
> -		amdgpu_ih_clear_fault(adev, fault);
> +		amdgpu_vm_clear_fault(vm->fault_hash, fault);
>   
>   	if (vm->pasid) {
>   		unsigned long flags;
> @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>   	}
>   
> +	kfree(vm->fault_hash);
> +	vm->fault_hash = NULL;
> +
>   	drm_sched_entity_destroy(&vm->entity);
>   
>   	if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
> @@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>   		}
>   	}
>   }
> +
> +/**
> + * amdgpu_vm_add_fault - Add a page fault record to fault hash table
> + *
> + * @fault_hash: fault hash table
> + * @key: 64-bit encoding of PASID and address
> + *
> + * This should be called when a retry page fault interrupt is
> + * received. If this is a new page fault, it will be added to a hash
> + * table. The return value indicates whether this is a new fault, or
> + * a fault that was already known and is already being handled.
> + *
> + * If there are too many pending page faults, this will fail. Retry
> + * interrupts should be ignored in this case until there is enough
> + * free space.
> + *
> + * Returns 0 if the fault was added, 1 if the fault was already known,
> + * -ENOSPC if there are too many pending faults.
> + */
> +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
> +{
> +	unsigned long flags;
> +	int r = -ENOSPC;
> +
> +	if (WARN_ON_ONCE(!fault_hash))
> +		/* Should be allocated in amdgpu_vm_init
> +		 */
> +		return r;
> +
> +	spin_lock_irqsave(&fault_hash->lock, flags);
> +
> +	/* Only let the hash table fill up to 50% for best performance */
> +	if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
> +		goto unlock_out;
> +
> +	r = chash_table_copy_in(&fault_hash->hash, key, NULL);
> +	if (!r)
> +		fault_hash->count++;
> +
> +	/* chash_table_copy_in should never fail unless we're losing count */
> +	WARN_ON_ONCE(r < 0);
> +
> +unlock_out:
> +	spin_unlock_irqrestore(&fault_hash->lock, flags);
> +	return r;
> +}
> +
> +/**
> + * amdgpu_vm_clear_fault - Remove a page fault record
> + *
> + * @fault_hash: fault hash table
> + * @key: 64-bit encoding of PASID and address
> + *
> + * This should be called when a page fault has been handled. Any
> + * future interrupt with this key will be processed as a new
> + * page fault.
> + */
> +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
> +{
> +	unsigned long flags;
> +	int r;
> +
> +	if (!fault_hash)
> +		return;
> +
> +	spin_lock_irqsave(&fault_hash->lock, flags);
> +
> +	r = chash_table_remove(&fault_hash->hash, key, NULL);
> +	if (!WARN_ON_ONCE(r < 0)) {
> +		fault_hash->count--;
> +		WARN_ON_ONCE(fault_hash->count < 0);
> +	}
> +
> +	spin_unlock_irqrestore(&fault_hash->lock, flags);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index e275ee7..6eb1da1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>   	pid_t	tgid;
>   };
>   
> +#define AMDGPU_PAGEFAULT_HASH_BITS 8
> +struct amdgpu_retryfault_hashtable {
> +	DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
> +	spinlock_t	lock;
> +	int		count;
> +};
> +
>   struct amdgpu_vm {
>   	/* tree of virtual addresses mapped */
>   	struct rb_root_cached	va;
> @@ -240,6 +247,7 @@ struct amdgpu_vm {
>   	struct ttm_lru_bulk_move lru_bulk_move;
>   	/* mark whether can do the bulk move */
>   	bool			bulk_moveable;
> +	struct amdgpu_retryfault_hashtable *fault_hash;
>   };
>   
>   struct amdgpu_vm_manager {
> @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>   void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>   				struct amdgpu_vm *vm);
>   
> +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
> +
> +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
> +
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> index 5ae5ed2..acbe5a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> @@ -265,35 +265,36 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>   		return true;
>   	}
>   
> -	addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
> -	key = AMDGPU_VM_FAULT(pasid, addr);
> -	r = amdgpu_ih_add_fault(adev, key);
> -
> -	/* Hash table is full or the fault is already being processed,
> -	 * ignore further page faults
> -	 */
> -	if (r != 0)
> -		goto ignore_iv;
> -
>   	/* Track retry faults in per-VM fault FIFO. */
>   	spin_lock(&adev->vm_manager.pasid_lock);
>   	vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
> +	addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
> +	key = AMDGPU_VM_FAULT(pasid, addr);
>   	if (!vm) {
>   		/* VM not found, process it normally */
>   		spin_unlock(&adev->vm_manager.pasid_lock);
> -		amdgpu_ih_clear_fault(adev, key);
>   		return true;
> +	} else {
> +		r = amdgpu_vm_add_fault(vm->fault_hash, key);
> +
> +		/* Hash table is full or the fault is already being processed,
> +		 * ignore further page faults
> +		 */
> +		if (r != 0) {
> +			spin_unlock(&adev->vm_manager.pasid_lock);
> +			goto ignore_iv;
> +		}
>   	}
>   	/* No locking required with single writer and single reader */
>   	r = kfifo_put(&vm->faults, key);
>   	if (!r) {
>   		/* FIFO is full. Ignore it until there is space */
> +		amdgpu_vm_clear_fault(vm->fault_hash, key);
>   		spin_unlock(&adev->vm_manager.pasid_lock);
> -		amdgpu_ih_clear_fault(adev, key);
>   		goto ignore_iv;
>   	}
> -	spin_unlock(&adev->vm_manager.pasid_lock);
>   
> +	spin_unlock(&adev->vm_manager.pasid_lock);
>   	/* It's the first fault for this address, process it normally */
>   	return true;
>   
> @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void *handle)
>   	adev->irq.ih.use_doorbell = true;
>   	adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;
>   
> -	adev->irq.ih.faults = kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
> -	if (!adev->irq.ih.faults)
> -		return -ENOMEM;
> -	INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
> -			 AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
> -	spin_lock_init(&adev->irq.ih.faults->lock);
> -	adev->irq.ih.faults->count = 0;
> -
>   	r = amdgpu_irq_init(adev);
>   
>   	return r;
> @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void *handle)
>   	amdgpu_irq_fini(adev);
>   	amdgpu_ih_ring_fini(adev);
>   
> -	kfree(adev->irq.ih.faults);
> -	adev->irq.ih.faults = NULL;
> -
>   	return 0;
>   }
>   


[-- Attachment #1.2: Type: text/html, Size: 14723 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]     ` <12913b70-925d-2e17-2dc2-491ed8592610-5C7GfCeVMHo@public.gmane.org>
@ 2018-09-07  8:38       ` Christian König
       [not found]         ` <546c9911-b7af-03c8-b5e2-f2bb07932c0b-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Christian König @ 2018-09-07  8:38 UTC (permalink / raw)
  To: ozeng, Oak Zeng, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: oak.zeng-5C7GfCeVMHo


[-- Attachment #1.1: Type: text/plain, Size: 15521 bytes --]

Hi Oak,

yeah, but this is still a NAK. Making the hash per PASID was not a 
suggestion but rather a must have.

The VM structures must be destroyed while the processing is still 
ongoing, or otherwise we would not have a clean OOM handling.

The IDR for PASID lookup can be put into amdgpu_ids.c, you can just 
replace the IDA already used there with an IDR.

Since the PASID is already freed up with a delay, we should have the grace 
period for interrupt processing included. If that still isn't sufficient 
we can still add some delayed work for this.

Regards,
Christian.

Am 07.09.2018 um 06:16 schrieb ozeng:
>
> Hi Christian,
>
> In this implementation, fault hash is made per vm, not per pasid as 
> suggested, based on below considerations:
>
>   * Delay the destroy of hash introduces more effort like how to set
>     the proper grace period after which no retry interrupt will be
>     generated. We want to avoid those complication if we can.
>   * The  problem of the per vm implementation is that it is hard to
>     delay the destroy of fault hash, because when vm is destroyed,
>     prescreen function won't be able to retrieve the fault hash. But
>     in this case, the prescreen function can either let the interrupt
>     go through (to gmc) or ignore it. From this perspective, it is not
>     very necessary to delay the destroy of hash table.
>   * On the other hand, since ASICs after vega10 have the HW CAM
>     feature. So the SW IV prescreen is only useful for vega10. For all
>     the HW implemented CAM, we can consider the delayed CAM
>     acknowledgment.
>   * Making it per pasid need to introduce another idr to correlate
>     pasid and hash table - where to put the idr? You will have to make
>     it a global variable which is not very desirable.
>
> The main purpose of this patch is to solve the inter-process lock 
> issue (when hash table is full), while still keep codes simple.
>
> Also with this patch, the faults kfifo is not necessary any more. See 
> patch 2.
>
> Regards,
>
> Oak
>
>
> On 09/06/2018 11:28 PM, Oak Zeng wrote:
>> In stead of share one fault hash table per device, make it
>> per vm. This can avoid inter-process lock issue when fault
>> hash table is full.
>>
>> Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>> Signed-off-by: Oak Zeng<Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
>> Suggested-by: Christian Konig<Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>> Suggested-by: Felix Kuehling<Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 ------------------------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 ++++++++++++++++++++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------
>>   5 files changed, 127 insertions(+), 110 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>> index 06373d4..4ed8621 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>> @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct amdgpu_device *adev)
>>   	return IRQ_HANDLED;
>>   }
>>   
>> -/**
>> - * amdgpu_ih_add_fault - Add a page fault record
>> - *
>> - * @adev: amdgpu device pointer
>> - * @key: 64-bit encoding of PASID and address
>> - *
>> - * This should be called when a retry page fault interrupt is
>> - * received. If this is a new page fault, it will be added to a hash
>> - * table. The return value indicates whether this is a new fault, or
>> - * a fault that was already known and is already being handled.
>> - *
>> - * If there are too many pending page faults, this will fail. Retry
>> - * interrupts should be ignored in this case until there is enough
>> - * free space.
>> - *
>> - * Returns 0 if the fault was added, 1 if the fault was already known,
>> - * -ENOSPC if there are too many pending faults.
>> - */
>> -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)
>> -{
>> -	unsigned long flags;
>> -	int r = -ENOSPC;
>> -
>> -	if (WARN_ON_ONCE(!adev->irq.ih.faults))
>> -		/* Should be allocated in <IP>_ih_sw_init on GPUs that
>> -		 * support retry faults and require retry filtering.
>> -		 */
>> -		return r;
>> -
>> -	spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>> -
>> -	/* Only let the hash table fill up to 50% for best performance */
>> -	if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>> -		goto unlock_out;
>> -
>> -	r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);
>> -	if (!r)
>> -		adev->irq.ih.faults->count++;
>> -
>> -	/* chash_table_copy_in should never fail unless we're losing count */
>> -	WARN_ON_ONCE(r < 0);
>> -
>> -unlock_out:
>> -	spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>> -	return r;
>> -}
>> -
>> -/**
>> - * amdgpu_ih_clear_fault - Remove a page fault record
>> - *
>> - * @adev: amdgpu device pointer
>> - * @key: 64-bit encoding of PASID and address
>> - *
>> - * This should be called when a page fault has been handled. Any
>> - * future interrupt with this key will be processed as a new
>> - * page fault.
>> - */
>> -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)
>> -{
>> -	unsigned long flags;
>> -	int r;
>> -
>> -	if (!adev->irq.ih.faults)
>> -		return;
>> -
>> -	spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>> -
>> -	r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);
>> -	if (!WARN_ON_ONCE(r < 0)) {
>> -		adev->irq.ih.faults->count--;
>> -		WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>> -	}
>> -
>> -	spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>> -}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>> index a23e1c0..f411ffb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>> @@ -32,13 +32,6 @@ struct amdgpu_device;
>>   #define AMDGPU_IH_CLIENTID_LEGACY 0
>>   #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>   
>> -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>> -struct amdgpu_retryfault_hashtable {
>> -	DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>> -	spinlock_t	lock;
>> -	int		count;
>> -};
>> -
>>   /*
>>    * R6xx+ IH ring
>>    */
>> @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>   	bool			use_doorbell;
>>   	bool			use_bus_addr;
>>   	dma_addr_t		rb_dma_addr; /* only used when use_bus_addr = true */
>> -	struct amdgpu_retryfault_hashtable *faults;
>>   };
>>   
>>   #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>> @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, unsigned ring_size,
>>   			bool use_bus_addr);
>>   void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>   int amdgpu_ih_process(struct amdgpu_device *adev);
>> -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key);
>> -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key);
>>   
>>   #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 1d7e3c1..8b220e9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,
>>   		 adev->vm_manager.fragment_size);
>>   }
>>   
>> +static struct amdgpu_retryfault_hashtable *init_fault_hash(void)
>> +{
>> +	struct amdgpu_retryfault_hashtable *fault_hash;
>> +
>> +	fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
>> +	if (!fault_hash)
>> +		return fault_hash;
>> +
>> +	INIT_CHASH_TABLE(fault_hash->hash,
>> +			AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>> +	spin_lock_init(&fault_hash->lock);
>> +	fault_hash->count = 0;
>> +
>> +	return fault_hash;
>> +}
>> +
>>   /**
>>    * amdgpu_vm_init - initialize a vm instance
>>    *
>> @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>   		vm->pasid = pasid;
>>   	}
>>   
>> +	vm->fault_hash = init_fault_hash();
>> +	if (!vm->fault_hash) {
>> +		r = -ENOMEM;
>> +		goto error_free_root;
>> +	}
>> +
>>   	INIT_KFIFO(vm->faults);
>>   	vm->fault_credit = 16;
>>   
>> @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>   
>>   	/* Clear pending page faults from IH when the VM is destroyed */
>>   	while (kfifo_get(&vm->faults, &fault))
>> -		amdgpu_ih_clear_fault(adev, fault);
>> +		amdgpu_vm_clear_fault(vm->fault_hash, fault);
>>   
>>   	if (vm->pasid) {
>>   		unsigned long flags;
>> @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>   		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>   	}
>>   
>> +	kfree(vm->fault_hash);
>> +	vm->fault_hash = NULL;
>> +
>>   	drm_sched_entity_destroy(&vm->entity);
>>   
>>   	if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>> @@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>   		}
>>   	}
>>   }
>> +
>> +/**
>> + * amdgpu_vm_add_fault - Add a page fault record to fault hash table
>> + *
>> + * @fault_hash: fault hash table
>> + * @key: 64-bit encoding of PASID and address
>> + *
>> + * This should be called when a retry page fault interrupt is
>> + * received. If this is a new page fault, it will be added to a hash
>> + * table. The return value indicates whether this is a new fault, or
>> + * a fault that was already known and is already being handled.
>> + *
>> + * If there are too many pending page faults, this will fail. Retry
>> + * interrupts should be ignored in this case until there is enough
>> + * free space.
>> + *
>> + * Returns 0 if the fault was added, 1 if the fault was already known,
>> + * -ENOSPC if there are too many pending faults.
>> + */
>> +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
>> +{
>> +	unsigned long flags;
>> +	int r = -ENOSPC;
>> +
>> +	if (WARN_ON_ONCE(!fault_hash))
>> +		/* Should be allocated in amdgpu_vm_init
>> +		 */
>> +		return r;
>> +
>> +	spin_lock_irqsave(&fault_hash->lock, flags);
>> +
>> +	/* Only let the hash table fill up to 50% for best performance */
>> +	if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>> +		goto unlock_out;
>> +
>> +	r = chash_table_copy_in(&fault_hash->hash, key, NULL);
>> +	if (!r)
>> +		fault_hash->count++;
>> +
>> +	/* chash_table_copy_in should never fail unless we're losing count */
>> +	WARN_ON_ONCE(r < 0);
>> +
>> +unlock_out:
>> +	spin_unlock_irqrestore(&fault_hash->lock, flags);
>> +	return r;
>> +}
>> +
>> +/**
>> + * amdgpu_vm_clear_fault - Remove a page fault record
>> + *
>> + * @fault_hash: fault hash table
>> + * @key: 64-bit encoding of PASID and address
>> + *
>> + * This should be called when a page fault has been handled. Any
>> + * future interrupt with this key will be processed as a new
>> + * page fault.
>> + */
>> +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
>> +{
>> +	unsigned long flags;
>> +	int r;
>> +
>> +	if (!fault_hash)
>> +		return;
>> +
>> +	spin_lock_irqsave(&fault_hash->lock, flags);
>> +
>> +	r = chash_table_remove(&fault_hash->hash, key, NULL);
>> +	if (!WARN_ON_ONCE(r < 0)) {
>> +		fault_hash->count--;
>> +		WARN_ON_ONCE(fault_hash->count < 0);
>> +	}
>> +
>> +	spin_unlock_irqrestore(&fault_hash->lock, flags);
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index e275ee7..6eb1da1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>   	pid_t	tgid;
>>   };
>>   
>> +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>> +struct amdgpu_retryfault_hashtable {
>> +	DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>> +	spinlock_t	lock;
>> +	int		count;
>> +};
>> +
>>   struct amdgpu_vm {
>>   	/* tree of virtual addresses mapped */
>>   	struct rb_root_cached	va;
>> @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>   	struct ttm_lru_bulk_move lru_bulk_move;
>>   	/* mark whether can do the bulk move */
>>   	bool			bulk_moveable;
>> +	struct amdgpu_retryfault_hashtable *fault_hash;
>>   };
>>   
>>   struct amdgpu_vm_manager {
>> @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>>   void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>>   				struct amdgpu_vm *vm);
>>   
>> +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
>> +
>> +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
>> +
>>   #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> index 5ae5ed2..acbe5a7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> @@ -265,35 +265,36 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>   		return true;
>>   	}
>>   
>> -	addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>> -	key = AMDGPU_VM_FAULT(pasid, addr);
>> -	r = amdgpu_ih_add_fault(adev, key);
>> -
>> -	/* Hash table is full or the fault is already being processed,
>> -	 * ignore further page faults
>> -	 */
>> -	if (r != 0)
>> -		goto ignore_iv;
>> -
>>   	/* Track retry faults in per-VM fault FIFO. */
>>   	spin_lock(&adev->vm_manager.pasid_lock);
>>   	vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>> +	addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>> +	key = AMDGPU_VM_FAULT(pasid, addr);
>>   	if (!vm) {
>>   		/* VM not found, process it normally */
>>   		spin_unlock(&adev->vm_manager.pasid_lock);
>> -		amdgpu_ih_clear_fault(adev, key);
>>   		return true;
>> +	} else {
>> +		r = amdgpu_vm_add_fault(vm->fault_hash, key);
>> +
>> +		/* Hash table is full or the fault is already being processed,
>> +		 * ignore further page faults
>> +		 */
>> +		if (r != 0) {
>> +			spin_unlock(&adev->vm_manager.pasid_lock);
>> +			goto ignore_iv;
>> +		}
>>   	}
>>   	/* No locking required with single writer and single reader */
>>   	r = kfifo_put(&vm->faults, key);
>>   	if (!r) {
>>   		/* FIFO is full. Ignore it until there is space */
>> +		amdgpu_vm_clear_fault(vm->fault_hash, key);
>>   		spin_unlock(&adev->vm_manager.pasid_lock);
>> -		amdgpu_ih_clear_fault(adev, key);
>>   		goto ignore_iv;
>>   	}
>> -	spin_unlock(&adev->vm_manager.pasid_lock);
>>   
>> +	spin_unlock(&adev->vm_manager.pasid_lock);
>>   	/* It's the first fault for this address, process it normally */
>>   	return true;
>>   
>> @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void *handle)
>>   	adev->irq.ih.use_doorbell = true;
>>   	adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;
>>   
>> -	adev->irq.ih.faults = kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>> -	if (!adev->irq.ih.faults)
>> -		return -ENOMEM;
>> -	INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>> -			 AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>> -	spin_lock_init(&adev->irq.ih.faults->lock);
>> -	adev->irq.ih.faults->count = 0;
>> -
>>   	r = amdgpu_irq_init(adev);
>>   
>>   	return r;
>> @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void *handle)
>>   	amdgpu_irq_fini(adev);
>>   	amdgpu_ih_ring_fini(adev);
>>   
>> -	kfree(adev->irq.ih.faults);
>> -	adev->irq.ih.faults = NULL;
>> -
>>   	return 0;
>>   }
>>   
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[-- Attachment #1.2: Type: text/html, Size: 16432 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]         ` <546c9911-b7af-03c8-b5e2-f2bb07932c0b-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2018-09-07 14:55           ` Zeng, Oak
       [not found]             ` <BN6PR12MB1651779BF0B3AE84E1C4430C80000-/b2+HYfkarRSqX7PDniLCgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Zeng, Oak @ 2018-09-07 14:55 UTC (permalink / raw)
  To: Koenig, Christian, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip


[-- Attachment #1.1: Type: text/plain, Size: 17036 bytes --]

Hi Christian,

What is the value of delaying the destruction of the hash until after the vm is destroyed? Since the vm is destroyed, you basically don’t have enough information to page in the correct page to gpuvm. Am I right that the handling of a page fault needs a valid VM? For example, you need the VM to get the corresponding physical address of the faulty page.

After the vm is destroyed, the retry interrupt can be directly discarded, as you can’t find the vm through the pasid. You can think of this as also being one kind of prescreening.

So I stand by the point that there is no need to delay the destruction of the hash until after the vm is destroyed. The prescreening hash can be destroyed at the time of vm_fini.

Thanks,
Oak

From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Friday, September 07, 2018 4:39 AM
To: Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org
Cc: Zeng, Oak <Oak.Zeng@amd.com>
Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm

Hi Oak,

yeah, but this is still a NAK. Making the hash per PASID was not a suggestion but rather a must have.

The VM structures must be destroyed while the processing is still ongoing, or otherwise we would not have a clean OOM handling.

The IDR for PASID lookup can be put into amdgpu_ids.c, you can just replace the IDA already used there with an IDR.

Since the PASID is already freed up with a delay, we should have the grace period for interrupt processing included. If that still isn't sufficient we can still add some delayed work for this.

Regards,
Christian.

Am 07.09.2018 um 06:16 schrieb ozeng:

Hi Christian,

In this implementation, fault hash is made per vm, not per pasid as suggested, based on below considerations:

  *   Delaying the destruction of the hash introduces more effort, like how to set the proper grace period after which no retry interrupt will be generated. We want to avoid those complications if we can.
  *   The  problem of the per vm implementation is that it is hard to delay the destroy of fault hash, because when vm is destroyed, prescreen function won't be able to retrieve the fault hash. But in this case, the prescreen function can either let the interrupt go through (to gmc) or ignore it. From this perspective, it is not very necessary to delay the destroy of hash table.
  *   On the other hand, since ASICs after vega10 have the HW CAM feature. So the SW IV prescreen is only useful for vega10. For all the HW implemented CAM, we can consider the delayed CAM acknowledgment.
  *   Making it per pasid needs another idr to be introduced to correlate the pasid and the hash table - where to put the idr? You would have to make it a global variable, which is not very desirable.

The main purpose of this patch is to solve the inter-process lock issue (when hash table is full), while still keep codes simple.

Also with this patch, the faults kfifo is not necessary any more. See patch 2.

Regards,

Oak

On 09/06/2018 11:28 PM, Oak Zeng wrote:

In stead of share one fault hash table per device, make it

per vm. This can avoid inter-process lock issue when fault

hash table is full.



Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708

Signed-off-by: Oak Zeng <Oak.Zeng@amd.com><mailto:Oak.Zeng@amd.com>

Suggested-by: Christian Konig <Christian.Koenig@amd.com><mailto:Christian.Koenig@amd.com>

Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com><mailto:Felix.Kuehling@amd.com>

---

 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 ------------------------

 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----

 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 ++++++++++++++++++++++++++++++++-

 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++

 drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------

 5 files changed, 127 insertions(+), 110 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c

index 06373d4..4ed8621 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c

@@ -197,78 +197,3 @@ int amdgpu_ih_process(struct amdgpu_device *adev)

   return IRQ_HANDLED;

 }



-/**

- * amdgpu_ih_add_fault - Add a page fault record

- *

- * @adev: amdgpu device pointer

- * @key: 64-bit encoding of PASID and address

- *

- * This should be called when a retry page fault interrupt is

- * received. If this is a new page fault, it will be added to a hash

- * table. The return value indicates whether this is a new fault, or

- * a fault that was already known and is already being handled.

- *

- * If there are too many pending page faults, this will fail. Retry

- * interrupts should be ignored in this case until there is enough

- * free space.

- *

- * Returns 0 if the fault was added, 1 if the fault was already known,

- * -ENOSPC if there are too many pending faults.

- */

-int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)

-{

-  unsigned long flags;

-  int r = -ENOSPC;

-

-  if (WARN_ON_ONCE(!adev->irq.ih.faults))

-          /* Should be allocated in <IP>_ih_sw_init on GPUs that

-           * support retry faults and require retry filtering.

-           */

-          return r;

-

-  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);

-

-  /* Only let the hash table fill up to 50% for best performance */

-  if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))

-          goto unlock_out;

-

-  r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);

-  if (!r)

-          adev->irq.ih.faults->count++;

-

-  /* chash_table_copy_in should never fail unless we're losing count */

-  WARN_ON_ONCE(r < 0);

-

-unlock_out:

-  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);

-  return r;

-}

-

-/**

- * amdgpu_ih_clear_fault - Remove a page fault record

- *

- * @adev: amdgpu device pointer

- * @key: 64-bit encoding of PASID and address

- *

- * This should be called when a page fault has been handled. Any

- * future interrupt with this key will be processed as a new

- * page fault.

- */

-void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)

-{

-  unsigned long flags;

-  int r;

-

-  if (!adev->irq.ih.faults)

-          return;

-

-  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);

-

-  r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);

-  if (!WARN_ON_ONCE(r < 0)) {

-          adev->irq.ih.faults->count--;

-          WARN_ON_ONCE(adev->irq.ih.faults->count < 0);

-  }

-

-  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);

-}

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h

index a23e1c0..f411ffb 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h

@@ -32,13 +32,6 @@ struct amdgpu_device;

 #define AMDGPU_IH_CLIENTID_LEGACY 0

 #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX



-#define AMDGPU_PAGEFAULT_HASH_BITS 8

-struct amdgpu_retryfault_hashtable {

-  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

-  spinlock_t     lock;

-  int            count;

-};

-

 /*

  * R6xx+ IH ring

  */

@@ -57,7 +50,6 @@ struct amdgpu_ih_ring {

   bool                   use_doorbell;

   bool                   use_bus_addr;

   dma_addr_t             rb_dma_addr; /* only used when use_bus_addr = true */

-  struct amdgpu_retryfault_hashtable *faults;

 };



 #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4

@@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, unsigned ring_size,

                   bool use_bus_addr);

 void amdgpu_ih_ring_fini(struct amdgpu_device *adev);

 int amdgpu_ih_process(struct amdgpu_device *adev);

-int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key);

-void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key);



 #endif

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

index 1d7e3c1..8b220e9 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

@@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,

            adev->vm_manager.fragment_size);

 }



+static struct amdgpu_retryfault_hashtable *init_fault_hash(void)

+{

+  struct amdgpu_retryfault_hashtable *fault_hash;

+

+  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);

+  if (!fault_hash)

+          return fault_hash;

+

+  INIT_CHASH_TABLE(fault_hash->hash,

+                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

+  spin_lock_init(&fault_hash->lock);

+  fault_hash->count = 0;

+

+  return fault_hash;

+}

+

 /**

  * amdgpu_vm_init - initialize a vm instance

  *

@@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,

           vm->pasid = pasid;

   }



+  vm->fault_hash = init_fault_hash();

+  if (!vm->fault_hash) {

+          r = -ENOMEM;

+          goto error_free_root;

+  }

+

   INIT_KFIFO(vm->faults);

   vm->fault_credit = 16;



@@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)



   /* Clear pending page faults from IH when the VM is destroyed */

   while (kfifo_get(&vm->faults, &fault))

-          amdgpu_ih_clear_fault(adev, fault);

+          amdgpu_vm_clear_fault(vm->fault_hash, fault);



   if (vm->pasid) {

           unsigned long flags;

@@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)

           spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);

   }



+  kfree(vm->fault_hash);

+  vm->fault_hash = NULL;

+

   drm_sched_entity_destroy(&vm->entity);



   if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {

@@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)

           }

   }

 }

+

+/**

+ * amdgpu_vm_add_fault - Add a page fault record to fault hash table

+ *

+ * @fault_hash: fault hash table

+ * @key: 64-bit encoding of PASID and address

+ *

+ * This should be called when a retry page fault interrupt is

+ * received. If this is a new page fault, it will be added to a hash

+ * table. The return value indicates whether this is a new fault, or

+ * a fault that was already known and is already being handled.

+ *

+ * If there are too many pending page faults, this will fail. Retry

+ * interrupts should be ignored in this case until there is enough

+ * free space.

+ *

+ * Returns 0 if the fault was added, 1 if the fault was already known,

+ * -ENOSPC if there are too many pending faults.

+ */

+int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)

+{

+  unsigned long flags;

+  int r = -ENOSPC;

+

+  if (WARN_ON_ONCE(!fault_hash))

+          /* Should be allocated in amdgpu_vm_init

+           */

+          return r;

+

+  spin_lock_irqsave(&fault_hash->lock, flags);

+

+  /* Only let the hash table fill up to 50% for best performance */

+  if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))

+          goto unlock_out;

+

+  r = chash_table_copy_in(&fault_hash->hash, key, NULL);

+  if (!r)

+          fault_hash->count++;

+

+  /* chash_table_copy_in should never fail unless we're losing count */

+  WARN_ON_ONCE(r < 0);

+

+unlock_out:

+  spin_unlock_irqrestore(&fault_hash->lock, flags);

+  return r;

+}

+

+/**

+ * amdgpu_vm_clear_fault - Remove a page fault record

+ *

+ * @fault_hash: fault hash table

+ * @key: 64-bit encoding of PASID and address

+ *

+ * This should be called when a page fault has been handled. Any

+ * future interrupt with this key will be processed as a new

+ * page fault.

+ */

+void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)

+{

+  unsigned long flags;

+  int r;

+

+  if (!fault_hash)

+          return;

+

+  spin_lock_irqsave(&fault_hash->lock, flags);

+

+  r = chash_table_remove(&fault_hash->hash, key, NULL);

+  if (!WARN_ON_ONCE(r < 0)) {

+          fault_hash->count--;

+          WARN_ON_ONCE(fault_hash->count < 0);

+  }

+

+  spin_unlock_irqrestore(&fault_hash->lock, flags);

+}

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

index e275ee7..6eb1da1 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

@@ -178,6 +178,13 @@ struct amdgpu_task_info {

   pid_t   tgid;

 };



+#define AMDGPU_PAGEFAULT_HASH_BITS 8

+struct amdgpu_retryfault_hashtable {

+  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

+  spinlock_t     lock;

+  int            count;

+};

+

 struct amdgpu_vm {

   /* tree of virtual addresses mapped */

   struct rb_root_cached  va;

@@ -240,6 +247,7 @@ struct amdgpu_vm {

   struct ttm_lru_bulk_move lru_bulk_move;

   /* mark whether can do the bulk move */

   bool                   bulk_moveable;

+  struct amdgpu_retryfault_hashtable *fault_hash;

 };



 struct amdgpu_vm_manager {

@@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);

 void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,

                          struct amdgpu_vm *vm);



+int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);

+

+void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);

+

 #endif

diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c

index 5ae5ed2..acbe5a7 100644

--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c

+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c

@@ -265,35 +265,36 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)

           return true;

   }



-  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);

-  key = AMDGPU_VM_FAULT(pasid, addr);

-  r = amdgpu_ih_add_fault(adev, key);

-

-  /* Hash table is full or the fault is already being processed,

-   * ignore further page faults

-   */

-  if (r != 0)

-          goto ignore_iv;

-

   /* Track retry faults in per-VM fault FIFO. */

   spin_lock(&adev->vm_manager.pasid_lock);

   vm = idr_find(&adev->vm_manager.pasid_idr, pasid);

+  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);

+  key = AMDGPU_VM_FAULT(pasid, addr);

   if (!vm) {

           /* VM not found, process it normally */

           spin_unlock(&adev->vm_manager.pasid_lock);

-          amdgpu_ih_clear_fault(adev, key);

           return true;

+  } else {

+          r = amdgpu_vm_add_fault(vm->fault_hash, key);

+

+          /* Hash table is full or the fault is already being processed,

+           * ignore further page faults

+           */

+          if (r != 0) {

+                  spin_unlock(&adev->vm_manager.pasid_lock);

+                  goto ignore_iv;

+          }

   }

   /* No locking required with single writer and single reader */

   r = kfifo_put(&vm->faults, key);

   if (!r) {

           /* FIFO is full. Ignore it until there is space */

+          amdgpu_vm_clear_fault(vm->fault_hash, key);

           spin_unlock(&adev->vm_manager.pasid_lock);

-          amdgpu_ih_clear_fault(adev, key);

           goto ignore_iv;

   }

-  spin_unlock(&adev->vm_manager.pasid_lock);



+  spin_unlock(&adev->vm_manager.pasid_lock);

   /* It's the first fault for this address, process it normally */

   return true;



@@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void *handle)

   adev->irq.ih.use_doorbell = true;

   adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;



-  adev->irq.ih.faults = kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);

-  if (!adev->irq.ih.faults)

-          return -ENOMEM;

-  INIT_CHASH_TABLE(adev->irq.ih.faults->hash,

-                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

-  spin_lock_init(&adev->irq.ih.faults->lock);

-  adev->irq.ih.faults->count = 0;

-

   r = amdgpu_irq_init(adev);



   return r;

@@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void *handle)

   amdgpu_irq_fini(adev);

   amdgpu_ih_ring_fini(adev);



-  kfree(adev->irq.ih.faults);

-  adev->irq.ih.faults = NULL;

-

   return 0;

 }







_______________________________________________

amd-gfx mailing list

amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>

https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[-- Attachment #1.2: Type: text/html, Size: 36856 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]             ` <BN6PR12MB1651779BF0B3AE84E1C4430C80000-/b2+HYfkarRSqX7PDniLCgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2018-09-07 15:18               ` Christian König
       [not found]                 ` <89e7515f-53f1-9f10-aeb6-7fc1b8df2702-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Christian König @ 2018-09-07 15:18 UTC (permalink / raw)
  To: Zeng, Oak, Oak Zeng, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Yang, Philip


[-- Attachment #1.1: Type: text/plain, Size: 22587 bytes --]

> Am I right that the handling of page fault need a valid VM? 
No, exactly that view is incorrect.

The VM is meant to be a memory management structure of page tables and 
is completely pointless for fault processing because it represents the 
future state of the page tables and not the current one.

What we need for fault processing is a new structure built around PASIDs 
which is fed with addresses when page tables are moved around.

Alternatively I hope that we can use the SDMA to walk the page tables 
and update the required entries by just using the address.

Regards,
Christian.

Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>
> Hi Christian,
>
> What is the value of delay the destroy of hash to after vm is 
> destroyed? Since vm is destroyed, you basically don’t have enough 
> information to page in the correct page to gpuvm. Am I right that 
> the handling of page fault need a valid VM? For example, you need the 
> VM to get the corresponding physical address of the faulty page.
>
> After vm is destroyed, the retry interrupt can be directly discarded as 
> you can’t find vm through pasid. You can think this is also one kind 
> of prescreen.
>
> So I stand by the point that there is no need to delay the 
> destroy of hash to after vm is destroyed. Prescreening hash can be 
> destroyed at the time of vm_fini.
>
> Thanks,
>
> Oak
>
> *From:*Christian König <ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> *Sent:* Friday, September 07, 2018 4:39 AM
> *To:* Zeng, Oak <Oak.Zeng-5C7GfCeVMHo@public.gmane.org>; Oak Zeng <zengshanjun-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>; 
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> *Cc:* Zeng, Oak <Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
>
> Hi Oak,
>
> yeah, but this is still a NAK. Making the hash per PASID was not a 
> suggestion but rather a must have.
>
> The VM structures must be destroyed while the processing is still 
> ongoing, or otherwise we would not have a clean OOM handling.
>
> The IDR for PASID lockup can be put into amdgpu_ids.c, you can just 
> replace the IDA already used there with an IDR.
>
> Since the PASID is already freed up delayed we should have the grace 
> period for interrupt processing included. If that still isn't 
> sufficient we can still add some delayed work for this.
>
> Regards,
> Christian.
>
> Am 07.09.2018 um 06:16 schrieb ozeng:
>
>     Hi Christian,
>
>     In this implementation, fault hash is made per vm, not per pasid
>     as suggested, based on below considerations:
>
>       * Delay the destroy of hash introduces more effort like how to
>         set the proper grace period after which no retry interrupt
>         will be generated. We want to avoid those complication if we can.
>       * The  problem of the per vm implementation is that it is hard
>         to delay the destroy of fault hash, because when vm is
>         destroyed, prescreen function won't be able to retrieve the
>         fault hash. But in this case, the prescreen function can
>         either let the interrupt go through (to gmc) or ignore it.
>         From this perspective, it is not very necessary to delay the
>         destroy of hash table.
>       * On the other hand, since ASICs after vega10 have the HW CAM
>         feature. So the SW IV prescreen is only useful for vega10. For
>         all the HW implemented CAM, we can consider the delayed CAM
>         acknowledgment.
>       * Making it per pasid need to introduce another idr to correlate
>         pasid and hash table - where to put the idr? You will have to
>         make it a global variable which is not very desirable.
>
>     The main purpose of this patch is to solve the inter-process lock
>     issue (when hash table is full), while still keep codes simple.
>
>     Also with this patch, the faults kfifo is not necessary any more.
>     See patch 2.
>
>     Regards,
>
>     Oak
>
>     On 09/06/2018 11:28 PM, Oak Zeng wrote:
>
>         In stead of share one fault hash table per device, make it
>
>         per vm. This can avoid inter-process lock issue when fault
>
>         hash table is full.
>
>         Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>
>         Signed-off-by: Oak Zeng<Oak.Zeng-5C7GfCeVMHo@public.gmane.org> <mailto:Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
>
>         Suggested-by: Christian Konig<Christian.Koenig-5C7GfCeVMHo@public.gmane.org> <mailto:Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>
>         Suggested-by: Felix Kuehling<Felix.Kuehling-5C7GfCeVMHo@public.gmane.org> <mailto:Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
>
>         ---
>
>           drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 ------------------------
>
>           drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>
>           drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 ++++++++++++++++++++++++++++++++-
>
>           drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>
>           drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------
>
>           5 files changed, 127 insertions(+), 110 deletions(-)
>
>         diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>
>         index 06373d4..4ed8621 100644
>
>         --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>
>         +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>
>         @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct amdgpu_device *adev)
>
>             return IRQ_HANDLED;
>
>           }
>
>           
>
>         -/**
>
>         - * amdgpu_ih_add_fault - Add a page fault record
>
>         - *
>
>         - * @adev: amdgpu device pointer
>
>         - * @key: 64-bit encoding of PASID and address
>
>         - *
>
>         - * This should be called when a retry page fault interrupt is
>
>         - * received. If this is a new page fault, it will be added to a hash
>
>         - * table. The return value indicates whether this is a new fault, or
>
>         - * a fault that was already known and is already being handled.
>
>         - *
>
>         - * If there are too many pending page faults, this will fail. Retry
>
>         - * interrupts should be ignored in this case until there is enough
>
>         - * free space.
>
>         - *
>
>         - * Returns 0 if the fault was added, 1 if the fault was already known,
>
>         - * -ENOSPC if there are too many pending faults.
>
>         - */
>
>         -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)
>
>         -{
>
>         -  unsigned long flags;
>
>         -  int r = -ENOSPC;
>
>         -
>
>         -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>
>         -          /* Should be allocated in <IP>_ih_sw_init on GPUs that
>
>         -           * support retry faults and require retry filtering.
>
>         -           */
>
>         -          return r;
>
>         -
>
>         -  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>
>         -
>
>         -  /* Only let the hash table fill up to 50% for best performance */
>
>         -  if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>
>         -          goto unlock_out;
>
>         -
>
>         -  r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);
>
>         -  if (!r)
>
>         -          adev->irq.ih.faults->count++;
>
>         -
>
>         -  /* chash_table_copy_in should never fail unless we're losing count */
>
>         -  WARN_ON_ONCE(r < 0);
>
>         -
>
>         -unlock_out:
>
>         -  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>
>         -  return r;
>
>         -}
>
>         -
>
>         -/**
>
>         - * amdgpu_ih_clear_fault - Remove a page fault record
>
>         - *
>
>         - * @adev: amdgpu device pointer
>
>         - * @key: 64-bit encoding of PASID and address
>
>         - *
>
>         - * This should be called when a page fault has been handled. Any
>
>         - * future interrupt with this key will be processed as a new
>
>         - * page fault.
>
>         - */
>
>         -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)
>
>         -{
>
>         -  unsigned long flags;
>
>         -  int r;
>
>         -
>
>         -  if (!adev->irq.ih.faults)
>
>         -          return;
>
>         -
>
>         -  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>
>         -
>
>         -  r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);
>
>         -  if (!WARN_ON_ONCE(r < 0)) {
>
>         -          adev->irq.ih.faults->count--;
>
>         -          WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>
>         -  }
>
>         -
>
>         -  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>
>         -}
>
>         diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>
>         index a23e1c0..f411ffb 100644
>
>         --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>
>         +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>
>         @@ -32,13 +32,6 @@ struct amdgpu_device;
>
>           #define AMDGPU_IH_CLIENTID_LEGACY 0
>
>           #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>
>           
>
>         -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>
>         -struct amdgpu_retryfault_hashtable {
>
>         -  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>
>         -  spinlock_t     lock;
>
>         -  int            count;
>
>         -};
>
>         -
>
>           /*
>
>            * R6xx+ IH ring
>
>            */
>
>         @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>
>             bool                   use_doorbell;
>
>             bool                   use_bus_addr;
>
>             dma_addr_t             rb_dma_addr; /* only used when use_bus_addr = true */
>
>         -  struct amdgpu_retryfault_hashtable *faults;
>
>           };
>
>           
>
>           #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>
>         @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, unsigned ring_size,
>
>                             bool use_bus_addr);
>
>           void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>
>           int amdgpu_ih_process(struct amdgpu_device *adev);
>
>         -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key);
>
>         -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key);
>
>           
>
>           #endif
>
>         diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>
>         index 1d7e3c1..8b220e9 100644
>
>         --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>
>         +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>
>         @@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,
>
>                      adev->vm_manager.fragment_size);
>
>           }
>
>           
>
>         +static struct amdgpu_retryfault_hashtable *init_fault_hash(void)
>
>         +{
>
>         +  struct amdgpu_retryfault_hashtable *fault_hash;
>
>         +
>
>         +  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
>
>         +  if (!fault_hash)
>
>         +          return fault_hash;
>
>         +
>
>         +  INIT_CHASH_TABLE(fault_hash->hash,
>
>         +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>
>         +  spin_lock_init(&fault_hash->lock);
>
>         +  fault_hash->count = 0;
>
>         +
>
>         +  return fault_hash;
>
>         +}
>
>         +
>
>           /**
>
>            * amdgpu_vm_init - initialize a vm instance
>
>            *
>
>         @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>
>                     vm->pasid = pasid;
>
>             }
>
>           
>
>         +  vm->fault_hash = init_fault_hash();
>
>         +  if (!vm->fault_hash) {
>
>         +          r = -ENOMEM;
>
>         +          goto error_free_root;
>
>         +  }
>
>         +
>
>             INIT_KFIFO(vm->faults);
>
>             vm->fault_credit = 16;
>
>           
>
>         @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>
>           
>
>             /* Clear pending page faults from IH when the VM is destroyed */
>
>             while (kfifo_get(&vm->faults, &fault))
>
>         -          amdgpu_ih_clear_fault(adev, fault);
>
>         +          amdgpu_vm_clear_fault(vm->fault_hash, fault);
>
>           
>
>             if (vm->pasid) {
>
>                     unsigned long flags;
>
>         @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>
>                     spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>
>             }
>
>           
>
>         +  kfree(vm->fault_hash);
>
>         +  vm->fault_hash = NULL;
>
>         +
>
>             drm_sched_entity_destroy(&vm->entity);
>
>           
>
>             if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>
>         @@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>
>                     }
>
>             }
>
>           }
>
>         +
>
>         +/**
>
>         + * amdgpu_vm_add_fault - Add a page fault record to fault hash table
>
>         + *
>
>         + * @fault_hash: fault hash table
>
>         + * @key: 64-bit encoding of PASID and address
>
>         + *
>
>         + * This should be called when a retry page fault interrupt is
>
>         + * received. If this is a new page fault, it will be added to a hash
>
>         + * table. The return value indicates whether this is a new fault, or
>
>         + * a fault that was already known and is already being handled.
>
>         + *
>
>         + * If there are too many pending page faults, this will fail. Retry
>
>         + * interrupts should be ignored in this case until there is enough
>
>         + * free space.
>
>         + *
>
>         + * Returns 0 if the fault was added, 1 if the fault was already known,
>
>         + * -ENOSPC if there are too many pending faults.
>
>         + */
>
>         +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
>
>         +{
>
>         +  unsigned long flags;
>
>         +  int r = -ENOSPC;
>
>         +
>
>         +  if (WARN_ON_ONCE(!fault_hash))
>
>         +          /* Should be allocated in amdgpu_vm_init
>
>         +           */
>
>         +          return r;
>
>         +
>
>         +  spin_lock_irqsave(&fault_hash->lock, flags);
>
>         +
>
>         +  /* Only let the hash table fill up to 50% for best performance */
>
>         +  if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>
>         +          goto unlock_out;
>
>         +
>
>         +  r = chash_table_copy_in(&fault_hash->hash, key, NULL);
>
>         +  if (!r)
>
>         +          fault_hash->count++;
>
>         +
>
>         +  /* chash_table_copy_in should never fail unless we're losing count */
>
>         +  WARN_ON_ONCE(r < 0);
>
>         +
>
>         +unlock_out:
>
>         +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>
>         +  return r;
>
>         +}
>
>         +
>
>         +/**
>
>         + * amdgpu_vm_clear_fault - Remove a page fault record
>
>         + *
>
>         + * @fault_hash: fault hash table
>
>         + * @key: 64-bit encoding of PASID and address
>
>         + *
>
>         + * This should be called when a page fault has been handled. Any
>
>         + * future interrupt with this key will be processed as a new
>
>         + * page fault.
>
>         + */
>
>         +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
>
>         +{
>
>         +  unsigned long flags;
>
>         +  int r;
>
>         +
>
>         +  if (!fault_hash)
>
>         +          return;
>
>         +
>
>         +  spin_lock_irqsave(&fault_hash->lock, flags);
>
>         +
>
>         +  r = chash_table_remove(&fault_hash->hash, key, NULL);
>
>         +  if (!WARN_ON_ONCE(r < 0)) {
>
>         +          fault_hash->count--;
>
>         +          WARN_ON_ONCE(fault_hash->count < 0);
>
>         +  }
>
>         +
>
>         +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>
>         +}
>
>         diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>
>         index e275ee7..6eb1da1 100644
>
>         --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>
>         +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>
>         @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>
>             pid_t   tgid;
>
>           };
>
>           
>
>         +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>
>         +struct amdgpu_retryfault_hashtable {
>
>         +  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>
>         +  spinlock_t     lock;
>
>         +  int            count;
>
>         +};
>
>         +
>
>           struct amdgpu_vm {
>
>             /* tree of virtual addresses mapped */
>
>             struct rb_root_cached  va;
>
>         @@ -240,6 +247,7 @@ struct amdgpu_vm {
>
>             struct ttm_lru_bulk_move lru_bulk_move;
>
>             /* mark whether can do the bulk move */
>
>             bool                   bulk_moveable;
>
>         +  struct amdgpu_retryfault_hashtable *fault_hash;
>
>           };
>
>           
>
>           struct amdgpu_vm_manager {
>
>         @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>
>           void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>
>                                    struct amdgpu_vm *vm);
>
>           
>
>         +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
>
>         +
>
>         +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
>
>         +
>
>           #endif
>
>         diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>
>         index 5ae5ed2..acbe5a7 100644
>
>         --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>
>         +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>
>         @@ -265,35 +265,36 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>
>                     return true;
>
>             }
>
>           
>
>         -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>
>         -  key = AMDGPU_VM_FAULT(pasid, addr);
>
>         -  r = amdgpu_ih_add_fault(adev, key);
>
>         -
>
>         -  /* Hash table is full or the fault is already being processed,
>
>         -   * ignore further page faults
>
>         -   */
>
>         -  if (r != 0)
>
>         -          goto ignore_iv;
>
>         -
>
>             /* Track retry faults in per-VM fault FIFO. */
>
>             spin_lock(&adev->vm_manager.pasid_lock);
>
>             vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>
>         +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>
>         +  key = AMDGPU_VM_FAULT(pasid, addr);
>
>             if (!vm) {
>
>                     /* VM not found, process it normally */
>
>                     spin_unlock(&adev->vm_manager.pasid_lock);
>
>         -          amdgpu_ih_clear_fault(adev, key);
>
>                     return true;
>
>         +  } else {
>
>         +          r = amdgpu_vm_add_fault(vm->fault_hash, key);
>
>         +
>
>         +          /* Hash table is full or the fault is already being processed,
>
>         +           * ignore further page faults
>
>         +           */
>
>         +          if (r != 0) {
>
>         +                  spin_unlock(&adev->vm_manager.pasid_lock);
>
>         +                  goto ignore_iv;
>
>         +          }
>
>             }
>
>             /* No locking required with single writer and single reader */
>
>             r = kfifo_put(&vm->faults, key);
>
>             if (!r) {
>
>                     /* FIFO is full. Ignore it until there is space */
>
>         +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>
>                     spin_unlock(&adev->vm_manager.pasid_lock);
>
>         -          amdgpu_ih_clear_fault(adev, key);
>
>                     goto ignore_iv;
>
>             }
>
>         -  spin_unlock(&adev->vm_manager.pasid_lock);
>
>           
>
>         +  spin_unlock(&adev->vm_manager.pasid_lock);
>
>             /* It's the first fault for this address, process it normally */
>
>             return true;
>
>           
>
>         @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void *handle)
>
>             adev->irq.ih.use_doorbell = true;
>
>             adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;
>
>           
>
>         -  adev->irq.ih.faults = kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>
>         -  if (!adev->irq.ih.faults)
>
>         -          return -ENOMEM;
>
>         -  INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>
>         -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>
>         -  spin_lock_init(&adev->irq.ih.faults->lock);
>
>         -  adev->irq.ih.faults->count = 0;
>
>         -
>
>             r = amdgpu_irq_init(adev);
>
>           
>
>             return r;
>
>         @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void *handle)
>
>             amdgpu_irq_fini(adev);
>
>             amdgpu_ih_ring_fini(adev);
>
>           
>
>         -  kfree(adev->irq.ih.faults);
>
>         -  adev->irq.ih.faults = NULL;
>
>         -
>
>             return 0;
>
>           }
>
>           
>
>
>
>
>
>     _______________________________________________
>
>     amd-gfx mailing list
>
>     amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org <mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
>
>     https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>


[-- Attachment #1.2: Type: text/html, Size: 40471 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                 ` <89e7515f-53f1-9f10-aeb6-7fc1b8df2702-5C7GfCeVMHo@public.gmane.org>
@ 2018-09-07 15:30                   ` Zeng, Oak
       [not found]                     ` <BN6PR12MB1651CA24220B91670A75890C80000-/b2+HYfkarRSqX7PDniLCgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2018-09-08  0:27                   ` Felix Kuehling
  1 sibling, 1 reply; 27+ messages in thread
From: Zeng, Oak @ 2018-09-07 15:30 UTC (permalink / raw)
  To: Koenig, Christian, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip


[-- Attachment #1.1: Type: text/plain, Size: 18260 bytes --]

Without a VM, how can you get the physical address of the faulty virtual address? Where this information should be hold?

Thanks,
Oak

From: Koenig, Christian
Sent: Monday, September 10, 2018 7:20 AM
To: Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org; Yang, Philip <Philip.Yang@amd.com>
Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm

Am I right that the handling of page fault need a valid VM?
No, exactly that view is incorrect.

The VM is meant to be a memory management structure of page tables and is completely pointless for fault processing because it represents the future state of the page tables and not the current one.

What we need for fault processing is a new structure built around PASIDs which is fed with addresses when page tables are moved around.

Alternatively I hope that we can use the SDMA to walk the page tables and update the required entries by just using the address.

Regards,
Christian.

Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
Hi Christian,

What is the value of delaying the destroy of the hash until after the vm is destroyed? Since the vm is destroyed, you basically don’t have enough information to page in the correct page to gpuvm. Am I right that the handling of a page fault needs a valid VM? For example, you need the VM to get the corresponding physical address of the faulty page.

After the vm is destroyed, the retry interrupt can be directly discarded as you can’t find the vm through the pasid. You can think of this as also being one kind of prescreen.

So I stand by the point that there is no need to delay the destroy of the hash until after the vm is destroyed. The prescreening hash can be destroyed at the time of vm_fini.

Thanks,
Oak

From: Christian König <ckoenig.leichtzumerken@gmail.com><mailto:ckoenig.leichtzumerken@gmail.com>
Sent: Friday, September 07, 2018 4:39 AM
To: Zeng, Oak <Oak.Zeng@amd.com><mailto:Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com><mailto:zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Zeng, Oak <Oak.Zeng@amd.com><mailto:Oak.Zeng@amd.com>
Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm

Hi Oak,

yeah, but this is still a NAK. Making the hash per PASID was not a suggestion but rather a must have.

The VM structures must be destroyed while the processing is still ongoing, or otherwise we would not have a clean OOM handling.

The IDR for PASID lockup can be put into amdgpu_ids.c, you can just replace the IDA already used there with an IDR.

Since the PASID is already freed up delayed we should have the grace period for interrupt processing included. If that still isn't sufficient we can still add some delayed work for this.

Regards,
Christian.

Am 07.09.2018 um 06:16 schrieb ozeng:

Hi Christian,

In this implementation, fault hash is made per vm, not per pasid as suggested, based on below considerations:

  *   Delay the destroy of hash introduces more effort like how to set the proper grace period after which no retry interrupt will be generated. We want to avoid those complication if we can.
  *   The  problem of the per vm implementation is that it is hard to delay the destroy of fault hash, because when vm is destroyed, prescreen function won't be able to retrieve the fault hash. But in this case, the prescreen function can either let the interrupt go through (to gmc) or ignore it. From this perspective, it is not very necessary to delay the destroy of hash table.
  *   On the other hand, since ASICs after vega10 have the HW CAM feature. So the SW IV prescreen is only useful for vega10. For all the HW implemented CAM, we can consider the delayed CAM acknowledgment.
  *   Making it per pasid need to introduce another idr to correlate pasid and hash table - where to put the idr? You will have to make it a global variable which is not very desirable.

The main purpose of this patch is to solve the inter-process lock issue (when hash table is full), while still keep codes simple.

Also with this patch, the faults kfifo is not necessary any more. See patch 2.

Regards,

Oak

On 09/06/2018 11:28 PM, Oak Zeng wrote:

In stead of share one fault hash table per device, make it

per vm. This can avoid inter-process lock issue when fault

hash table is full.



Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708

Signed-off-by: Oak Zeng <Oak.Zeng@amd.com><mailto:Oak.Zeng@amd.com>

Suggested-by: Christian Konig <Christian.Koenig@amd.com><mailto:Christian.Koenig@amd.com>

Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com><mailto:Felix.Kuehling@amd.com>

---

 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 ------------------------

 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----

 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 ++++++++++++++++++++++++++++++++-

 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++

 drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------

 5 files changed, 127 insertions(+), 110 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c

index 06373d4..4ed8621 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c

@@ -197,78 +197,3 @@ int amdgpu_ih_process(struct amdgpu_device *adev)

   return IRQ_HANDLED;

 }



-/**

- * amdgpu_ih_add_fault - Add a page fault record

- *

- * @adev: amdgpu device pointer

- * @key: 64-bit encoding of PASID and address

- *

- * This should be called when a retry page fault interrupt is

- * received. If this is a new page fault, it will be added to a hash

- * table. The return value indicates whether this is a new fault, or

- * a fault that was already known and is already being handled.

- *

- * If there are too many pending page faults, this will fail. Retry

- * interrupts should be ignored in this case until there is enough

- * free space.

- *

- * Returns 0 if the fault was added, 1 if the fault was already known,

- * -ENOSPC if there are too many pending faults.

- */

-int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)

-{

-  unsigned long flags;

-  int r = -ENOSPC;

-

-  if (WARN_ON_ONCE(!adev->irq.ih.faults))

-          /* Should be allocated in <IP>_ih_sw_init on GPUs that

-           * support retry faults and require retry filtering.

-           */

-          return r;

-

-  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);

-

-  /* Only let the hash table fill up to 50% for best performance */

-  if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))

-          goto unlock_out;

-

-  r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);

-  if (!r)

-          adev->irq.ih.faults->count++;

-

-  /* chash_table_copy_in should never fail unless we're losing count */

-  WARN_ON_ONCE(r < 0);

-

-unlock_out:

-  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);

-  return r;

-}

-

-/**

- * amdgpu_ih_clear_fault - Remove a page fault record

- *

- * @adev: amdgpu device pointer

- * @key: 64-bit encoding of PASID and address

- *

- * This should be called when a page fault has been handled. Any

- * future interrupt with this key will be processed as a new

- * page fault.

- */

-void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)

-{

-  unsigned long flags;

-  int r;

-

-  if (!adev->irq.ih.faults)

-          return;

-

-  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);

-

-  r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);

-  if (!WARN_ON_ONCE(r < 0)) {

-          adev->irq.ih.faults->count--;

-          WARN_ON_ONCE(adev->irq.ih.faults->count < 0);

-  }

-

-  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);

-}

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h

index a23e1c0..f411ffb 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h

@@ -32,13 +32,6 @@ struct amdgpu_device;

 #define AMDGPU_IH_CLIENTID_LEGACY 0

 #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX



-#define AMDGPU_PAGEFAULT_HASH_BITS 8

-struct amdgpu_retryfault_hashtable {

-  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

-  spinlock_t     lock;

-  int            count;

-};

-

 /*

  * R6xx+ IH ring

  */

@@ -57,7 +50,6 @@ struct amdgpu_ih_ring {

   bool                   use_doorbell;

   bool                   use_bus_addr;

   dma_addr_t             rb_dma_addr; /* only used when use_bus_addr = true */

-  struct amdgpu_retryfault_hashtable *faults;

 };



 #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4

@@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, unsigned ring_size,

                   bool use_bus_addr);

 void amdgpu_ih_ring_fini(struct amdgpu_device *adev);

 int amdgpu_ih_process(struct amdgpu_device *adev);

-int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key);

-void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key);



 #endif

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

index 1d7e3c1..8b220e9 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

@@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,

            adev->vm_manager.fragment_size);

 }



+static struct amdgpu_retryfault_hashtable *init_fault_hash(void)

+{

+  struct amdgpu_retryfault_hashtable *fault_hash;

+

+  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);

+  if (!fault_hash)

+          return fault_hash;

+

+  INIT_CHASH_TABLE(fault_hash->hash,

+                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

+  spin_lock_init(&fault_hash->lock);

+  fault_hash->count = 0;

+

+  return fault_hash;

+}

+

 /**

  * amdgpu_vm_init - initialize a vm instance

  *

@@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,

           vm->pasid = pasid;

   }



+  vm->fault_hash = init_fault_hash();

+  if (!vm->fault_hash) {

+          r = -ENOMEM;

+          goto error_free_root;

+  }

+

   INIT_KFIFO(vm->faults);

   vm->fault_credit = 16;



@@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)



   /* Clear pending page faults from IH when the VM is destroyed */

   while (kfifo_get(&vm->faults, &fault))

-          amdgpu_ih_clear_fault(adev, fault);

+          amdgpu_vm_clear_fault(vm->fault_hash, fault);



   if (vm->pasid) {

           unsigned long flags;

@@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)

           spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);

   }



+  kfree(vm->fault_hash);

+  vm->fault_hash = NULL;

+

   drm_sched_entity_destroy(&vm->entity);



   if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {

@@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)

           }

   }

 }

+

+/**

+ * amdgpu_vm_add_fault - Add a page fault record to fault hash table

+ *

+ * @fault_hash: fault hash table

+ * @key: 64-bit encoding of PASID and address

+ *

+ * This should be called when a retry page fault interrupt is

+ * received. If this is a new page fault, it will be added to a hash

+ * table. The return value indicates whether this is a new fault, or

+ * a fault that was already known and is already being handled.

+ *

+ * If there are too many pending page faults, this will fail. Retry

+ * interrupts should be ignored in this case until there is enough

+ * free space.

+ *

+ * Returns 0 if the fault was added, 1 if the fault was already known,

+ * -ENOSPC if there are too many pending faults.

+ */

+int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)

+{

+  unsigned long flags;

+  int r = -ENOSPC;

+

+  if (WARN_ON_ONCE(!fault_hash))

+          /* Should be allocated in amdgpu_vm_init

+           */

+          return r;

+

+  spin_lock_irqsave(&fault_hash->lock, flags);

+

+  /* Only let the hash table fill up to 50% for best performance */

+  if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))

+          goto unlock_out;

+

+  r = chash_table_copy_in(&fault_hash->hash, key, NULL);

+  if (!r)

+          fault_hash->count++;

+

+  /* chash_table_copy_in should never fail unless we're losing count */

+  WARN_ON_ONCE(r < 0);

+

+unlock_out:

+  spin_unlock_irqrestore(&fault_hash->lock, flags);

+  return r;

+}

+

+/**

+ * amdgpu_vm_clear_fault - Remove a page fault record

+ *

+ * @fault_hash: fault hash table

+ * @key: 64-bit encoding of PASID and address

+ *

+ * This should be called when a page fault has been handled. Any

+ * future interrupt with this key will be processed as a new

+ * page fault.

+ */

+void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)

+{

+  unsigned long flags;

+  int r;

+

+  if (!fault_hash)

+          return;

+

+  spin_lock_irqsave(&fault_hash->lock, flags);

+

+  r = chash_table_remove(&fault_hash->hash, key, NULL);

+  if (!WARN_ON_ONCE(r < 0)) {

+          fault_hash->count--;

+          WARN_ON_ONCE(fault_hash->count < 0);

+  }

+

+  spin_unlock_irqrestore(&fault_hash->lock, flags);

+}

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

index e275ee7..6eb1da1 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

@@ -178,6 +178,13 @@ struct amdgpu_task_info {

   pid_t   tgid;

 };



+#define AMDGPU_PAGEFAULT_HASH_BITS 8

+struct amdgpu_retryfault_hashtable {

+  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

+  spinlock_t     lock;

+  int            count;

+};

+

 struct amdgpu_vm {

   /* tree of virtual addresses mapped */

   struct rb_root_cached  va;

@@ -240,6 +247,7 @@ struct amdgpu_vm {

   struct ttm_lru_bulk_move lru_bulk_move;

   /* mark whether can do the bulk move */

   bool                   bulk_moveable;

+  struct amdgpu_retryfault_hashtable *fault_hash;

 };



 struct amdgpu_vm_manager {

@@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);

 void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,

                          struct amdgpu_vm *vm);



+int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);

+

+void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);

+

 #endif

diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c

index 5ae5ed2..acbe5a7 100644

--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c

+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c

@@ -265,35 +265,36 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)

           return true;

   }



-  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);

-  key = AMDGPU_VM_FAULT(pasid, addr);

-  r = amdgpu_ih_add_fault(adev, key);

-

-  /* Hash table is full or the fault is already being processed,

-   * ignore further page faults

-   */

-  if (r != 0)

-          goto ignore_iv;

-

   /* Track retry faults in per-VM fault FIFO. */

   spin_lock(&adev->vm_manager.pasid_lock);

   vm = idr_find(&adev->vm_manager.pasid_idr, pasid);

+  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);

+  key = AMDGPU_VM_FAULT(pasid, addr);

   if (!vm) {

           /* VM not found, process it normally */

           spin_unlock(&adev->vm_manager.pasid_lock);

-          amdgpu_ih_clear_fault(adev, key);

           return true;

+  } else {

+          r = amdgpu_vm_add_fault(vm->fault_hash, key);

+

+          /* Hash table is full or the fault is already being processed,

+           * ignore further page faults

+           */

+          if (r != 0) {

+                  spin_unlock(&adev->vm_manager.pasid_lock);

+                  goto ignore_iv;

+          }

   }

   /* No locking required with single writer and single reader */

   r = kfifo_put(&vm->faults, key);

   if (!r) {

           /* FIFO is full. Ignore it until there is space */

+          amdgpu_vm_clear_fault(vm->fault_hash, key);

           spin_unlock(&adev->vm_manager.pasid_lock);

-          amdgpu_ih_clear_fault(adev, key);

           goto ignore_iv;

   }

-  spin_unlock(&adev->vm_manager.pasid_lock);



+  spin_unlock(&adev->vm_manager.pasid_lock);

   /* It's the first fault for this address, process it normally */

   return true;



@@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void *handle)

   adev->irq.ih.use_doorbell = true;

   adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;



-  adev->irq.ih.faults = kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);

-  if (!adev->irq.ih.faults)

-          return -ENOMEM;

-  INIT_CHASH_TABLE(adev->irq.ih.faults->hash,

-                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

-  spin_lock_init(&adev->irq.ih.faults->lock);

-  adev->irq.ih.faults->count = 0;

-

   r = amdgpu_irq_init(adev);



   return r;

@@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void *handle)

   amdgpu_irq_fini(adev);

   amdgpu_ih_ring_fini(adev);



-  kfree(adev->irq.ih.faults);

-  adev->irq.ih.faults = NULL;

-

   return 0;

 }








_______________________________________________

amd-gfx mailing list

amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>

https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #1.2: Type: text/html, Size: 41412 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                     ` <BN6PR12MB1651CA24220B91670A75890C80000-/b2+HYfkarRSqX7PDniLCgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2018-09-07 15:41                       ` Christian König
       [not found]                         ` <0e52fe27-716c-f9d3-9b2c-5540825ffcef-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Christian König @ 2018-09-07 15:41 UTC (permalink / raw)
  To: Zeng, Oak, Oak Zeng, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Yang, Philip


[-- Attachment #1.1: Type: text/plain, Size: 26059 bytes --]

The VM doesn't know that information either.

In other words when you request the address of a PT you get the address 
where memory management has moved it, not the address where the hardware 
is processing the fault from.

I see only a few possible solutions for that:
1. We use an extra structure in the kernel driver which holds the 
current address of the PTs and is feed by memory management when buffers 
move around.
2. We change the SDMA firmware to do the walk for us and update the PTEs 
based on the information in the page directory.
3. We use the UTCL walker to get us to the correct address which is then 
updated with the SDMA.

Regards,
Christian.

Am 07.09.2018 um 17:30 schrieb Zeng, Oak:
>
> Without a VM, how can you get the physical address of the faulty 
> virtual address? Where this information should be hold?
>
> Thanks,
>
> Oak
>
> *From:*Koenig, Christian
> *Sent:* Monday, September 10, 2018 7:20 AM
> *To:* Zeng, Oak <Oak.Zeng-5C7GfCeVMHo@public.gmane.org>; Oak Zeng <zengshanjun-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>; 
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org; Yang, Philip <Philip.Yang-5C7GfCeVMHo@public.gmane.org>
> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
>
>     Am I right that the handling of page fault need a valid VM?
>
> No, exactly that view is incorrect.
>
> The VM is meant to be a memory management structure of page tables and 
> is completely pointless for fault processing, because it represents the 
> future state of the page tables and not the current one.
>
> What we need for fault processing is a new structure built around 
> PASIDs which is fed with addresses when page tables are moved 
> around.
>
> Alternatively I hope that we can use the SDMA to walk the page tables 
> and update the required entries by just using the address.
>
> Regards,
> Christian.
>
> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>
>     Hi Christian,
>
>     What is the value of delaying the destroy of the hash to after the vm
>     is destroyed? Since the vm is destroyed, you basically don’t have enough
>     information to page in the correct page to gpuvm. Am I right
>     that the handling of page fault need a valid VM? For example, you
>     need the VM to get the corresponding physical address of the
>     faulty page.
>
>     After vm is destroyed, the retry interrupt can be directly discard
>     as you can’t find vm through pasid. You can think this is also one
>     kind of prescreen.
>
>     So I am stand for the point that, there is no need to delay the
>     destroy of hash to after vm is destroyed. Prescreening hash can be
>     destroyed at the time of vm_fini.
>
>     Thanks,
>
>     Oak
>
>     *From:*Christian König <ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>     <mailto:ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>     *Sent:* Friday, September 07, 2018 4:39 AM
>     *To:* Zeng, Oak <Oak.Zeng-5C7GfCeVMHo@public.gmane.org> <mailto:Oak.Zeng-5C7GfCeVMHo@public.gmane.org>; Oak
>     Zeng <zengshanjun-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> <mailto:zengshanjun-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>;
>     amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org <mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
>     *Cc:* Zeng, Oak <Oak.Zeng-5C7GfCeVMHo@public.gmane.org> <mailto:Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
>     *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>     amdgpu vm
>
>     Hi Oak,
>
>     yeah, but this is still a NAK. Making the hash per PASID was not a
>     suggestion but rather a must have.
>
>     The VM structures must be destroyed while the processing is still
>     ongoing, or otherwise we would not have a clean OOM handling.
>
>     The IDR for PASID lookup can be put into amdgpu_ids.c, you can
>     just replace the IDA already used there with an IDR.
>
>     Since the PASID is already freed up delayed we should have the
>     grace period for interrupt processing included. If that still
>     isn't sufficient we can still add some delayed work for this.
>
>     Regards,
>     Christian.
>
>     Am 07.09.2018 um 06:16 schrieb ozeng:
>
>         Hi Christian,
>
>         In this implementation, fault hash is made per vm, not per
>         pasid as suggested, based on below considerations:
>
>           * Delay the destroy of hash introduces more effort like how
>             to set the proper grace period after which no retry
>             interrupt will be generated. We want to avoid those
>             complication if we can.
>           * The  problem of the per vm implementation is that it is
>             hard to delay the destroy of fault hash, because when vm
>             is destroyed, prescreen function won't be able to retrieve
>             the fault hash. But in this case, the prescreen function
>             can either let the interrupt go through (to gmc) or ignore
>             it. From this perspective, it is not very necessary to
>             delay the destroy of hash table.
>           * On the other hand, since ASICs after vega10 have the HW
>             CAM feature. So the SW IV prescreen is only useful for
>             vega10. For all the HW implemented CAM, we can consider
>             the delayed CAM acknowledgment.
>           * Making it per pasid need to introduce another idr to
>             correlate pasid and hash table - where to put the idr? You
>             will have to make it a global variable which is not very
>             desirable.
>
>         The main purpose of this patch is to solve the inter-process
>         lock issue (when hash table is full), while still keep codes
>         simple.
>
>         Also with this patch, the faults kfifo is not necessary any
>         more. See patch 2.
>
>         Regards,
>
>         Oak
>
>         On 09/06/2018 11:28 PM, Oak Zeng wrote:
>
>             Instead of sharing one fault hash table per device, make it
>
>             per vm. This can avoid inter-process lock issue when fault
>
>             hash table is full.
>
>               
>
>             Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>
>             Signed-off-by: Oak Zeng<Oak.Zeng-5C7GfCeVMHo@public.gmane.org> <mailto:Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
>
>             Suggested-by: Christian Konig<Christian.Koenig-5C7GfCeVMHo@public.gmane.org> <mailto:Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>
>             Suggested-by: Felix Kuehling<Felix.Kuehling-5C7GfCeVMHo@public.gmane.org> <mailto:Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
>
>             ---
>
>               drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 ------------------------
>
>               drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>
>               drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 ++++++++++++++++++++++++++++++++-
>
>               drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>
>               drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------
>
>               5 files changed, 127 insertions(+), 110 deletions(-)
>
>               
>
>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>
>             index 06373d4..4ed8621 100644
>
>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>
>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>
>             @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct amdgpu_device *adev)
>
>                 return IRQ_HANDLED;
>
>               }
>
>               
>
>             -/**
>
>             - * amdgpu_ih_add_fault - Add a page fault record
>
>             - *
>
>             - * @adev: amdgpu device pointer
>
>             - * @key: 64-bit encoding of PASID and address
>
>             - *
>
>             - * This should be called when a retry page fault interrupt is
>
>             - * received. If this is a new page fault, it will be added to a hash
>
>             - * table. The return value indicates whether this is a new fault, or
>
>             - * a fault that was already known and is already being handled.
>
>             - *
>
>             - * If there are too many pending page faults, this will fail. Retry
>
>             - * interrupts should be ignored in this case until there is enough
>
>             - * free space.
>
>             - *
>
>             - * Returns 0 if the fault was added, 1 if the fault was already known,
>
>             - * -ENOSPC if there are too many pending faults.
>
>             - */
>
>             -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)
>
>             -{
>
>             -  unsigned long flags;
>
>             -  int r = -ENOSPC;
>
>             -
>
>             -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>
>             -          /* Should be allocated in <IP>_ih_sw_init on GPUs that
>
>             -           * support retry faults and require retry filtering.
>
>             -           */
>
>             -          return r;
>
>             -
>
>             -  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>
>             -
>
>             -  /* Only let the hash table fill up to 50% for best performance */
>
>             -  if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>
>             -          goto unlock_out;
>
>             -
>
>             -  r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);
>
>             -  if (!r)
>
>             -          adev->irq.ih.faults->count++;
>
>             -
>
>             -  /* chash_table_copy_in should never fail unless we're losing count */
>
>             -  WARN_ON_ONCE(r < 0);
>
>             -
>
>             -unlock_out:
>
>             -  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>
>             -  return r;
>
>             -}
>
>             -
>
>             -/**
>
>             - * amdgpu_ih_clear_fault - Remove a page fault record
>
>             - *
>
>             - * @adev: amdgpu device pointer
>
>             - * @key: 64-bit encoding of PASID and address
>
>             - *
>
>             - * This should be called when a page fault has been handled. Any
>
>             - * future interrupt with this key will be processed as a new
>
>             - * page fault.
>
>             - */
>
>             -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)
>
>             -{
>
>             -  unsigned long flags;
>
>             -  int r;
>
>             -
>
>             -  if (!adev->irq.ih.faults)
>
>             -          return;
>
>             -
>
>             -  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>
>             -
>
>             -  r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);
>
>             -  if (!WARN_ON_ONCE(r < 0)) {
>
>             -          adev->irq.ih.faults->count--;
>
>             -          WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>
>             -  }
>
>             -
>
>             -  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>
>             -}
>
>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>
>             index a23e1c0..f411ffb 100644
>
>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>
>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>
>             @@ -32,13 +32,6 @@ struct amdgpu_device;
>
>               #define AMDGPU_IH_CLIENTID_LEGACY 0
>
>               #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>
>               
>
>             -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>
>             -struct amdgpu_retryfault_hashtable {
>
>             -  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>
>             -  spinlock_t     lock;
>
>             -  int            count;
>
>             -};
>
>             -
>
>               /*
>
>                * R6xx+ IH ring
>
>                */
>
>             @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>
>                 bool                   use_doorbell;
>
>                 bool                   use_bus_addr;
>
>                 dma_addr_t             rb_dma_addr; /* only used when use_bus_addr = true */
>
>             -  struct amdgpu_retryfault_hashtable *faults;
>
>               };
>
>               
>
>               #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>
>             @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, unsigned ring_size,
>
>                                 bool use_bus_addr);
>
>               void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>
>               int amdgpu_ih_process(struct amdgpu_device *adev);
>
>             -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key);
>
>             -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key);
>
>               
>
>               #endif
>
>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>
>             index 1d7e3c1..8b220e9 100644
>
>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>
>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>
>             @@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,
>
>                          adev->vm_manager.fragment_size);
>
>               }
>
>               
>
>             +static struct amdgpu_retryfault_hashtable *init_fault_hash(void)
>
>             +{
>
>             +  struct amdgpu_retryfault_hashtable *fault_hash;
>
>             +
>
>             +  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
>
>             +  if (!fault_hash)
>
>             +          return fault_hash;
>
>             +
>
>             +  INIT_CHASH_TABLE(fault_hash->hash,
>
>             +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>
>             +  spin_lock_init(&fault_hash->lock);
>
>             +  fault_hash->count = 0;
>
>             +
>
>             +  return fault_hash;
>
>             +}
>
>             +
>
>               /**
>
>                * amdgpu_vm_init - initialize a vm instance
>
>                *
>
>             @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>
>                         vm->pasid = pasid;
>
>                 }
>
>               
>
>             +  vm->fault_hash = init_fault_hash();
>
>             +  if (!vm->fault_hash) {
>
>             +          r = -ENOMEM;
>
>             +          goto error_free_root;
>
>             +  }
>
>             +
>
>                 INIT_KFIFO(vm->faults);
>
>                 vm->fault_credit = 16;
>
>               
>
>             @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>
>               
>
>                 /* Clear pending page faults from IH when the VM is destroyed */
>
>                 while (kfifo_get(&vm->faults, &fault))
>
>             -          amdgpu_ih_clear_fault(adev, fault);
>
>             +          amdgpu_vm_clear_fault(vm->fault_hash, fault);
>
>               
>
>                 if (vm->pasid) {
>
>                         unsigned long flags;
>
>             @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>
>                         spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>
>                 }
>
>               
>
>             +  kfree(vm->fault_hash);
>
>             +  vm->fault_hash = NULL;
>
>             +
>
>                 drm_sched_entity_destroy(&vm->entity);
>
>               
>
>                 if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>
>             @@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>
>                         }
>
>                 }
>
>               }
>
>             +
>
>             +/**
>
>             + * amdgpu_vm_add_fault - Add a page fault record to fault hash table
>
>             + *
>
>             + * @fault_hash: fault hash table
>
>             + * @key: 64-bit encoding of PASID and address
>
>             + *
>
>             + * This should be called when a retry page fault interrupt is
>
>             + * received. If this is a new page fault, it will be added to a hash
>
>             + * table. The return value indicates whether this is a new fault, or
>
>             + * a fault that was already known and is already being handled.
>
>             + *
>
>             + * If there are too many pending page faults, this will fail. Retry
>
>             + * interrupts should be ignored in this case until there is enough
>
>             + * free space.
>
>             + *
>
>             + * Returns 0 if the fault was added, 1 if the fault was already known,
>
>             + * -ENOSPC if there are too many pending faults.
>
>             + */
>
>             +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
>
>             +{
>
>             +  unsigned long flags;
>
>             +  int r = -ENOSPC;
>
>             +
>
>             +  if (WARN_ON_ONCE(!fault_hash))
>
>             +          /* Should be allocated in amdgpu_vm_init
>
>             +           */
>
>             +          return r;
>
>             +
>
>             +  spin_lock_irqsave(&fault_hash->lock, flags);
>
>             +
>
>             +  /* Only let the hash table fill up to 50% for best performance */
>
>             +  if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>
>             +          goto unlock_out;
>
>             +
>
>             +  r = chash_table_copy_in(&fault_hash->hash, key, NULL);
>
>             +  if (!r)
>
>             +          fault_hash->count++;
>
>             +
>
>             +  /* chash_table_copy_in should never fail unless we're losing count */
>
>             +  WARN_ON_ONCE(r < 0);
>
>             +
>
>             +unlock_out:
>
>             +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>
>             +  return r;
>
>             +}
>
>             +
>
>             +/**
>
>             + * amdgpu_vm_clear_fault - Remove a page fault record
>
>             + *
>
>             + * @fault_hash: fault hash table
>
>             + * @key: 64-bit encoding of PASID and address
>
>             + *
>
>             + * This should be called when a page fault has been handled. Any
>
>             + * future interrupt with this key will be processed as a new
>
>             + * page fault.
>
>             + */
>
>             +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
>
>             +{
>
>             +  unsigned long flags;
>
>             +  int r;
>
>             +
>
>             +  if (!fault_hash)
>
>             +          return;
>
>             +
>
>             +  spin_lock_irqsave(&fault_hash->lock, flags);
>
>             +
>
>             +  r = chash_table_remove(&fault_hash->hash, key, NULL);
>
>             +  if (!WARN_ON_ONCE(r < 0)) {
>
>             +          fault_hash->count--;
>
>             +          WARN_ON_ONCE(fault_hash->count < 0);
>
>             +  }
>
>             +
>
>             +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>
>             +}
>
>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>
>             index e275ee7..6eb1da1 100644
>
>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>
>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>
>             @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>
>                 pid_t   tgid;
>
>               };
>
>               
>
>             +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>
>             +struct amdgpu_retryfault_hashtable {
>
>             +  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>
>             +  spinlock_t     lock;
>
>             +  int            count;
>
>             +};
>
>             +
>
>               struct amdgpu_vm {
>
>                 /* tree of virtual addresses mapped */
>
>                 struct rb_root_cached  va;
>
>             @@ -240,6 +247,7 @@ struct amdgpu_vm {
>
>                 struct ttm_lru_bulk_move lru_bulk_move;
>
>                 /* mark whether can do the bulk move */
>
>                 bool                   bulk_moveable;
>
>             +  struct amdgpu_retryfault_hashtable *fault_hash;
>
>               };
>
>               
>
>               struct amdgpu_vm_manager {
>
>             @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>
>               void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>
>                                        struct amdgpu_vm *vm);
>
>               
>
>             +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
>
>             +
>
>             +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
>
>             +
>
>               #endif
>
>             diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>
>             index 5ae5ed2..acbe5a7 100644
>
>             --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>
>             +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>
>             @@ -265,35 +265,36 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>
>                         return true;
>
>                 }
>
>               
>
>             -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>
>             -  key = AMDGPU_VM_FAULT(pasid, addr);
>
>             -  r = amdgpu_ih_add_fault(adev, key);
>
>             -
>
>             -  /* Hash table is full or the fault is already being processed,
>
>             -   * ignore further page faults
>
>             -   */
>
>             -  if (r != 0)
>
>             -          goto ignore_iv;
>
>             -
>
>                 /* Track retry faults in per-VM fault FIFO. */
>
>                 spin_lock(&adev->vm_manager.pasid_lock);
>
>                 vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>
>             +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>
>             +  key = AMDGPU_VM_FAULT(pasid, addr);
>
>                 if (!vm) {
>
>                         /* VM not found, process it normally */
>
>                         spin_unlock(&adev->vm_manager.pasid_lock);
>
>             -          amdgpu_ih_clear_fault(adev, key);
>
>                         return true;
>
>             +  } else {
>
>             +          r = amdgpu_vm_add_fault(vm->fault_hash, key);
>
>             +
>
>             +          /* Hash table is full or the fault is already being processed,
>
>             +           * ignore further page faults
>
>             +           */
>
>             +          if (r != 0) {
>
>             +                  spin_unlock(&adev->vm_manager.pasid_lock);
>
>             +                  goto ignore_iv;
>
>             +          }
>
>                 }
>
>                 /* No locking required with single writer and single reader */
>
>                 r = kfifo_put(&vm->faults, key);
>
>                 if (!r) {
>
>                         /* FIFO is full. Ignore it until there is space */
>
>             +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>
>                         spin_unlock(&adev->vm_manager.pasid_lock);
>
>             -          amdgpu_ih_clear_fault(adev, key);
>
>                         goto ignore_iv;
>
>                 }
>
>             -  spin_unlock(&adev->vm_manager.pasid_lock);
>
>               
>
>             +  spin_unlock(&adev->vm_manager.pasid_lock);
>
>                 /* It's the first fault for this address, process it normally */
>
>                 return true;
>
>               
>
>             @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void *handle)
>
>                 adev->irq.ih.use_doorbell = true;
>
>                 adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;
>
>               
>
>             -  adev->irq.ih.faults = kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>
>             -  if (!adev->irq.ih.faults)
>
>             -          return -ENOMEM;
>
>             -  INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>
>             -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>
>             -  spin_lock_init(&adev->irq.ih.faults->lock);
>
>             -  adev->irq.ih.faults->count = 0;
>
>             -
>
>                 r = amdgpu_irq_init(adev);
>
>               
>
>                 return r;
>
>             @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void *handle)
>
>                 amdgpu_irq_fini(adev);
>
>                 amdgpu_ih_ring_fini(adev);
>
>               
>
>             -  kfree(adev->irq.ih.faults);
>
>             -  adev->irq.ih.faults = NULL;
>
>             -
>
>                 return 0;
>
>               }
>
>               
>
>
>
>
>
>
>         _______________________________________________
>
>         amd-gfx mailing list
>
>         amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>         <mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
>
>         https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>


[-- Attachment #1.2: Type: text/html, Size: 46949 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                         ` <0e52fe27-716c-f9d3-9b2c-5540825ffcef-5C7GfCeVMHo@public.gmane.org>
@ 2018-09-07 21:52                           ` Zeng, Oak
       [not found]                             ` <BN6PR12MB1651CCA8BB73DE64815EB0B280000-/b2+HYfkarRSqX7PDniLCgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Zeng, Oak @ 2018-09-07 21:52 UTC (permalink / raw)
  To: Koenig, Christian, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip


[-- Attachment #1.1: Type: text/plain, Size: 21109 bytes --]

Hi Christian,

See comments inline

Thanks,
Oak

From: Koenig, Christian
Sent: Monday, September 10, 2018 7:44 AM
To: Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org; Yang, Philip <Philip.Yang@amd.com>
Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm

>The VM doesn't know that information either.

Yes, VM doesn’t know the information. More specifically I meant we need the VM data structure to update PTs during page fault handling. For example, we walk the 4 level page table (the first level is pointed to by vm->root) to figure out the entries that need to be filled, allocate page table entry BOs if necessary, allocate a physical page for the faulty address, and set up the 4 level page table for the faulty virtual address to point to the allocated physical page.

>In other words when you request the address of a PT you get the address where memory management has moved it, not the address where the hardware is processing the fault from.

Do you mean the address in the PT is what vm has mapped and the faulty address need to be mapped in the PT?


I see only a few possible solutions for that:
>1. We use an extra structure in the kernel driver which holds the current address of the PTs and is feed by memory management when buffers move around.
The amdgpu_vm and amdgpu_vm_pt data structure are invented for the purpose of management of page table BO. Why we need invent extra structure? Think about the normal case of the page fault handling when the process and vm are still alive. In the normal case, the amdgpu_vm structure can meet exactly purpose of page table update.

The said extra data structure must look very similar to the amdgpu_vm/vm_pt structure. So the best way is to reuse the existing vm structure, either in normal case or in process termination case. Is it possible to delay the destroy of vm during process termination?

>2. We change the SDMA firmware to do the walk for us and update the PTEs based on the information in the page directory.
As the page table BO need to be freed later. You still need to manage them in the driver. SDMA FW can walk and update but it need information from driver side. Driver still need a similar struct like amdgpu_vm.

>3. We use the UTCL walker to get us to the correct address which is then updated with the SDMA.
I don’t quite understand your point here. What do you mean by “correct address”? Do you mean the physical address to be filled to the page table? If this is your meaning, how UTCL2 knows this information – the physical address has to be allocated/validated through driver.


Regards,
Christian.

Am 07.09.2018 um 17:30 schrieb Zeng, Oak:
Without a VM, how can you get the physical address of the faulty virtual address? Where this information should be hold?

Thanks,
Oak

From: Koenig, Christian
Sent: Monday, September 10, 2018 7:20 AM
To: Zeng, Oak <Oak.Zeng@amd.com><mailto:Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com><mailto:zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; Yang, Philip <Philip.Yang@amd.com><mailto:Philip.Yang@amd.com>
Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm

Am I right that the handling of page fault need a valid VM?
No, exactly that view is incorrect.

The VM is meant to be a memory management structure of page tables and is completely pointless for fault processing because it represents the future state of the page tables and not the current one.

What we need for fault processing is a new structure built around PASIDs which is fed with addresses when page tables are moved around.

Alternatively I hope that we can use the SDMA to walk the page tables and update the required entries by just using the address.

Regards,
Christian.

Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
Hi Christian,

What is the value of delay the destroy of hash to after vm is destroyed? Since vm is destroyed, you basically don’t have enough information to paging in the correct page to gpuvm. Am I right that the handling of page fault need a valid VM? For example, you need the VM to get the corresponding physical address of the faulty page.

After vm is destroyed, the retry interrupt can be directly discarded as you can’t find vm through pasid. You can think of this as also one kind of prescreen.

So I am stand for the point that, there is no need to delay the destroy of hash to after vm is destroyed. Prescreening hash can be destroyed at the time of vm_fini.

Thanks,
Oak

From: Christian König <ckoenig.leichtzumerken@gmail.com><mailto:ckoenig.leichtzumerken@gmail.com>
Sent: Friday, September 07, 2018 4:39 AM
To: Zeng, Oak <Oak.Zeng@amd.com><mailto:Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com><mailto:zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Zeng, Oak <Oak.Zeng@amd.com><mailto:Oak.Zeng@amd.com>
Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm

Hi Oak,

yeah, but this is still a NAK. Making the hash per PASID was not a suggestion but rather a must have.

The VM structures must be destroyed while the processing is still ongoing, or otherwise we would not have a clean OOM handling.

The IDR for PASID lookup can be put into amdgpu_ids.c, you can just replace the IDA already used there with an IDR.

Since the PASID is already freed up delayed we should have the grace period for interrupt processing included. If that still isn't sufficient we can still add some delayed work for this.

Regards,
Christian.

Am 07.09.2018 um 06:16 schrieb ozeng:

Hi Christian,

In this implementation, fault hash is made per vm, not per pasid as suggested, based on below considerations:

  *   Delay the destroy of hash introduces more effort like how to set the proper grace period after which no retry interrupt will be generated. We want to avoid those complication if we can.
  *   The  problem of the per vm implementation is that it is hard to delay the destroy of fault hash, because when vm is destroyed, prescreen function won't be able to retrieve the fault hash. But in this case, the prescreen function can either let the interrupt go through (to gmc) or ignore it. From this perspective, it is not very necessary to delay the destroy of hash table.
  *   On the other hand, since ASICs after vega10 have the HW CAM feature. So the SW IV prescreen is only useful for vega10. For all the HW implemented CAM, we can consider the delayed CAM acknowledgment.
  *   Making it per pasid need to introduce another idr to correlate pasid and hash table - where to put the idr? You will have to make it a global variable which is not very desirable.

The main purpose of this patch is to solve the inter-process lock issue (when hash table is full), while still keep codes simple.

Also with this patch, the faults kfifo is not necessary any more. See patch 2.

Regards,

Oak

On 09/06/2018 11:28 PM, Oak Zeng wrote:

In stead of share one fault hash table per device, make it

per vm. This can avoid inter-process lock issue when fault

hash table is full.



Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708

Signed-off-by: Oak Zeng <Oak.Zeng@amd.com><mailto:Oak.Zeng@amd.com>

Suggested-by: Christian Konig <Christian.Koenig@amd.com><mailto:Christian.Koenig@amd.com>

Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com><mailto:Felix.Kuehling@amd.com>

---

 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 ------------------------

 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----

 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 ++++++++++++++++++++++++++++++++-

 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++

 drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------

 5 files changed, 127 insertions(+), 110 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c

index 06373d4..4ed8621 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c

@@ -197,78 +197,3 @@ int amdgpu_ih_process(struct amdgpu_device *adev)

   return IRQ_HANDLED;

 }



-/**

- * amdgpu_ih_add_fault - Add a page fault record

- *

- * @adev: amdgpu device pointer

- * @key: 64-bit encoding of PASID and address

- *

- * This should be called when a retry page fault interrupt is

- * received. If this is a new page fault, it will be added to a hash

- * table. The return value indicates whether this is a new fault, or

- * a fault that was already known and is already being handled.

- *

- * If there are too many pending page faults, this will fail. Retry

- * interrupts should be ignored in this case until there is enough

- * free space.

- *

- * Returns 0 if the fault was added, 1 if the fault was already known,

- * -ENOSPC if there are too many pending faults.

- */

-int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)

-{

-  unsigned long flags;

-  int r = -ENOSPC;

-

-  if (WARN_ON_ONCE(!adev->irq.ih.faults))

-          /* Should be allocated in <IP>_ih_sw_init on GPUs that

-           * support retry faults and require retry filtering.

-           */

-          return r;

-

-  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);

-

-  /* Only let the hash table fill up to 50% for best performance */

-  if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))

-          goto unlock_out;

-

-  r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);

-  if (!r)

-          adev->irq.ih.faults->count++;

-

-  /* chash_table_copy_in should never fail unless we're losing count */

-  WARN_ON_ONCE(r < 0);

-

-unlock_out:

-  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);

-  return r;

-}

-

-/**

- * amdgpu_ih_clear_fault - Remove a page fault record

- *

- * @adev: amdgpu device pointer

- * @key: 64-bit encoding of PASID and address

- *

- * This should be called when a page fault has been handled. Any

- * future interrupt with this key will be processed as a new

- * page fault.

- */

-void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)

-{

-  unsigned long flags;

-  int r;

-

-  if (!adev->irq.ih.faults)

-          return;

-

-  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);

-

-  r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);

-  if (!WARN_ON_ONCE(r < 0)) {

-          adev->irq.ih.faults->count--;

-          WARN_ON_ONCE(adev->irq.ih.faults->count < 0);

-  }

-

-  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);

-}

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h

index a23e1c0..f411ffb 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h

@@ -32,13 +32,6 @@ struct amdgpu_device;

 #define AMDGPU_IH_CLIENTID_LEGACY 0

 #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX



-#define AMDGPU_PAGEFAULT_HASH_BITS 8

-struct amdgpu_retryfault_hashtable {

-  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

-  spinlock_t     lock;

-  int            count;

-};

-

 /*

  * R6xx+ IH ring

  */

@@ -57,7 +50,6 @@ struct amdgpu_ih_ring {

   bool                   use_doorbell;

   bool                   use_bus_addr;

   dma_addr_t             rb_dma_addr; /* only used when use_bus_addr = true */

-  struct amdgpu_retryfault_hashtable *faults;

 };



 #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4

@@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, unsigned ring_size,

                   bool use_bus_addr);

 void amdgpu_ih_ring_fini(struct amdgpu_device *adev);

 int amdgpu_ih_process(struct amdgpu_device *adev);

-int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key);

-void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key);



 #endif

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

index 1d7e3c1..8b220e9 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

@@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,

            adev->vm_manager.fragment_size);

 }



+static struct amdgpu_retryfault_hashtable *init_fault_hash(void)

+{

+  struct amdgpu_retryfault_hashtable *fault_hash;

+

+  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);

+  if (!fault_hash)

+          return fault_hash;

+

+  INIT_CHASH_TABLE(fault_hash->hash,

+                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

+  spin_lock_init(&fault_hash->lock);

+  fault_hash->count = 0;

+

+  return fault_hash;

+}

+

 /**

  * amdgpu_vm_init - initialize a vm instance

  *

@@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,

           vm->pasid = pasid;

   }



+  vm->fault_hash = init_fault_hash();

+  if (!vm->fault_hash) {

+          r = -ENOMEM;

+          goto error_free_root;

+  }

+

   INIT_KFIFO(vm->faults);

   vm->fault_credit = 16;



@@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)



   /* Clear pending page faults from IH when the VM is destroyed */

   while (kfifo_get(&vm->faults, &fault))

-          amdgpu_ih_clear_fault(adev, fault);

+          amdgpu_vm_clear_fault(vm->fault_hash, fault);



   if (vm->pasid) {

           unsigned long flags;

@@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)

           spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);

   }



+  kfree(vm->fault_hash);

+  vm->fault_hash = NULL;

+

   drm_sched_entity_destroy(&vm->entity);



   if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {

@@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)

           }

   }

 }

+

+/**

+ * amdgpu_vm_add_fault - Add a page fault record to fault hash table

+ *

+ * @fault_hash: fault hash table

+ * @key: 64-bit encoding of PASID and address

+ *

+ * This should be called when a retry page fault interrupt is

+ * received. If this is a new page fault, it will be added to a hash

+ * table. The return value indicates whether this is a new fault, or

+ * a fault that was already known and is already being handled.

+ *

+ * If there are too many pending page faults, this will fail. Retry

+ * interrupts should be ignored in this case until there is enough

+ * free space.

+ *

+ * Returns 0 if the fault was added, 1 if the fault was already known,

+ * -ENOSPC if there are too many pending faults.

+ */

+int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)

+{

+  unsigned long flags;

+  int r = -ENOSPC;

+

+  if (WARN_ON_ONCE(!fault_hash))

+          /* Should be allocated in amdgpu_vm_init

+           */

+          return r;

+

+  spin_lock_irqsave(&fault_hash->lock, flags);

+

+  /* Only let the hash table fill up to 50% for best performance */

+  if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))

+          goto unlock_out;

+

+  r = chash_table_copy_in(&fault_hash->hash, key, NULL);

+  if (!r)

+          fault_hash->count++;

+

+  /* chash_table_copy_in should never fail unless we're losing count */

+  WARN_ON_ONCE(r < 0);

+

+unlock_out:

+  spin_unlock_irqrestore(&fault_hash->lock, flags);

+  return r;

+}

+

+/**

+ * amdgpu_vm_clear_fault - Remove a page fault record

+ *

+ * @fault_hash: fault hash table

+ * @key: 64-bit encoding of PASID and address

+ *

+ * This should be called when a page fault has been handled. Any

+ * future interrupt with this key will be processed as a new

+ * page fault.

+ */

+void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)

+{

+  unsigned long flags;

+  int r;

+

+  if (!fault_hash)

+          return;

+

+  spin_lock_irqsave(&fault_hash->lock, flags);

+

+  r = chash_table_remove(&fault_hash->hash, key, NULL);

+  if (!WARN_ON_ONCE(r < 0)) {

+          fault_hash->count--;

+          WARN_ON_ONCE(fault_hash->count < 0);

+  }

+

+  spin_unlock_irqrestore(&fault_hash->lock, flags);

+}

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

index e275ee7..6eb1da1 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

@@ -178,6 +178,13 @@ struct amdgpu_task_info {

   pid_t   tgid;

 };



+#define AMDGPU_PAGEFAULT_HASH_BITS 8

+struct amdgpu_retryfault_hashtable {

+  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

+  spinlock_t     lock;

+  int            count;

+};

+

 struct amdgpu_vm {

   /* tree of virtual addresses mapped */

   struct rb_root_cached  va;

@@ -240,6 +247,7 @@ struct amdgpu_vm {

   struct ttm_lru_bulk_move lru_bulk_move;

   /* mark whether can do the bulk move */

   bool                   bulk_moveable;

+  struct amdgpu_retryfault_hashtable *fault_hash;

 };



 struct amdgpu_vm_manager {

@@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);

 void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,

                          struct amdgpu_vm *vm);



+int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);

+

+void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);

+

 #endif

diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c

index 5ae5ed2..acbe5a7 100644

--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c

+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c

@@ -265,35 +265,36 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)

           return true;

   }



-  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);

-  key = AMDGPU_VM_FAULT(pasid, addr);

-  r = amdgpu_ih_add_fault(adev, key);

-

-  /* Hash table is full or the fault is already being processed,

-   * ignore further page faults

-   */

-  if (r != 0)

-          goto ignore_iv;

-

   /* Track retry faults in per-VM fault FIFO. */

   spin_lock(&adev->vm_manager.pasid_lock);

   vm = idr_find(&adev->vm_manager.pasid_idr, pasid);

+  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);

+  key = AMDGPU_VM_FAULT(pasid, addr);

   if (!vm) {

           /* VM not found, process it normally */

           spin_unlock(&adev->vm_manager.pasid_lock);

-          amdgpu_ih_clear_fault(adev, key);

           return true;

+  } else {

+          r = amdgpu_vm_add_fault(vm->fault_hash, key);

+

+          /* Hash table is full or the fault is already being processed,

+           * ignore further page faults

+           */

+          if (r != 0) {

+                  spin_unlock(&adev->vm_manager.pasid_lock);

+                  goto ignore_iv;

+          }

   }

   /* No locking required with single writer and single reader */

   r = kfifo_put(&vm->faults, key);

   if (!r) {

           /* FIFO is full. Ignore it until there is space */

+          amdgpu_vm_clear_fault(vm->fault_hash, key);

           spin_unlock(&adev->vm_manager.pasid_lock);

-          amdgpu_ih_clear_fault(adev, key);

           goto ignore_iv;

   }

-  spin_unlock(&adev->vm_manager.pasid_lock);



+  spin_unlock(&adev->vm_manager.pasid_lock);

   /* It's the first fault for this address, process it normally */

   return true;



@@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void *handle)

   adev->irq.ih.use_doorbell = true;

   adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;



-  adev->irq.ih.faults = kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);

-  if (!adev->irq.ih.faults)

-          return -ENOMEM;

-  INIT_CHASH_TABLE(adev->irq.ih.faults->hash,

-                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

-  spin_lock_init(&adev->irq.ih.faults->lock);

-  adev->irq.ih.faults->count = 0;

-

   r = amdgpu_irq_init(adev);



   return r;

@@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void *handle)

   amdgpu_irq_fini(adev);

   amdgpu_ih_ring_fini(adev);



-  kfree(adev->irq.ih.faults);

-  adev->irq.ih.faults = NULL;

-

   return 0;

 }









_______________________________________________

amd-gfx mailing list

amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>

https://lists.freedesktop.org/mailman/listinfo/amd-gfx




[-- Attachment #1.2: Type: text/html, Size: 48967 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                 ` <89e7515f-53f1-9f10-aeb6-7fc1b8df2702-5C7GfCeVMHo@public.gmane.org>
  2018-09-07 15:30                   ` Zeng, Oak
@ 2018-09-08  0:27                   ` Felix Kuehling
       [not found]                     ` <ab2c26bc-e168-eaae-3d80-826a47b3e0d3-5C7GfCeVMHo@public.gmane.org>
  1 sibling, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2018-09-08  0:27 UTC (permalink / raw)
  To: Christian König, Zeng, Oak, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

Hi Christian,

I'm not sure I get your point here.

As I understand it, the amdgpu_vm structure represents the user mode VM
context. It has the pointers to the root page directory and the rest of
the page table hierarchy. If there is no amdgpu_vm, there is no user
mode context that cares about the page tables.

If we leave the page table pointers in the amdgpu_vm, then handling
faults after the VM is destroyed is pointless. We can't find the page
tables and we can't update them, so there is nothing we can do in
response to VM faults.

So I'm guessing that you want to move the page table pointers into a
different structure that exists half-independently of the VM. It would
be created when the VM is created (that's where we currently allocate
the PASID) but can be destroyed slightly later.

But why do you want to update page tables when there is no more user
mode context that cares about them? Is this just to allow pending work
to complete after the user mode context is gone? To prevent hangs?

Regards,
  Felix

On 2018-09-10 07:20 AM, Christian König wrote:
>> Am I right that the handling of a page fault needs a valid VM? 
> No, exactly that view is incorrect.
>
> The VM is meant to be a memory management structure of page tables and
> is completely pointless for fault processing because it represents the
> future state of the page tables and not the current one.
>
> What we need for fault processing is a new structure built around
> PASIDs which is fed with addresses when page tables are moved
> around.
>
> Alternatively I hope that we can use the SDMA to walk the page tables
> and update the required entries by just using the address.
>
> Regards,
> Christian.
>
> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>
>> Hi Christian,
>>
>>  
>>
>> What is the value of delaying the destroy of the hash until after the vm is
>> destroyed? Since the vm is destroyed, you basically don’t have enough
>> information to page in the correct page to gpuvm. Am I right that
>> the handling of page fault need a valid VM? For example, you need the
>> VM to get the corresponding physical address of the faulty page.
>>
>>  
>>
>> After vm is destroyed, the retry interrupt can be directly discard as
>> you can’t find vm through pasid. You can think this is also one kind
>> of prescreen.
>>
>>  
>>
>> So I stand by the point that there is no need to delay the
>> destroy of the hash until after the vm is destroyed. The prescreening hash can be
>> destroyed at the time of vm_fini.
>>
>>  
>>
>> Thanks,
>>
>> Oak
>>
>>  
>>
>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>> *Sent:* Friday, September 07, 2018 4:39 AM
>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com>;
>> amd-gfx@lists.freedesktop.org
>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>> amdgpu vm
>>
>>  
>>
>> Hi Oak,
>>
>> yeah, but this is still a NAK. Making the hash per PASID was not a
>> suggestion but rather a must have.
>>
>> The VM structures must be destroyed while the processing is still
>> ongoing, or otherwise we would not have a clean OOM handling.
>>
>> The IDR for PASID lookup can be put into amdgpu_ids.c, you can just
>> replace the IDA already used there with an IDR.
>>
>> Since the PASID is already freed up delayed we should have the grace
>> period for interrupt processing included. If that still isn't
>> sufficient we can still add some delayed work for this.
>>
>> Regards,
>> Christian.
>>
>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>
>>     Hi Christian,
>>
>>     In this implementation, fault hash is made per vm, not per pasid
>>     as suggested, based on below considerations:
>>
>>       * Delay the destroy of hash introduces more effort like how to
>>         set the proper grace period after which no retry interrupt
>>         will be generated. We want to avoid those complication if we
>>         can.
>>       * The  problem of the per vm implementation is that it is hard
>>         to delay the destroy of fault hash, because when vm is
>>         destroyed, prescreen function won't be able to retrieve the
>>         fault hash. But in this case, the prescreen function can
>>         either let the interrupt go through (to gmc) or ignore it.
>>         From this perspective, it is not very necessary to delay the
>>         destroy of hash table.
>>       * On the other hand, since ASICs after vega10 have the HW CAM
>>         feature. So the SW IV prescreen is only useful for vega10.
>>         For all the HW implemented CAM, we can consider the delayed
>>         CAM acknowledgment.
>>       * Making it per pasid need to introduce another idr to
>>         correlate pasid and hash table - where to put the idr? You
>>         will have to make it a global variable which is not very
>>         desirable.
>>
>>     The main purpose of this patch is to solve the inter-process lock
>>     issue (when hash table is full), while still keep codes simple.
>>
>>     Also with this patch, the faults kfifo is not necessary any more.
>>     See patch 2.
>>
>>     Regards,
>>
>>     Oak
>>
>>      
>>
>>     On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>
>>         In stead of share one fault hash table per device, make it
>>
>>         per vm. This can avoid inter-process lock issue when fault
>>
>>         hash table is full.
>>
>>          
>>
>>         Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>
>>         Signed-off-by: Oak Zeng <Oak.Zeng@amd.com> <mailto:Oak.Zeng@amd.com>
>>
>>         Suggested-by: Christian Konig <Christian.Koenig@amd.com> <mailto:Christian.Koenig@amd.com>
>>
>>         Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com> <mailto:Felix.Kuehling@amd.com>
>>
>>         ---
>>
>>          drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 ------------------------
>>
>>          drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>
>>          drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 ++++++++++++++++++++++++++++++++-
>>
>>          drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>
>>          drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------
>>
>>          5 files changed, 127 insertions(+), 110 deletions(-)
>>
>>          
>>
>>         diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>
>>         index 06373d4..4ed8621 100644
>>
>>         --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>
>>         +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>
>>         @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct amdgpu_device *adev)
>>
>>            return IRQ_HANDLED;
>>
>>          }
>>
>>          
>>
>>         -/**
>>
>>         - * amdgpu_ih_add_fault - Add a page fault record
>>
>>         - *
>>
>>         - * @adev: amdgpu device pointer
>>
>>         - * @key: 64-bit encoding of PASID and address
>>
>>         - *
>>
>>         - * This should be called when a retry page fault interrupt is
>>
>>         - * received. If this is a new page fault, it will be added to a hash
>>
>>         - * table. The return value indicates whether this is a new fault, or
>>
>>         - * a fault that was already known and is already being handled.
>>
>>         - *
>>
>>         - * If there are too many pending page faults, this will fail. Retry
>>
>>         - * interrupts should be ignored in this case until there is enough
>>
>>         - * free space.
>>
>>         - *
>>
>>         - * Returns 0 if the fault was added, 1 if the fault was already known,
>>
>>         - * -ENOSPC if there are too many pending faults.
>>
>>         - */
>>
>>         -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)
>>
>>         -{
>>
>>         -  unsigned long flags;
>>
>>         -  int r = -ENOSPC;
>>
>>         -
>>
>>         -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>
>>         -          /* Should be allocated in <IP>_ih_sw_init on GPUs that
>>
>>         -           * support retry faults and require retry filtering.
>>
>>         -           */
>>
>>         -          return r;
>>
>>         -
>>
>>         -  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>
>>         -
>>
>>         -  /* Only let the hash table fill up to 50% for best performance */
>>
>>         -  if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>
>>         -          goto unlock_out;
>>
>>         -
>>
>>         -  r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);
>>
>>         -  if (!r)
>>
>>         -          adev->irq.ih.faults->count++;
>>
>>         -
>>
>>         -  /* chash_table_copy_in should never fail unless we're losing count */
>>
>>         -  WARN_ON_ONCE(r < 0);
>>
>>         -
>>
>>         -unlock_out:
>>
>>         -  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>>
>>         -  return r;
>>
>>         -}
>>
>>         -
>>
>>         -/**
>>
>>         - * amdgpu_ih_clear_fault - Remove a page fault record
>>
>>         - *
>>
>>         - * @adev: amdgpu device pointer
>>
>>         - * @key: 64-bit encoding of PASID and address
>>
>>         - *
>>
>>         - * This should be called when a page fault has been handled. Any
>>
>>         - * future interrupt with this key will be processed as a new
>>
>>         - * page fault.
>>
>>         - */
>>
>>         -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)
>>
>>         -{
>>
>>         -  unsigned long flags;
>>
>>         -  int r;
>>
>>         -
>>
>>         -  if (!adev->irq.ih.faults)
>>
>>         -          return;
>>
>>         -
>>
>>         -  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>
>>         -
>>
>>         -  r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);
>>
>>         -  if (!WARN_ON_ONCE(r < 0)) {
>>
>>         -          adev->irq.ih.faults->count--;
>>
>>         -          WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>
>>         -  }
>>
>>         -
>>
>>         -  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>>
>>         -}
>>
>>         diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>
>>         index a23e1c0..f411ffb 100644
>>
>>         --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>
>>         +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>
>>         @@ -32,13 +32,6 @@ struct amdgpu_device;
>>
>>          #define AMDGPU_IH_CLIENTID_LEGACY 0
>>
>>          #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>
>>          
>>
>>         -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>
>>         -struct amdgpu_retryfault_hashtable {
>>
>>         -  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>
>>         -  spinlock_t     lock;
>>
>>         -  int            count;
>>
>>         -};
>>
>>         -
>>
>>          /*
>>
>>           * R6xx+ IH ring
>>
>>           */
>>
>>         @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>
>>            bool                   use_doorbell;
>>
>>            bool                   use_bus_addr;
>>
>>            dma_addr_t             rb_dma_addr; /* only used when use_bus_addr = true */
>>
>>         -  struct amdgpu_retryfault_hashtable *faults;
>>
>>          };
>>
>>          
>>
>>          #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>
>>         @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, unsigned ring_size,
>>
>>                            bool use_bus_addr);
>>
>>          void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>
>>          int amdgpu_ih_process(struct amdgpu_device *adev);
>>
>>         -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key);
>>
>>         -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key);
>>
>>          
>>
>>          #endif
>>
>>         diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>
>>         index 1d7e3c1..8b220e9 100644
>>
>>         --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>
>>         +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>
>>         @@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,
>>
>>                     adev->vm_manager.fragment_size);
>>
>>          }
>>
>>          
>>
>>         +static struct amdgpu_retryfault_hashtable *init_fault_hash(void)
>>
>>         +{
>>
>>         +  struct amdgpu_retryfault_hashtable *fault_hash;
>>
>>         +
>>
>>         +  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
>>
>>         +  if (!fault_hash)
>>
>>         +          return fault_hash;
>>
>>         +
>>
>>         +  INIT_CHASH_TABLE(fault_hash->hash,
>>
>>         +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>
>>         +  spin_lock_init(&fault_hash->lock);
>>
>>         +  fault_hash->count = 0;
>>
>>         +
>>
>>         +  return fault_hash;
>>
>>         +}
>>
>>         +
>>
>>          /**
>>
>>           * amdgpu_vm_init - initialize a vm instance
>>
>>           *
>>
>>         @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>
>>                    vm->pasid = pasid;
>>
>>            }
>>
>>          
>>
>>         +  vm->fault_hash = init_fault_hash();
>>
>>         +  if (!vm->fault_hash) {
>>
>>         +          r = -ENOMEM;
>>
>>         +          goto error_free_root;
>>
>>         +  }
>>
>>         +
>>
>>            INIT_KFIFO(vm->faults);
>>
>>            vm->fault_credit = 16;
>>
>>          
>>
>>         @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>
>>          
>>
>>            /* Clear pending page faults from IH when the VM is destroyed */
>>
>>            while (kfifo_get(&vm->faults, &fault))
>>
>>         -          amdgpu_ih_clear_fault(adev, fault);
>>
>>         +          amdgpu_vm_clear_fault(vm->fault_hash, fault);
>>
>>          
>>
>>            if (vm->pasid) {
>>
>>                    unsigned long flags;
>>
>>         @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>
>>                    spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>
>>            }
>>
>>          
>>
>>         +  kfree(vm->fault_hash);
>>
>>         +  vm->fault_hash = NULL;
>>
>>         +
>>
>>            drm_sched_entity_destroy(&vm->entity);
>>
>>          
>>
>>            if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>
>>         @@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>
>>                    }
>>
>>            }
>>
>>          }
>>
>>         +
>>
>>         +/**
>>
>>         + * amdgpu_vm_add_fault - Add a page fault record to fault hash table
>>
>>         + *
>>
>>         + * @fault_hash: fault hash table
>>
>>         + * @key: 64-bit encoding of PASID and address
>>
>>         + *
>>
>>         + * This should be called when a retry page fault interrupt is
>>
>>         + * received. If this is a new page fault, it will be added to a hash
>>
>>         + * table. The return value indicates whether this is a new fault, or
>>
>>         + * a fault that was already known and is already being handled.
>>
>>         + *
>>
>>         + * If there are too many pending page faults, this will fail. Retry
>>
>>         + * interrupts should be ignored in this case until there is enough
>>
>>         + * free space.
>>
>>         + *
>>
>>         + * Returns 0 if the fault was added, 1 if the fault was already known,
>>
>>         + * -ENOSPC if there are too many pending faults.
>>
>>         + */
>>
>>         +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>
>>         +{
>>
>>         +  unsigned long flags;
>>
>>         +  int r = -ENOSPC;
>>
>>         +
>>
>>         +  if (WARN_ON_ONCE(!fault_hash))
>>
>>         +          /* Should be allocated in amdgpu_vm_init
>>
>>         +           */
>>
>>         +          return r;
>>
>>         +
>>
>>         +  spin_lock_irqsave(&fault_hash->lock, flags);
>>
>>         +
>>
>>         +  /* Only let the hash table fill up to 50% for best performance */
>>
>>         +  if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>
>>         +          goto unlock_out;
>>
>>         +
>>
>>         +  r = chash_table_copy_in(&fault_hash->hash, key, NULL);
>>
>>         +  if (!r)
>>
>>         +          fault_hash->count++;
>>
>>         +
>>
>>         +  /* chash_table_copy_in should never fail unless we're losing count */
>>
>>         +  WARN_ON_ONCE(r < 0);
>>
>>         +
>>
>>         +unlock_out:
>>
>>         +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>
>>         +  return r;
>>
>>         +}
>>
>>         +
>>
>>         +/**
>>
>>         + * amdgpu_vm_clear_fault - Remove a page fault record
>>
>>         + *
>>
>>         + * @fault_hash: fault hash table
>>
>>         + * @key: 64-bit encoding of PASID and address
>>
>>         + *
>>
>>         + * This should be called when a page fault has been handled. Any
>>
>>         + * future interrupt with this key will be processed as a new
>>
>>         + * page fault.
>>
>>         + */
>>
>>         +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>
>>         +{
>>
>>         +  unsigned long flags;
>>
>>         +  int r;
>>
>>         +
>>
>>         +  if (!fault_hash)
>>
>>         +          return;
>>
>>         +
>>
>>         +  spin_lock_irqsave(&fault_hash->lock, flags);
>>
>>         +
>>
>>         +  r = chash_table_remove(&fault_hash->hash, key, NULL);
>>
>>         +  if (!WARN_ON_ONCE(r < 0)) {
>>
>>         +          fault_hash->count--;
>>
>>         +          WARN_ON_ONCE(fault_hash->count < 0);
>>
>>         +  }
>>
>>         +
>>
>>         +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>
>>         +}
>>
>>         diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>
>>         index e275ee7..6eb1da1 100644
>>
>>         --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>
>>         +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>
>>         @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>
>>            pid_t   tgid;
>>
>>          };
>>
>>          
>>
>>         +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>
>>         +struct amdgpu_retryfault_hashtable {
>>
>>         +  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>
>>         +  spinlock_t     lock;
>>
>>         +  int            count;
>>
>>         +};
>>
>>         +
>>
>>          struct amdgpu_vm {
>>
>>            /* tree of virtual addresses mapped */
>>
>>            struct rb_root_cached  va;
>>
>>         @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>
>>            struct ttm_lru_bulk_move lru_bulk_move;
>>
>>            /* mark whether can do the bulk move */
>>
>>            bool                   bulk_moveable;
>>
>>         +  struct amdgpu_retryfault_hashtable *fault_hash;
>>
>>          };
>>
>>          
>>
>>          struct amdgpu_vm_manager {
>>
>>         @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>>
>>          void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>>
>>                                   struct amdgpu_vm *vm);
>>
>>          
>>
>>         +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>
>>         +
>>
>>         +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>
>>         +
>>
>>          #endif
>>
>>         diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>
>>         index 5ae5ed2..acbe5a7 100644
>>
>>         --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>
>>         +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>
>>         @@ -265,35 +265,36 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>
>>                    return true;
>>
>>            }
>>
>>          
>>
>>         -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>
>>         -  key = AMDGPU_VM_FAULT(pasid, addr);
>>
>>         -  r = amdgpu_ih_add_fault(adev, key);
>>
>>         -
>>
>>         -  /* Hash table is full or the fault is already being processed,
>>
>>         -   * ignore further page faults
>>
>>         -   */
>>
>>         -  if (r != 0)
>>
>>         -          goto ignore_iv;
>>
>>         -
>>
>>            /* Track retry faults in per-VM fault FIFO. */
>>
>>            spin_lock(&adev->vm_manager.pasid_lock);
>>
>>            vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>
>>         +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>
>>         +  key = AMDGPU_VM_FAULT(pasid, addr);
>>
>>            if (!vm) {
>>
>>                    /* VM not found, process it normally */
>>
>>                    spin_unlock(&adev->vm_manager.pasid_lock);
>>
>>         -          amdgpu_ih_clear_fault(adev, key);
>>
>>                    return true;
>>
>>         +  } else {
>>
>>         +          r = amdgpu_vm_add_fault(vm->fault_hash, key);
>>
>>         +
>>
>>         +          /* Hash table is full or the fault is already being processed,
>>
>>         +           * ignore further page faults
>>
>>         +           */
>>
>>         +          if (r != 0) {
>>
>>         +                  spin_unlock(&adev->vm_manager.pasid_lock);
>>
>>         +                  goto ignore_iv;
>>
>>         +          }
>>
>>            }
>>
>>            /* No locking required with single writer and single reader */
>>
>>            r = kfifo_put(&vm->faults, key);
>>
>>            if (!r) {
>>
>>                    /* FIFO is full. Ignore it until there is space */
>>
>>         +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>>
>>                    spin_unlock(&adev->vm_manager.pasid_lock);
>>
>>         -          amdgpu_ih_clear_fault(adev, key);
>>
>>                    goto ignore_iv;
>>
>>            }
>>
>>         -  spin_unlock(&adev->vm_manager.pasid_lock);
>>
>>          
>>
>>         +  spin_unlock(&adev->vm_manager.pasid_lock);
>>
>>            /* It's the first fault for this address, process it normally */
>>
>>            return true;
>>
>>          
>>
>>         @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void *handle)
>>
>>            adev->irq.ih.use_doorbell = true;
>>
>>            adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;
>>
>>          
>>
>>         -  adev->irq.ih.faults = kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>
>>         -  if (!adev->irq.ih.faults)
>>
>>         -          return -ENOMEM;
>>
>>         -  INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>
>>         -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>
>>         -  spin_lock_init(&adev->irq.ih.faults->lock);
>>
>>         -  adev->irq.ih.faults->count = 0;
>>
>>         -
>>
>>            r = amdgpu_irq_init(adev);
>>
>>          
>>
>>            return r;
>>
>>         @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void *handle)
>>
>>            amdgpu_irq_fini(adev);
>>
>>            amdgpu_ih_ring_fini(adev);
>>
>>          
>>
>>         -  kfree(adev->irq.ih.faults);
>>
>>         -  adev->irq.ih.faults = NULL;
>>
>>         -
>>
>>            return 0;
>>
>>          }
>>
>>          
>>
>>
>>
>>
>>
>>     _______________________________________________
>>
>>     amd-gfx mailing list
>>
>>     amd-gfx@lists.freedesktop.org <mailto:amd-gfx@lists.freedesktop.org>
>>
>>     https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>>  
>>
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                             ` <BN6PR12MB1651CCA8BB73DE64815EB0B280000-/b2+HYfkarRSqX7PDniLCgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2018-09-08  7:16                               ` Christian König
       [not found]                                 ` <bf233da4-5cd9-d7e4-1e9a-fbef3613bfd6-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Christian König @ 2018-09-08  7:16 UTC (permalink / raw)
  To: Zeng, Oak, Oak Zeng, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Yang, Philip


[-- Attachment #1.1: Type: text/plain, Size: 32826 bytes --]

Hi Oak,

> >In other words when you request the address of a PT you get the 
> address where memory management has moved it, not the address where 
> the hardware is processing the fault from.
>
> Do you mean the address in the PT is what vm has mapped and the faulty 
> address need to be mapped in the PT?
>
No, the memory management is pipelined to avoid stalls during eviction.

This means that the state we manage is always the future state, e.g. 
when you walk the page tables in the VM structure you see the state 
after all the running operations completed.

In other words when a page table is pipelined for eviction you see the 
evicted state and not the current executing one.

But what we need for page fault handling is the current state and not 
the future state.

> The amdgpu_vm and amdgpu_vm_pt data structure are invented for the 
> purpose of management of page table BO. Why we need invent extra 
> structure?
We will need something like a double entry accounting for the page 
tables where we both keep the future as well as the current state.

> Think about the normal case of the page fault handling when the 
> process and vm are still alive. In the normal case, the amdgpu_vm 
> structure can meet exactly purpose of page table update.
No, the problem is not if the VM is still alive or dead but rather that 
in the case of an eviction you wouldn't be able to handle page faults 
any more.

> The said extra data structure must look very similar to the 
> amdgpu_vm/vm_pt structure. So the best way is to reuse the existing vm 
> structure, either in normal case or in process termination case.
Completely agree that we will probably need the same structure as the VM 
hierarchy offers, just with another set of information.

> Is it possible to delay the destroy of vm during process termination?
Yeah, thought about that as well. The problem with that is the OOM 
killer and that we are still not able to reliably kill running command 
submissions.

Ok you convinced me that we should probably work on that instead.

E.g. when a process has bound its address space to a VM and exits we 
need to find a way to kill all still executing jobs. We should let the 
UMD handle that case gracefully when they really need this.

Thanks,
Christian.

Am 07.09.2018 um 23:52 schrieb Zeng, Oak:
>
> Hi Christian,
>
> See comments inline
>
> Thanks,
>
> Oak
>
> *From:*Koenig, Christian
> *Sent:* Monday, September 10, 2018 7:44 AM
> *To:* Zeng, Oak <Oak.Zeng-5C7GfCeVMHo@public.gmane.org>; Oak Zeng <zengshanjun-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>; 
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org; Yang, Philip <Philip.Yang-5C7GfCeVMHo@public.gmane.org>
> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
>
> >The VM doesn't know that information either.
>
> Yes, VM doesn’t know the information. More specifically I meant we 
> need VM data structure to update PTs during page fault handling. For 
> example, we walk the 4 level page table (the first level is pointed by 
> vm->root) to figure out the entries need to be filled, allocate page 
> table entry Bos if necessary, allocate physical page for the faulty 
> address, setup 4 level page table for the faulty virtual address to 
> pointing to the allocated physical page.
>
> >In other words when you request the address of a PT you get the 
> address where memory management has moved it, not the address where 
> the hardware is processing the fault from.
>
> Do you mean the address in the PT is what vm has mapped and the faulty 
> address need to be mapped in the PT?
>
>
>
> I see only a few possible solutions for that:
> >1. We use an extra structure in the kernel driver which holds the 
> current address of the PTs and is fed by memory management when 
> buffers move around.
>
> The amdgpu_vm and amdgpu_vm_pt data structure are invented for the 
> purpose of management of page table BO. Why we need invent extra 
> structure? Think about the normal case of the page fault handling when 
> the process and vm are still alive. In the normal case, the amdgpu_vm 
> structure can meet exactly purpose of page table update.
>
> The said extra data structure must look very similar to the 
> amdgpu_vm/vm_pt structure. So the best way is to reuse the existing vm 
> structure, either in normal case or in process termination case. Is it 
> possible to delay the destroy of vm during process termination?
>
>
> >2. We change the SDMA firmware to do the walk for us and update the 
> PTEs based on the information in the page directory.
>
> As the page table BO need to be freed later. You still need to manage 
> them in the driver. SDMA FW can walk and update but it need 
> information from driver side. Driver still need a similar struct like 
> amdgpu_vm.
>
>
> >3. We use the UTCL walker to get us to the correct address which is 
> then updated with the SDMA.
> I don’t quite understand your point here. What do you mean by “correct 
> address”? Do you mean the physical address to be filled to the page 
> table? If this is your meaning, how UTCL2 knows this information – the 
> physical address has to be allocated/validated through driver.
>
>
> Regards,
> Christian.
>
> Am 07.09.2018 um 17:30 schrieb Zeng, Oak:
>
>     Without a VM, how can you get the physical address of the faulty
>     virtual address? Where this information should be hold?
>
>     Thanks,
>
>     Oak
>
>     *From:*Koenig, Christian
>     *Sent:* Monday, September 10, 2018 7:20 AM
>     *To:* Zeng, Oak <Oak.Zeng-5C7GfCeVMHo@public.gmane.org> <mailto:Oak.Zeng-5C7GfCeVMHo@public.gmane.org>; Oak
>     Zeng <zengshanjun-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> <mailto:zengshanjun-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>;
>     amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>     <mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>; Yang, Philip
>     <Philip.Yang-5C7GfCeVMHo@public.gmane.org> <mailto:Philip.Yang-5C7GfCeVMHo@public.gmane.org>
>     *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>     amdgpu vm
>
>         Am I right that the handling of page fault need a valid VM?
>
>     No, exactly that view is incorrect.
>
>     The VM is meant to be a memory management structure of page tables
>     and is completely pointless for fault processing because it represents
>     the future state of the page tables and not the current one.
>
>     What we need for fault processing is a new structure built around
>     PASIDs which is fed with addresses when page tables are
>     moved around.
>
>     Alternatively I hope that we can use the SDMA to walk the page
>     tables and update the required entries by just using the address.
>
>     Regards,
>     Christian.
>
>     Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>
>         Hi Christian,
>
>         What is the value of delay the destroy of hash to after vm is
>         destroyed? Since vm is destroyed, you basically don’t have
>         enough information to paging in the correct page to gpuvm. Am
>         I right that the handling of page fault need a valid VM? For
>         example, you need the VM to get the corresponding physical
>         address of the faulty page.
>
>         After vm is destroyed, the retry interrupt can be directly
>         discard as you can’t find vm through pasid. You can think this
>         is also one kind of prescreen.
>
>         So I stand by the point that there is no need to delay
>         the destruction of the hash until after the vm is destroyed. The
>         prescreening hash can be destroyed at the time of vm_fini.
>
>         Thanks,
>
>         Oak
>
>         *From:*Christian König <ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>         <mailto:ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>         *Sent:* Friday, September 07, 2018 4:39 AM
>         *To:* Zeng, Oak <Oak.Zeng-5C7GfCeVMHo@public.gmane.org> <mailto:Oak.Zeng-5C7GfCeVMHo@public.gmane.org>;
>         Oak Zeng <zengshanjun-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>         <mailto:zengshanjun-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>         <mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
>         *Cc:* Zeng, Oak <Oak.Zeng-5C7GfCeVMHo@public.gmane.org> <mailto:Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
>         *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table
>         to amdgpu vm
>
>         Hi Oak,
>
>         yeah, but this is still a NAK. Making the hash per PASID was
>         not a suggestion but rather a must have.
>
>         The VM structures must be destroyed while the processing is
>         still ongoing, or otherwise we would not have a clean OOM
>         handling.
>
>         The IDR for PASID lookup can be put into amdgpu_ids.c, you can
>         just replace the IDA already used there with an IDR.
>
>         Since the PASID is already freed with a delay, we should have the
>         grace period for interrupt processing included. If that still
>         isn't sufficient we can still add some delayed work for this.
>
>         Regards,
>         Christian.
>
>         Am 07.09.2018 um 06:16 schrieb ozeng:
>
>             Hi Christian,
>
>             In this implementation, fault hash is made per vm, not per
>             pasid as suggested, based on below considerations:
>
>               * Delay the destroy of hash introduces more effort like
>                 how to set the proper grace period after which no
>                 retry interrupt will be generated. We want to avoid
>                 those complication if we can.
>               * The  problem of the per vm implementation is that it
>                 is hard to delay the destroy of fault hash, because
>                 when vm is destroyed, prescreen function won't be able
>                 to retrieve the fault hash. But in this case, the
>                 prescreen function can either let the interrupt go
>                 through (to gmc) or ignore it. From this perspective,
>                 it is not very necessary to delay the destroy of hash
>                 table.
>               * On the other hand, since ASICs after vega10 have the
>                 HW CAM feature. So the SW IV prescreen is only useful
>                 for vega10. For all the HW implemented CAM, we can
>                 consider the delayed CAM acknowledgment.
>               * Making it per pasid need to introduce another idr to
>                 correlate pasid and hash table - where to put the idr?
>                 You will have to make it a global variable which is
>                 not very desirable.
>
>             The main purpose of this patch is to solve the
>             inter-process lock issue (when hash table is full), while
>             still keep codes simple.
>
>             Also with this patch, the faults kfifo is not necessary
>             any more. See patch 2.
>
>             Regards,
>
>             Oak
>
>             On 09/06/2018 11:28 PM, Oak Zeng wrote:
>
>                 In stead of share one fault hash table per device, make it
>
>                 per vm. This can avoid inter-process lock issue when fault
>
>                 hash table is full.
>
>                   
>
>                 Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>
>                 Signed-off-by: Oak Zeng<Oak.Zeng-5C7GfCeVMHo@public.gmane.org> <mailto:Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
>
>                 Suggested-by: Christian Konig<Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>                 <mailto:Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>
>                 Suggested-by: Felix Kuehling<Felix.Kuehling-5C7GfCeVMHo@public.gmane.org> <mailto:Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
>
>                 ---
>
>                   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 ------------------------
>
>                   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>
>                   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 ++++++++++++++++++++++++++++++++-
>
>                   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>
>                   drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------
>
>                   5 files changed, 127 insertions(+), 110 deletions(-)
>
>                   
>
>                 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>
>                 index 06373d4..4ed8621 100644
>
>                 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>
>                 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>
>                 @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct amdgpu_device *adev)
>
>                     return IRQ_HANDLED;
>
>                   }
>
>                   
>
>                 -/**
>
>                 - * amdgpu_ih_add_fault - Add a page fault record
>
>                 - *
>
>                 - * @adev: amdgpu device pointer
>
>                 - * @key: 64-bit encoding of PASID and address
>
>                 - *
>
>                 - * This should be called when a retry page fault interrupt is
>
>                 - * received. If this is a new page fault, it will be added to a hash
>
>                 - * table. The return value indicates whether this is a new fault, or
>
>                 - * a fault that was already known and is already being handled.
>
>                 - *
>
>                 - * If there are too many pending page faults, this will fail. Retry
>
>                 - * interrupts should be ignored in this case until there is enough
>
>                 - * free space.
>
>                 - *
>
>                 - * Returns 0 if the fault was added, 1 if the fault was already known,
>
>                 - * -ENOSPC if there are too many pending faults.
>
>                 - */
>
>                 -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)
>
>                 -{
>
>                 -  unsigned long flags;
>
>                 -  int r = -ENOSPC;
>
>                 -
>
>                 -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>
>                 -          /* Should be allocated in <IP>_ih_sw_init on GPUs that
>
>                 -           * support retry faults and require retry filtering.
>
>                 -           */
>
>                 -          return r;
>
>                 -
>
>                 -  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>
>                 -
>
>                 -  /* Only let the hash table fill up to 50% for best performance */
>
>                 -  if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>
>                 -          goto unlock_out;
>
>                 -
>
>                 -  r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);
>
>                 -  if (!r)
>
>                 -          adev->irq.ih.faults->count++;
>
>                 -
>
>                 -  /* chash_table_copy_in should never fail unless we're losing count */
>
>                 -  WARN_ON_ONCE(r < 0);
>
>                 -
>
>                 -unlock_out:
>
>                 -  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>
>                 -  return r;
>
>                 -}
>
>                 -
>
>                 -/**
>
>                 - * amdgpu_ih_clear_fault - Remove a page fault record
>
>                 - *
>
>                 - * @adev: amdgpu device pointer
>
>                 - * @key: 64-bit encoding of PASID and address
>
>                 - *
>
>                 - * This should be called when a page fault has been handled. Any
>
>                 - * future interrupt with this key will be processed as a new
>
>                 - * page fault.
>
>                 - */
>
>                 -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)
>
>                 -{
>
>                 -  unsigned long flags;
>
>                 -  int r;
>
>                 -
>
>                 -  if (!adev->irq.ih.faults)
>
>                 -          return;
>
>                 -
>
>                 -  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>
>                 -
>
>                 -  r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);
>
>                 -  if (!WARN_ON_ONCE(r < 0)) {
>
>                 -          adev->irq.ih.faults->count--;
>
>                 -          WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>
>                 -  }
>
>                 -
>
>                 -  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>
>                 -}
>
>                 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>
>                 index a23e1c0..f411ffb 100644
>
>                 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>
>                 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>
>                 @@ -32,13 +32,6 @@ struct amdgpu_device;
>
>                   #define AMDGPU_IH_CLIENTID_LEGACY 0
>
>                   #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>
>                   
>
>                 -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>
>                 -struct amdgpu_retryfault_hashtable {
>
>                 -  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>
>                 -  spinlock_t     lock;
>
>                 -  int            count;
>
>                 -};
>
>                 -
>
>                   /*
>
>                    * R6xx+ IH ring
>
>                    */
>
>                 @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>
>                     bool                   use_doorbell;
>
>                     bool                   use_bus_addr;
>
>                     dma_addr_t             rb_dma_addr; /* only used when use_bus_addr = true */
>
>                 -  struct amdgpu_retryfault_hashtable *faults;
>
>                   };
>
>                   
>
>                   #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>
>                 @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, unsigned ring_size,
>
>                                     bool use_bus_addr);
>
>                   void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>
>                   int amdgpu_ih_process(struct amdgpu_device *adev);
>
>                 -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key);
>
>                 -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key);
>
>                   
>
>                   #endif
>
>                 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>
>                 index 1d7e3c1..8b220e9 100644
>
>                 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>
>                 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>
>                 @@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,
>
>                              adev->vm_manager.fragment_size);
>
>                   }
>
>                   
>
>                 +static struct amdgpu_retryfault_hashtable *init_fault_hash(void)
>
>                 +{
>
>                 +  struct amdgpu_retryfault_hashtable *fault_hash;
>
>                 +
>
>                 +  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
>
>                 +  if (!fault_hash)
>
>                 +          return fault_hash;
>
>                 +
>
>                 +  INIT_CHASH_TABLE(fault_hash->hash,
>
>                 +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>
>                 +  spin_lock_init(&fault_hash->lock);
>
>                 +  fault_hash->count = 0;
>
>                 +
>
>                 +  return fault_hash;
>
>                 +}
>
>                 +
>
>                   /**
>
>                    * amdgpu_vm_init - initialize a vm instance
>
>                    *
>
>                 @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>
>                             vm->pasid = pasid;
>
>                     }
>
>                   
>
>                 +  vm->fault_hash = init_fault_hash();
>
>                 +  if (!vm->fault_hash) {
>
>                 +          r = -ENOMEM;
>
>                 +          goto error_free_root;
>
>                 +  }
>
>                 +
>
>                     INIT_KFIFO(vm->faults);
>
>                     vm->fault_credit = 16;
>
>                   
>
>                 @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>
>                   
>
>                     /* Clear pending page faults from IH when the VM is destroyed */
>
>                     while (kfifo_get(&vm->faults, &fault))
>
>                 -          amdgpu_ih_clear_fault(adev, fault);
>
>                 +          amdgpu_vm_clear_fault(vm->fault_hash, fault);
>
>                   
>
>                     if (vm->pasid) {
>
>                             unsigned long flags;
>
>                 @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>
>                             spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>
>                     }
>
>                   
>
>                 +  kfree(vm->fault_hash);
>
>                 +  vm->fault_hash = NULL;
>
>                 +
>
>                     drm_sched_entity_destroy(&vm->entity);
>
>                   
>
>                     if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>
>                 @@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>
>                             }
>
>                     }
>
>                   }
>
>                 +
>
>                 +/**
>
>                 + * amdgpu_vm_add_fault - Add a page fault record to fault hash table
>
>                 + *
>
>                 + * @fault_hash: fault hash table
>
>                 + * @key: 64-bit encoding of PASID and address
>
>                 + *
>
>                 + * This should be called when a retry page fault interrupt is
>
>                 + * received. If this is a new page fault, it will be added to a hash
>
>                 + * table. The return value indicates whether this is a new fault, or
>
>                 + * a fault that was already known and is already being handled.
>
>                 + *
>
>                 + * If there are too many pending page faults, this will fail. Retry
>
>                 + * interrupts should be ignored in this case until there is enough
>
>                 + * free space.
>
>                 + *
>
>                 + * Returns 0 if the fault was added, 1 if the fault was already known,
>
>                 + * -ENOSPC if there are too many pending faults.
>
>                 + */
>
>                 +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
>
>                 +{
>
>                 +  unsigned long flags;
>
>                 +  int r = -ENOSPC;
>
>                 +
>
>                 +  if (WARN_ON_ONCE(!fault_hash))
>
>                 +          /* Should be allocated in amdgpu_vm_init
>
>                 +           */
>
>                 +          return r;
>
>                 +
>
>                 +  spin_lock_irqsave(&fault_hash->lock, flags);
>
>                 +
>
>                 +  /* Only let the hash table fill up to 50% for best performance */
>
>                 +  if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>
>                 +          goto unlock_out;
>
>                 +
>
>                 +  r = chash_table_copy_in(&fault_hash->hash, key, NULL);
>
>                 +  if (!r)
>
>                 +          fault_hash->count++;
>
>                 +
>
>                 +  /* chash_table_copy_in should never fail unless we're losing count */
>
>                 +  WARN_ON_ONCE(r < 0);
>
>                 +
>
>                 +unlock_out:
>
>                 +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>
>                 +  return r;
>
>                 +}
>
>                 +
>
>                 +/**
>
>                 + * amdgpu_vm_clear_fault - Remove a page fault record
>
>                 + *
>
>                 + * @fault_hash: fault hash table
>
>                 + * @key: 64-bit encoding of PASID and address
>
>                 + *
>
>                 + * This should be called when a page fault has been handled. Any
>
>                 + * future interrupt with this key will be processed as a new
>
>                 + * page fault.
>
>                 + */
>
>                 +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
>
>                 +{
>
>                 +  unsigned long flags;
>
>                 +  int r;
>
>                 +
>
>                 +  if (!fault_hash)
>
>                 +          return;
>
>                 +
>
>                 +  spin_lock_irqsave(&fault_hash->lock, flags);
>
>                 +
>
>                 +  r = chash_table_remove(&fault_hash->hash, key, NULL);
>
>                 +  if (!WARN_ON_ONCE(r < 0)) {
>
>                 +          fault_hash->count--;
>
>                 +          WARN_ON_ONCE(fault_hash->count < 0);
>
>                 +  }
>
>                 +
>
>                 +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>
>                 +}
>
>                 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>
>                 index e275ee7..6eb1da1 100644
>
>                 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>
>                 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>
>                 @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>
>                     pid_t   tgid;
>
>                   };
>
>                   
>
>                 +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>
>                 +struct amdgpu_retryfault_hashtable {
>
>                 +  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>
>                 +  spinlock_t     lock;
>
>                 +  int            count;
>
>                 +};
>
>                 +
>
>                   struct amdgpu_vm {
>
>                     /* tree of virtual addresses mapped */
>
>                     struct rb_root_cached  va;
>
>                 @@ -240,6 +247,7 @@ struct amdgpu_vm {
>
>                     struct ttm_lru_bulk_move lru_bulk_move;
>
>                     /* mark whether can do the bulk move */
>
>                     bool                   bulk_moveable;
>
>                 +  struct amdgpu_retryfault_hashtable *fault_hash;
>
>                   };
>
>                   
>
>                   struct amdgpu_vm_manager {
>
>                 @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>
>                   void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>
>                                            struct amdgpu_vm *vm);
>
>                   
>
>                 +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
>
>                 +
>
>                 +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
>
>                 +
>
>                   #endif
>
>                 diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>
>                 index 5ae5ed2..acbe5a7 100644
>
>                 --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>
>                 +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>
>                 @@ -265,35 +265,36 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>
>                             return true;
>
>                     }
>
>                   
>
>                 -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>
>                 -  key = AMDGPU_VM_FAULT(pasid, addr);
>
>                 -  r = amdgpu_ih_add_fault(adev, key);
>
>                 -
>
>                 -  /* Hash table is full or the fault is already being processed,
>
>                 -   * ignore further page faults
>
>                 -   */
>
>                 -  if (r != 0)
>
>                 -          goto ignore_iv;
>
>                 -
>
>                     /* Track retry faults in per-VM fault FIFO. */
>
>                     spin_lock(&adev->vm_manager.pasid_lock);
>
>                     vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>
>                 +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>
>                 +  key = AMDGPU_VM_FAULT(pasid, addr);
>
>                     if (!vm) {
>
>                             /* VM not found, process it normally */
>
>                             spin_unlock(&adev->vm_manager.pasid_lock);
>
>                 -          amdgpu_ih_clear_fault(adev, key);
>
>                             return true;
>
>                 +  } else {
>
>                 +          r = amdgpu_vm_add_fault(vm->fault_hash, key);
>
>                 +
>
>                 +          /* Hash table is full or the fault is already being processed,
>
>                 +           * ignore further page faults
>
>                 +           */
>
>                 +          if (r != 0) {
>
>                 +                  spin_unlock(&adev->vm_manager.pasid_lock);
>
>                 +                  goto ignore_iv;
>
>                 +          }
>
>                     }
>
>                     /* No locking required with single writer and single reader */
>
>                     r = kfifo_put(&vm->faults, key);
>
>                     if (!r) {
>
>                             /* FIFO is full. Ignore it until there is space */
>
>                 +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>
>                             spin_unlock(&adev->vm_manager.pasid_lock);
>
>                 -          amdgpu_ih_clear_fault(adev, key);
>
>                             goto ignore_iv;
>
>                     }
>
>                 -  spin_unlock(&adev->vm_manager.pasid_lock);
>
>                   
>
>                 +  spin_unlock(&adev->vm_manager.pasid_lock);
>
>                     /* It's the first fault for this address, process it normally */
>
>                     return true;
>
>                   
>
>                 @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void *handle)
>
>                     adev->irq.ih.use_doorbell = true;
>
>                     adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;
>
>                   
>
>                 -  adev->irq.ih.faults = kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>
>                 -  if (!adev->irq.ih.faults)
>
>                 -          return -ENOMEM;
>
>                 -  INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>
>                 -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>
>                 -  spin_lock_init(&adev->irq.ih.faults->lock);
>
>                 -  adev->irq.ih.faults->count = 0;
>
>                 -
>
>                     r = amdgpu_irq_init(adev);
>
>                   
>
>                     return r;
>
>                 @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void *handle)
>
>                     amdgpu_irq_fini(adev);
>
>                     amdgpu_ih_ring_fini(adev);
>
>                   
>
>                 -  kfree(adev->irq.ih.faults);
>
>                 -  adev->irq.ih.faults = NULL;
>
>                 -
>
>                     return 0;
>
>                   }
>
>                   
>
>
>
>
>
>
>
>             _______________________________________________
>
>             amd-gfx mailing list
>
>             amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>             <mailto:amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
>
>             https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>


[-- Attachment #1.2: Type: text/html, Size: 59698 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                     ` <ab2c26bc-e168-eaae-3d80-826a47b3e0d3-5C7GfCeVMHo@public.gmane.org>
@ 2018-09-08  7:34                       ` Christian König
       [not found]                         ` <9b6b78aa-c98a-a741-7ce2-0e7811e1f806-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Christian König @ 2018-09-08  7:34 UTC (permalink / raw)
  To: Felix Kuehling, Zeng, Oak, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

Hi Felix,

> But why do you want to update page tables when there is no more user
> mode context that cares about them? Is this just to allow pending work
> to complete after the user mode context is gone? To prevent hangs?

The problem I'm seeing is that we still don't have a reliable way of 
killing a command submission while making sure that no more faults of it 
are in flight.

E.g. the HWS has a function to kill a running job on the hardware, but 
that only works for compute and there is no way of telling when that 
operation has finished.

But Oak has already convinced me that we should probably work on that 
instead of trying to keep the VM alive as long as possible.

Regards,
Christian.

Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
> Hi Christian,
>
> I'm not sure I get your point here.
>
> As I understand it, the amdgpu_vm structure represents the user mode VM
> context. It has the pointers to the root page directory and the rest of
> the page table hierarchy. If there is no amdgpu_vm, there is no user
> mode context that cares about the page tables.
>
> If we leave the page table pointers in the amdgpu_vm, then handling
> faults after the VM is destroyed is pointless. We can't find the page
> tables and we can't update them, so there is nothing we can do in
> response to VM faults.
>
> So I'm guessing that you want to move the page table pointers into a
> different structure that exists half-independently of the VM. It would
> be created when the VM is created (that's where we currently allocate
> the PASID) but can be destroyed slightly later.
>
> But why do you want to update page tables when there is no more user
> mode context that cares about them? Is this just to allow pending work
> to complete after the user mode context is gone? To prevent hangs?
>
> Regards,
>    Felix
>
> On 2018-09-10 07:20 AM, Christian König wrote:
>>> Am I right that the handling of page fault need a valid VM?
>> No, exactly that view is incorrect.
>>
>> The VM is meant to be a memory management structure of page tables and
>> is completely pointless for fault processing because it represents the
>> future state of the page tables and not the current one.
>>
>> What we need for fault processing is a new structure built around
>> PASIDs which is fed with addresses when page tables are moved
>> around.
>>
>> Alternatively I hope that we can use the SDMA to walk the page tables
>> and update the required entries by just using the address.
>>
>> Regards,
>> Christian.
>>
>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>> Hi Christian,
>>>
>>>   
>>>
>>> What is the value of delaying the destruction of the hash until after vm
>>> is destroyed? Since vm is destroyed, you basically don’t have enough
>>> information to page in the correct page to gpuvm. Am I right that
>>> the handling of page fault need a valid VM? For example, you need the
>>> VM to get the corresponding physical address of the faulty page.
>>>
>>>   
>>>
>>> After vm is destroyed, the retry interrupt can be directly discard as
>>> you can’t find vm through pasid. You can think this is also one kind
>>> of prescreen.
>>>
>>>   
>>>
>>> So I stand by the point that there is no need to delay the
>>> destroy of hash to after vm is destroyed. Prescreening hash can be
>>> destroyed at the time of vm_fini.
>>>
>>>   
>>>
>>> Thanks,
>>>
>>> Oak
>>>
>>>   
>>>
>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com>;
>>> amd-gfx@lists.freedesktop.org
>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>>> amdgpu vm
>>>
>>>   
>>>
>>> Hi Oak,
>>>
>>> yeah, but this is still a NAK. Making the hash per PASID was not a
>>> suggestion but rather a must have.
>>>
>>> The VM structures must be destroyed while the processing is still
>>> ongoing, or otherwise we would not have a clean OOM handling.
>>>
>>> The IDR for PASID lookup can be put into amdgpu_ids.c, you can just
>>> replace the IDA already used there with an IDR.
>>>
>>> Since the PASID is already freed up delayed we should have the grace
>>> period for interrupt processing included. If that still isn't
>>> sufficient we can still add some delayed work for this.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>
>>>      Hi Christian,
>>>
>>>      In this implementation, fault hash is made per vm, not per pasid
>>>      as suggested, based on below considerations:
>>>
>>>        * Delay the destroy of hash introduces more effort like how to
>>>          set the proper grace period after which no retry interrupt
>>>          will be generated. We want to avoid those complication if we
>>>          can.
>>>        * The  problem of the per vm implementation is that it is hard
>>>          to delay the destroy of fault hash, because when vm is
>>>          destroyed, prescreen function won't be able to retrieve the
>>>          fault hash. But in this case, the prescreen function can
>>>          either let the interrupt go through (to gmc) or ignore it.
>>>          From this perspective, it is not very necessary to delay the
>>>          destroy of hash table.
>>>        * On the other hand, since ASICs after vega10 have the HW CAM
>>>          feature. So the SW IV prescreen is only useful for vega10.
>>>          For all the HW implemented CAM, we can consider the delayed
>>>          CAM acknowledgment.
>>>        * Making it per pasid need to introduce another idr to
>>>          correlate pasid and hash table - where to put the idr? You
>>>          will have to make it a global variable which is not very
>>>          desirable.
>>>
>>>      The main purpose of this patch is to solve the inter-process lock
>>>      issue (when hash table is full), while still keep codes simple.
>>>
>>>      Also with this patch, the faults kfifo is not necessary any more.
>>>      See patch 2.
>>>
>>>      Regards,
>>>
>>>      Oak
>>>
>>>       
>>>
>>>      On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>
>>>          In stead of share one fault hash table per device, make it
>>>
>>>          per vm. This can avoid inter-process lock issue when fault
>>>
>>>          hash table is full.
>>>
>>>           
>>>
>>>          Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>
>>>          Signed-off-by: Oak Zeng <Oak.Zeng@amd.com> <mailto:Oak.Zeng@amd.com>
>>>
>>>          Suggested-by: Christian Konig <Christian.Koenig@amd.com> <mailto:Christian.Koenig@amd.com>
>>>
>>>          Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com> <mailto:Felix.Kuehling@amd.com>
>>>
>>>          ---
>>>
>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 ------------------------
>>>
>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>
>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 ++++++++++++++++++++++++++++++++-
>>>
>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>
>>>           drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------
>>>
>>>           5 files changed, 127 insertions(+), 110 deletions(-)
>>>
>>>           
>>>
>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>
>>>          index 06373d4..4ed8621 100644
>>>
>>>          --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>
>>>          +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>
>>>          @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct amdgpu_device *adev)
>>>
>>>             return IRQ_HANDLED;
>>>
>>>           }
>>>
>>>           
>>>
>>>          -/**
>>>
>>>          - * amdgpu_ih_add_fault - Add a page fault record
>>>
>>>          - *
>>>
>>>          - * @adev: amdgpu device pointer
>>>
>>>          - * @key: 64-bit encoding of PASID and address
>>>
>>>          - *
>>>
>>>          - * This should be called when a retry page fault interrupt is
>>>
>>>          - * received. If this is a new page fault, it will be added to a hash
>>>
>>>          - * table. The return value indicates whether this is a new fault, or
>>>
>>>          - * a fault that was already known and is already being handled.
>>>
>>>          - *
>>>
>>>          - * If there are too many pending page faults, this will fail. Retry
>>>
>>>          - * interrupts should be ignored in this case until there is enough
>>>
>>>          - * free space.
>>>
>>>          - *
>>>
>>>          - * Returns 0 if the fault was added, 1 if the fault was already known,
>>>
>>>          - * -ENOSPC if there are too many pending faults.
>>>
>>>          - */
>>>
>>>          -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)
>>>
>>>          -{
>>>
>>>          -  unsigned long flags;
>>>
>>>          -  int r = -ENOSPC;
>>>
>>>          -
>>>
>>>          -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>
>>>          -          /* Should be allocated in <IP>_ih_sw_init on GPUs that
>>>
>>>          -           * support retry faults and require retry filtering.
>>>
>>>          -           */
>>>
>>>          -          return r;
>>>
>>>          -
>>>
>>>          -  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>
>>>          -
>>>
>>>          -  /* Only let the hash table fill up to 50% for best performance */
>>>
>>>          -  if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>
>>>          -          goto unlock_out;
>>>
>>>          -
>>>
>>>          -  r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);
>>>
>>>          -  if (!r)
>>>
>>>          -          adev->irq.ih.faults->count++;
>>>
>>>          -
>>>
>>>          -  /* chash_table_copy_in should never fail unless we're losing count */
>>>
>>>          -  WARN_ON_ONCE(r < 0);
>>>
>>>          -
>>>
>>>          -unlock_out:
>>>
>>>          -  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>>>
>>>          -  return r;
>>>
>>>          -}
>>>
>>>          -
>>>
>>>          -/**
>>>
>>>          - * amdgpu_ih_clear_fault - Remove a page fault record
>>>
>>>          - *
>>>
>>>          - * @adev: amdgpu device pointer
>>>
>>>          - * @key: 64-bit encoding of PASID and address
>>>
>>>          - *
>>>
>>>          - * This should be called when a page fault has been handled. Any
>>>
>>>          - * future interrupt with this key will be processed as a new
>>>
>>>          - * page fault.
>>>
>>>          - */
>>>
>>>          -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)
>>>
>>>          -{
>>>
>>>          -  unsigned long flags;
>>>
>>>          -  int r;
>>>
>>>          -
>>>
>>>          -  if (!adev->irq.ih.faults)
>>>
>>>          -          return;
>>>
>>>          -
>>>
>>>          -  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>
>>>          -
>>>
>>>          -  r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);
>>>
>>>          -  if (!WARN_ON_ONCE(r < 0)) {
>>>
>>>          -          adev->irq.ih.faults->count--;
>>>
>>>          -          WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>
>>>          -  }
>>>
>>>          -
>>>
>>>          -  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>>>
>>>          -}
>>>
>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>
>>>          index a23e1c0..f411ffb 100644
>>>
>>>          --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>
>>>          +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>
>>>          @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>
>>>           #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>
>>>           #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>
>>>           
>>>
>>>          -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>
>>>          -struct amdgpu_retryfault_hashtable {
>>>
>>>          -  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>
>>>          -  spinlock_t     lock;
>>>
>>>          -  int            count;
>>>
>>>          -};
>>>
>>>          -
>>>
>>>           /*
>>>
>>>            * R6xx+ IH ring
>>>
>>>            */
>>>
>>>          @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>
>>>             bool                   use_doorbell;
>>>
>>>             bool                   use_bus_addr;
>>>
>>>             dma_addr_t             rb_dma_addr; /* only used when use_bus_addr = true */
>>>
>>>          -  struct amdgpu_retryfault_hashtable *faults;
>>>
>>>           };
>>>
>>>           
>>>
>>>           #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>
>>>          @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, unsigned ring_size,
>>>
>>>                             bool use_bus_addr);
>>>
>>>           void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>>
>>>           int amdgpu_ih_process(struct amdgpu_device *adev);
>>>
>>>          -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key);
>>>
>>>          -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key);
>>>
>>>           
>>>
>>>           #endif
>>>
>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>
>>>          index 1d7e3c1..8b220e9 100644
>>>
>>>          --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>
>>>          +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>
>>>          @@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,
>>>
>>>                      adev->vm_manager.fragment_size);
>>>
>>>           }
>>>
>>>           
>>>
>>>          +static struct amdgpu_retryfault_hashtable *init_fault_hash(void)
>>>
>>>          +{
>>>
>>>          +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>
>>>          +
>>>
>>>          +  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
>>>
>>>          +  if (!fault_hash)
>>>
>>>          +          return fault_hash;
>>>
>>>          +
>>>
>>>          +  INIT_CHASH_TABLE(fault_hash->hash,
>>>
>>>          +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>
>>>          +  spin_lock_init(&fault_hash->lock);
>>>
>>>          +  fault_hash->count = 0;
>>>
>>>          +
>>>
>>>          +  return fault_hash;
>>>
>>>          +}
>>>
>>>          +
>>>
>>>           /**
>>>
>>>            * amdgpu_vm_init - initialize a vm instance
>>>
>>>            *
>>>
>>>          @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>>
>>>                     vm->pasid = pasid;
>>>
>>>             }
>>>
>>>           
>>>
>>>          +  vm->fault_hash = init_fault_hash();
>>>
>>>          +  if (!vm->fault_hash) {
>>>
>>>          +          r = -ENOMEM;
>>>
>>>          +          goto error_free_root;
>>>
>>>          +  }
>>>
>>>          +
>>>
>>>             INIT_KFIFO(vm->faults);
>>>
>>>             vm->fault_credit = 16;
>>>
>>>           
>>>
>>>          @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>>
>>>           
>>>
>>>             /* Clear pending page faults from IH when the VM is destroyed */
>>>
>>>             while (kfifo_get(&vm->faults, &fault))
>>>
>>>          -          amdgpu_ih_clear_fault(adev, fault);
>>>
>>>          +          amdgpu_vm_clear_fault(vm->fault_hash, fault);
>>>
>>>           
>>>
>>>             if (vm->pasid) {
>>>
>>>                     unsigned long flags;
>>>
>>>          @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>>
>>>                     spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>
>>>             }
>>>
>>>           
>>>
>>>          +  kfree(vm->fault_hash);
>>>
>>>          +  vm->fault_hash = NULL;
>>>
>>>          +
>>>
>>>             drm_sched_entity_destroy(&vm->entity);
>>>
>>>           
>>>
>>>             if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>
>>>          @@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>
>>>                     }
>>>
>>>             }
>>>
>>>           }
>>>
>>>          +
>>>
>>>          +/**
>>>
>>>          + * amdgpu_vm_add_fault - Add a page fault record to fault hash table
>>>
>>>          + *
>>>
>>>          + * @fault_hash: fault hash table
>>>
>>>          + * @key: 64-bit encoding of PASID and address
>>>
>>>          + *
>>>
>>>          + * This should be called when a retry page fault interrupt is
>>>
>>>          + * received. If this is a new page fault, it will be added to a hash
>>>
>>>          + * table. The return value indicates whether this is a new fault, or
>>>
>>>          + * a fault that was already known and is already being handled.
>>>
>>>          + *
>>>
>>>          + * If there are too many pending page faults, this will fail. Retry
>>>
>>>          + * interrupts should be ignored in this case until there is enough
>>>
>>>          + * free space.
>>>
>>>          + *
>>>
>>>          + * Returns 0 if the fault was added, 1 if the fault was already known,
>>>
>>>          + * -ENOSPC if there are too many pending faults.
>>>
>>>          + */
>>>
>>>          +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>
>>>          +{
>>>
>>>          +  unsigned long flags;
>>>
>>>          +  int r = -ENOSPC;
>>>
>>>          +
>>>
>>>          +  if (WARN_ON_ONCE(!fault_hash))
>>>
>>>          +          /* Should be allocated in amdgpu_vm_init
>>>
>>>          +           */
>>>
>>>          +          return r;
>>>
>>>          +
>>>
>>>          +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>
>>>          +
>>>
>>>          +  /* Only let the hash table fill up to 50% for best performance */
>>>
>>>          +  if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>
>>>          +          goto unlock_out;
>>>
>>>          +
>>>
>>>          +  r = chash_table_copy_in(&fault_hash->hash, key, NULL);
>>>
>>>          +  if (!r)
>>>
>>>          +          fault_hash->count++;
>>>
>>>          +
>>>
>>>          +  /* chash_table_copy_in should never fail unless we're losing count */
>>>
>>>          +  WARN_ON_ONCE(r < 0);
>>>
>>>          +
>>>
>>>          +unlock_out:
>>>
>>>          +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>
>>>          +  return r;
>>>
>>>          +}
>>>
>>>          +
>>>
>>>          +/**
>>>
>>>          + * amdgpu_vm_clear_fault - Remove a page fault record
>>>
>>>          + *
>>>
>>>          + * @fault_hash: fault hash table
>>>
>>>          + * @key: 64-bit encoding of PASID and address
>>>
>>>          + *
>>>
>>>          + * This should be called when a page fault has been handled. Any
>>>
>>>          + * future interrupt with this key will be processed as a new
>>>
>>>          + * page fault.
>>>
>>>          + */
>>>
>>>          +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>
>>>          +{
>>>
>>>          +  unsigned long flags;
>>>
>>>          +  int r;
>>>
>>>          +
>>>
>>>          +  if (!fault_hash)
>>>
>>>          +          return;
>>>
>>>          +
>>>
>>>          +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>
>>>          +
>>>
>>>          +  r = chash_table_remove(&fault_hash->hash, key, NULL);
>>>
>>>          +  if (!WARN_ON_ONCE(r < 0)) {
>>>
>>>          +          fault_hash->count--;
>>>
>>>          +          WARN_ON_ONCE(fault_hash->count < 0);
>>>
>>>          +  }
>>>
>>>          +
>>>
>>>          +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>
>>>          +}
>>>
>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>
>>>          index e275ee7..6eb1da1 100644
>>>
>>>          --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>
>>>          +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>
>>>          @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>
>>>             pid_t   tgid;
>>>
>>>           };
>>>
>>>           
>>>
>>>          +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>
>>>          +struct amdgpu_retryfault_hashtable {
>>>
>>>          +  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>
>>>          +  spinlock_t     lock;
>>>
>>>          +  int            count;
>>>
>>>          +};
>>>
>>>          +
>>>
>>>           struct amdgpu_vm {
>>>
>>>             /* tree of virtual addresses mapped */
>>>
>>>             struct rb_root_cached  va;
>>>
>>>          @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>
>>>             struct ttm_lru_bulk_move lru_bulk_move;
>>>
>>>             /* mark whether can do the bulk move */
>>>
>>>             bool                   bulk_moveable;
>>>
>>>          +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>
>>>           };
>>>
>>>           
>>>
>>>           struct amdgpu_vm_manager {
>>>
>>>          @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>>>
>>>           void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>>>
>>>                                    struct amdgpu_vm *vm);
>>>
>>>           
>>>
>>>          +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>
>>>          +
>>>
>>>          +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>
>>>          +
>>>
>>>           #endif
>>>
>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>
>>>          index 5ae5ed2..acbe5a7 100644
>>>
>>>          --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>
>>>          +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>
>>>          @@ -265,35 +265,36 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>
>>>                     return true;
>>>
>>>             }
>>>
>>>           
>>>
>>>          -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>
>>>          -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>
>>>          -  r = amdgpu_ih_add_fault(adev, key);
>>>
>>>          -
>>>
>>>          -  /* Hash table is full or the fault is already being processed,
>>>
>>>          -   * ignore further page faults
>>>
>>>          -   */
>>>
>>>          -  if (r != 0)
>>>
>>>          -          goto ignore_iv;
>>>
>>>          -
>>>
>>>             /* Track retry faults in per-VM fault FIFO. */
>>>
>>>             spin_lock(&adev->vm_manager.pasid_lock);
>>>
>>>             vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>
>>>          +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>
>>>          +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>
>>>             if (!vm) {
>>>
>>>                     /* VM not found, process it normally */
>>>
>>>                     spin_unlock(&adev->vm_manager.pasid_lock);
>>>
>>>          -          amdgpu_ih_clear_fault(adev, key);
>>>
>>>                     return true;
>>>
>>>          +  } else {
>>>
>>>          +          r = amdgpu_vm_add_fault(vm->fault_hash, key);
>>>
>>>          +
>>>
>>>          +          /* Hash table is full or the fault is already being processed,
>>>
>>>          +           * ignore further page faults
>>>
>>>          +           */
>>>
>>>          +          if (r != 0) {
>>>
>>>          +                  spin_unlock(&adev->vm_manager.pasid_lock);
>>>
>>>          +                  goto ignore_iv;
>>>
>>>          +          }
>>>
>>>             }
>>>
>>>             /* No locking required with single writer and single reader */
>>>
>>>             r = kfifo_put(&vm->faults, key);
>>>
>>>             if (!r) {
>>>
>>>                     /* FIFO is full. Ignore it until there is space */
>>>
>>>          +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>>>
>>>                     spin_unlock(&adev->vm_manager.pasid_lock);
>>>
>>>          -          amdgpu_ih_clear_fault(adev, key);
>>>
>>>                     goto ignore_iv;
>>>
>>>             }
>>>
>>>          -  spin_unlock(&adev->vm_manager.pasid_lock);
>>>
>>>           
>>>
>>>          +  spin_unlock(&adev->vm_manager.pasid_lock);
>>>
>>>             /* It's the first fault for this address, process it normally */
>>>
>>>             return true;
>>>
>>>           
>>>
>>>          @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void *handle)
>>>
>>>             adev->irq.ih.use_doorbell = true;
>>>
>>>             adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;
>>>
>>>           
>>>
>>>          -  adev->irq.ih.faults = kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>
>>>          -  if (!adev->irq.ih.faults)
>>>
>>>          -          return -ENOMEM;
>>>
>>>          -  INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>
>>>          -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>
>>>          -  spin_lock_init(&adev->irq.ih.faults->lock);
>>>
>>>          -  adev->irq.ih.faults->count = 0;
>>>
>>>          -
>>>
>>>             r = amdgpu_irq_init(adev);
>>>
>>>           
>>>
>>>             return r;
>>>
>>>          @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void *handle)
>>>
>>>             amdgpu_irq_fini(adev);
>>>
>>>             amdgpu_ih_ring_fini(adev);
>>>
>>>           
>>>
>>>          -  kfree(adev->irq.ih.faults);
>>>
>>>          -  adev->irq.ih.faults = NULL;
>>>
>>>          -
>>>
>>>             return 0;
>>>
>>>           }
>>>
>>>           
>>>
>>>
>>>
>>>
>>>
>>>      _______________________________________________
>>>
>>>      amd-gfx mailing list
>>>
>>>      amd-gfx@lists.freedesktop.org <mailto:amd-gfx@lists.freedesktop.org>
>>>
>>>      https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>
>>>   
>>>
>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                         ` <9b6b78aa-c98a-a741-7ce2-0e7811e1f806-5C7GfCeVMHo@public.gmane.org>
@ 2018-09-10  9:42                           ` Christian König
       [not found]                             ` <143b57c2-12d1-8cb1-ddea-a8fe7226e1c4-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Christian König @ 2018-09-10  9:42 UTC (permalink / raw)
  To: Christian König, Felix Kuehling, Zeng, Oak, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

Hi Felix & Oak,

over the weekend I had the idea that we could just use the shadow BOs to 
have the current state in a page fault. They are GTT BOs and CPU 
accessible anyway.

Regards,
Christian.

Am 08.09.2018 um 09:34 schrieb Christian König:
> Hi Felix,
>
>> But why do you want to update page tables when there is no more user
>> mode context that cares about them? Is this just to allow pending work
>> to complete after the user mode context is gone? To prevent hangs?
>
> The problem I see is that we still don't have a reliable way of 
> killing a command submission while making sure that no more faults of 
> it are in flight.
>
> E.g. the HWS has a function to kill a running job on the hardware, but 
> that only works for compute and there is no way of telling when that 
> operation has finished.
>
> But Oak has already convinced me that we should probably work on that 
> instead of trying to keep the VM alive as long as possible.
>
> Regards,
> Christian.
>
> Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
>> Hi Christian,
>>
>> I'm not sure I get your point here.
>>
>> As I understand it, the amdgpu_vm structure represents the user mode VM
>> context. It has the pointers to the root page directory and the rest of
>> the page table hierarchy. If there is no amdgpu_vm, there is no user
>> mode context that cares about the page tables.
>>
>> If we leave the page table pointers in the amdgpu_vm, then handling
>> faults after the VM is destroyed is pointless. We can't find the page
>> tables and we can't update them, so there is nothing we can do in
>> response to VM faults.
>>
>> So I'm guessing that you want to move the page table pointers into a
>> different structure that exists half-independently of the VM. It would
>> be created when the VM is created (that's where we currently allocate
>> the PASID) but can be destroyed slightly later.
>>
>> But why do you want to update page tables when there is no more user
>> mode context that cares about them? Is this just to allow pending work
>> to complete after the user mode context is gone? To prevent hangs?
>>
>> Regards,
>>    Felix
>>
>> On 2018-09-10 07:20 AM, Christian König wrote:
>>>> Am I right that the handling of page fault need a valid VM?
>>> No, exactly that view is incorrect.
>>>
>>> The VM is meant to be a memory management structure of page tables and
>>> is completely pointless for fault processing because it represents the
>>> future state of the page tables and not the current one.
>>>
>>> What we need for fault processing is a new structure built around
>>> PASIDs which is fed with addresses when page tables are moved
>>> around.
>>>
>>> Alternatively I hope that we can use the SDMA to walk the page tables
>>> and update the required entries by just using the address.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>>> Hi Christian,
>>>>
>>>>
>>>> What is the value of delaying the destruction of the hash until after
>>>> the vm is destroyed? Since the vm is destroyed, you basically don’t
>>>> have enough information to page in the correct page to gpuvm. Am I
>>>> right that the handling of a page fault needs a valid VM? For example,
>>>> you need the VM to get the corresponding physical address of the faulty page.
>>>>
>>>>
>>>> After the vm is destroyed, the retry interrupt can be directly
>>>> discarded as you can’t find the vm through the pasid. You can think of
>>>> this as one kind of prescreening.
>>>>
>>>>
>>>> So I stand by the point that there is no need to delay the
>>>> destruction of the hash until after the vm is destroyed. The
>>>> prescreening hash can be destroyed at the time of vm_fini.
>>>>
>>>>
>>>> Thanks,
>>>>
>>>> Oak
>>>>
>>>>
>>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com>;
>>>> amd-gfx@lists.freedesktop.org
>>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>>>> amdgpu vm
>>>>
>>>>
>>>> Hi Oak,
>>>>
>>>> yeah, but this is still a NAK. Making the hash per PASID was not a
>>>> suggestion but rather a must have.
>>>>
>>>> The VM structures must be destroyed while the processing is still
>>>> ongoing, or otherwise we would not have a clean OOM handling.
>>>>
>>>> The IDR for PASID lookup can be put into amdgpu_ids.c; you can just
>>>> replace the IDA already used there with an IDR.
>>>>
>>>> Since the PASID is already freed up delayed we should have the grace
>>>> period for interrupt processing included. If that still isn't
>>>> sufficient we can still add some delayed work for this.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>>
>>>>      Hi Christian,
>>>>
>>>>      In this implementation, the fault hash is made per vm, not per
>>>>      pasid as suggested, based on the considerations below:
>>>>
>>>>        * Delaying the destruction of the hash introduces more effort,
>>>>          like how to set the proper grace period after which no retry
>>>>          interrupt will be generated. We want to avoid that
>>>>          complication if we can.
>>>>        * The problem with the per vm implementation is that it is hard
>>>>          to delay the destruction of the fault hash, because when the
>>>>          vm is destroyed, the prescreen function won't be able to
>>>>          retrieve the fault hash. But in this case, the prescreen
>>>>          function can either let the interrupt go through (to gmc) or
>>>>          ignore it. From this perspective, it is not really necessary
>>>>          to delay the destruction of the hash table.
>>>>        * On the other hand, ASICs after vega10 have the HW CAM
>>>>          feature, so the SW IV prescreen is only useful for vega10.
>>>>          For all the HW-implemented CAM, we can consider the delayed
>>>>          CAM acknowledgment.
>>>>        * Making it per pasid requires introducing another idr to
>>>>          correlate pasid and hash table - where would the idr go? You
>>>>          would have to make it a global variable, which is not very
>>>>          desirable.
>>>>
>>>>      The main purpose of this patch is to solve the inter-process lock
>>>>      issue (when hash table is full), while still keep codes simple.
>>>>
>>>>      Also with this patch, the faults kfifo is not necessary any more.
>>>>      See patch 2.
>>>>
>>>>      Regards,
>>>>
>>>>      Oak
>>>>
>>>>
>>>>      On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>>
>>>>          Instead of sharing one fault hash table per device, make it
>>>>
>>>>          per vm. This can avoid an inter-process lock issue when the
>>>>
>>>>          fault hash table is full.
>>>>
>>>>
>>>>          Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>>
>>>>          Signed-off-by: Oak Zeng <Oak.Zeng@amd.com> 
>>>> <mailto:Oak.Zeng@amd.com>
>>>>
>>>>          Suggested-by: Christian Konig <Christian.Koenig@amd.com> 
>>>> <mailto:Christian.Koenig@amd.com>
>>>>
>>>>          Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com> 
>>>> <mailto:Felix.Kuehling@amd.com>
>>>>
>>>>          ---
>>>>
>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 
>>>> ------------------------
>>>>
>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>>
>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 
>>>> ++++++++++++++++++++++++++++++++-
>>>>
>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>>
>>>>           drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------
>>>>
>>>>           5 files changed, 127 insertions(+), 110 deletions(-)
>>>>
>>>>
>>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>
>>>>          index 06373d4..4ed8621 100644
>>>>
>>>>          --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>
>>>>          +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>
>>>>          @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct 
>>>> amdgpu_device *adev)
>>>>
>>>>             return IRQ_HANDLED;
>>>>
>>>>           }
>>>>
>>>>
>>>>          -/**
>>>>
>>>>          - * amdgpu_ih_add_fault - Add a page fault record
>>>>
>>>>          - *
>>>>
>>>>          - * @adev: amdgpu device pointer
>>>>
>>>>          - * @key: 64-bit encoding of PASID and address
>>>>
>>>>          - *
>>>>
>>>>          - * This should be called when a retry page fault 
>>>> interrupt is
>>>>
>>>>          - * received. If this is a new page fault, it will be 
>>>> added to a hash
>>>>
>>>>          - * table. The return value indicates whether this is a 
>>>> new fault, or
>>>>
>>>>          - * a fault that was already known and is already being 
>>>> handled.
>>>>
>>>>          - *
>>>>
>>>>          - * If there are too many pending page faults, this will 
>>>> fail. Retry
>>>>
>>>>          - * interrupts should be ignored in this case until there 
>>>> is enough
>>>>
>>>>          - * free space.
>>>>
>>>>          - *
>>>>
>>>>          - * Returns 0 if the fault was added, 1 if the fault was 
>>>> already known,
>>>>
>>>>          - * -ENOSPC if there are too many pending faults.
>>>>
>>>>          - */
>>>>
>>>>          -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)
>>>>
>>>>          -{
>>>>
>>>>          -  unsigned long flags;
>>>>
>>>>          -  int r = -ENOSPC;
>>>>
>>>>          -
>>>>
>>>>          -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>>
>>>>          -          /* Should be allocated in <IP>_ih_sw_init on 
>>>> GPUs that
>>>>
>>>>          -           * support retry faults and require retry 
>>>> filtering.
>>>>
>>>>          -           */
>>>>
>>>>          -          return r;
>>>>
>>>>          -
>>>>
>>>>          - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>
>>>>          -
>>>>
>>>>          -  /* Only let the hash table fill up to 50% for best 
>>>> performance */
>>>>
>>>>          -  if (adev->irq.ih.faults->count >= (1 << 
>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>
>>>>          -          goto unlock_out;
>>>>
>>>>          -
>>>>
>>>>          -  r = chash_table_copy_in(&adev->irq.ih.faults->hash, 
>>>> key, NULL);
>>>>
>>>>          -  if (!r)
>>>>
>>>>          -          adev->irq.ih.faults->count++;
>>>>
>>>>          -
>>>>
>>>>          -  /* chash_table_copy_in should never fail unless we're 
>>>> losing count */
>>>>
>>>>          -  WARN_ON_ONCE(r < 0);
>>>>
>>>>          -
>>>>
>>>>          -unlock_out:
>>>>
>>>>          - spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>>>>
>>>>          -  return r;
>>>>
>>>>          -}
>>>>
>>>>          -
>>>>
>>>>          -/**
>>>>
>>>>          - * amdgpu_ih_clear_fault - Remove a page fault record
>>>>
>>>>          - *
>>>>
>>>>          - * @adev: amdgpu device pointer
>>>>
>>>>          - * @key: 64-bit encoding of PASID and address
>>>>
>>>>          - *
>>>>
>>>>          - * This should be called when a page fault has been 
>>>> handled. Any
>>>>
>>>>          - * future interrupt with this key will be processed as a new
>>>>
>>>>          - * page fault.
>>>>
>>>>          - */
>>>>
>>>>          -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, 
>>>> u64 key)
>>>>
>>>>          -{
>>>>
>>>>          -  unsigned long flags;
>>>>
>>>>          -  int r;
>>>>
>>>>          -
>>>>
>>>>          -  if (!adev->irq.ih.faults)
>>>>
>>>>          -          return;
>>>>
>>>>          -
>>>>
>>>>          - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>
>>>>          -
>>>>
>>>>          -  r = chash_table_remove(&adev->irq.ih.faults->hash, key, 
>>>> NULL);
>>>>
>>>>          -  if (!WARN_ON_ONCE(r < 0)) {
>>>>
>>>>          -          adev->irq.ih.faults->count--;
>>>>
>>>>          - WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>>
>>>>          -  }
>>>>
>>>>          -
>>>>
>>>>          - spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>>>>
>>>>          -}
>>>>
>>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>
>>>>          index a23e1c0..f411ffb 100644
>>>>
>>>>          --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>
>>>>          +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>
>>>>          @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>>
>>>>           #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>>
>>>>           #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>>
>>>>
>>>>          -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>
>>>>          -struct amdgpu_retryfault_hashtable {
>>>>
>>>>          -  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 
>>>> 8, 0);
>>>>
>>>>          -  spinlock_t     lock;
>>>>
>>>>          -  int            count;
>>>>
>>>>          -};
>>>>
>>>>          -
>>>>
>>>>           /*
>>>>
>>>>            * R6xx+ IH ring
>>>>
>>>>            */
>>>>
>>>>          @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>>
>>>>             bool                   use_doorbell;
>>>>
>>>>             bool                   use_bus_addr;
>>>>
>>>>             dma_addr_t             rb_dma_addr; /* only used when 
>>>> use_bus_addr = true */
>>>>
>>>>          -  struct amdgpu_retryfault_hashtable *faults;
>>>>
>>>>           };
>>>>
>>>>
>>>>           #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>>
>>>>          @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct 
>>>> amdgpu_device *adev, unsigned ring_size,
>>>>
>>>>                             bool use_bus_addr);
>>>>
>>>>           void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>>>
>>>>           int amdgpu_ih_process(struct amdgpu_device *adev);
>>>>
>>>>          -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 
>>>> key);
>>>>
>>>>          -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, 
>>>> u64 key);
>>>>
>>>>
>>>>           #endif
>>>>
>>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>
>>>>          index 1d7e3c1..8b220e9 100644
>>>>
>>>>          --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>
>>>>          +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>
>>>>          @@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct 
>>>> amdgpu_device *adev, uint32_t min_vm_size,
>>>>
>>>>                      adev->vm_manager.fragment_size);
>>>>
>>>>           }
>>>>
>>>>
>>>>          +static struct amdgpu_retryfault_hashtable 
>>>> *init_fault_hash(void)
>>>>
>>>>          +{
>>>>
>>>>          +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>
>>>>          +
>>>>
>>>>          +  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
>>>>
>>>>          +  if (!fault_hash)
>>>>
>>>>          +          return fault_hash;
>>>>
>>>>          +
>>>>
>>>>          +  INIT_CHASH_TABLE(fault_hash->hash,
>>>>
>>>>          +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>
>>>>          +  spin_lock_init(&fault_hash->lock);
>>>>
>>>>          +  fault_hash->count = 0;
>>>>
>>>>          +
>>>>
>>>>          +  return fault_hash;
>>>>
>>>>          +}
>>>>
>>>>          +
>>>>
>>>>           /**
>>>>
>>>>            * amdgpu_vm_init - initialize a vm instance
>>>>
>>>>            *
>>>>
>>>>          @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct 
>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>
>>>>                     vm->pasid = pasid;
>>>>
>>>>             }
>>>>
>>>>
>>>>          +  vm->fault_hash = init_fault_hash();
>>>>
>>>>          +  if (!vm->fault_hash) {
>>>>
>>>>          +          r = -ENOMEM;
>>>>
>>>>          +          goto error_free_root;
>>>>
>>>>          +  }
>>>>
>>>>          +
>>>>
>>>>             INIT_KFIFO(vm->faults);
>>>>
>>>>             vm->fault_credit = 16;
>>>>
>>>>
>>>>          @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct 
>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>
>>>>
>>>>             /* Clear pending page faults from IH when the VM is 
>>>> destroyed */
>>>>
>>>>             while (kfifo_get(&vm->faults, &fault))
>>>>
>>>>          -          amdgpu_ih_clear_fault(adev, fault);
>>>>
>>>>          +          amdgpu_vm_clear_fault(vm->fault_hash, fault);
>>>>
>>>>
>>>>             if (vm->pasid) {
>>>>
>>>>                     unsigned long flags;
>>>>
>>>>          @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct 
>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>
>>>> spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>>
>>>>             }
>>>>
>>>>
>>>>          +  kfree(vm->fault_hash);
>>>>
>>>>          +  vm->fault_hash = NULL;
>>>>
>>>>          +
>>>>
>>>>             drm_sched_entity_destroy(&vm->entity);
>>>>
>>>>
>>>>             if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>>
>>>>          @@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct 
>>>> amdgpu_vm *vm)
>>>>
>>>>                     }
>>>>
>>>>             }
>>>>
>>>>           }
>>>>
>>>>          +
>>>>
>>>>          +/**
>>>>
>>>>          + * amdgpu_vm_add_fault - Add a page fault record to fault 
>>>> hash table
>>>>
>>>>          + *
>>>>
>>>>          + * @fault_hash: fault hash table
>>>>
>>>>          + * @key: 64-bit encoding of PASID and address
>>>>
>>>>          + *
>>>>
>>>>          + * This should be called when a retry page fault 
>>>> interrupt is
>>>>
>>>>          + * received. If this is a new page fault, it will be 
>>>> added to a hash
>>>>
>>>>          + * table. The return value indicates whether this is a 
>>>> new fault, or
>>>>
>>>>          + * a fault that was already known and is already being 
>>>> handled.
>>>>
>>>>          + *
>>>>
>>>>          + * If there are too many pending page faults, this will 
>>>> fail. Retry
>>>>
>>>>          + * interrupts should be ignored in this case until there 
>>>> is enough
>>>>
>>>>          + * free space.
>>>>
>>>>          + *
>>>>
>>>>          + * Returns 0 if the fault was added, 1 if the fault was 
>>>> already known,
>>>>
>>>>          + * -ENOSPC if there are too many pending faults.
>>>>
>>>>          + */
>>>>
>>>>          +int amdgpu_vm_add_fault(struct 
>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>
>>>>          +{
>>>>
>>>>          +  unsigned long flags;
>>>>
>>>>          +  int r = -ENOSPC;
>>>>
>>>>          +
>>>>
>>>>          +  if (WARN_ON_ONCE(!fault_hash))
>>>>
>>>>          +          /* Should be allocated in amdgpu_vm_init
>>>>
>>>>          +           */
>>>>
>>>>          +          return r;
>>>>
>>>>          +
>>>>
>>>>          +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>
>>>>          +
>>>>
>>>>          +  /* Only let the hash table fill up to 50% for best 
>>>> performance */
>>>>
>>>>          +  if (fault_hash->count >= (1 << 
>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>
>>>>          +          goto unlock_out;
>>>>
>>>>          +
>>>>
>>>>          +  r = chash_table_copy_in(&fault_hash->hash, key, NULL);
>>>>
>>>>          +  if (!r)
>>>>
>>>>          +          fault_hash->count++;
>>>>
>>>>          +
>>>>
>>>>          +  /* chash_table_copy_in should never fail unless we're 
>>>> losing count */
>>>>
>>>>          +  WARN_ON_ONCE(r < 0);
>>>>
>>>>          +
>>>>
>>>>          +unlock_out:
>>>>
>>>>          +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>
>>>>          +  return r;
>>>>
>>>>          +}
>>>>
>>>>          +
>>>>
>>>>          +/**
>>>>
>>>>          + * amdgpu_vm_clear_fault - Remove a page fault record
>>>>
>>>>          + *
>>>>
>>>>          + * @fault_hash: fault hash table
>>>>
>>>>          + * @key: 64-bit encoding of PASID and address
>>>>
>>>>          + *
>>>>
>>>>          + * This should be called when a page fault has been 
>>>> handled. Any
>>>>
>>>>          + * future interrupt with this key will be processed as a new
>>>>
>>>>          + * page fault.
>>>>
>>>>          + */
>>>>
>>>>          +void amdgpu_vm_clear_fault(struct 
>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>
>>>>          +{
>>>>
>>>>          +  unsigned long flags;
>>>>
>>>>          +  int r;
>>>>
>>>>          +
>>>>
>>>>          +  if (!fault_hash)
>>>>
>>>>          +          return;
>>>>
>>>>          +
>>>>
>>>>          +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>
>>>>          +
>>>>
>>>>          +  r = chash_table_remove(&fault_hash->hash, key, NULL);
>>>>
>>>>          +  if (!WARN_ON_ONCE(r < 0)) {
>>>>
>>>>          +          fault_hash->count--;
>>>>
>>>>          +          WARN_ON_ONCE(fault_hash->count < 0);
>>>>
>>>>          +  }
>>>>
>>>>          +
>>>>
>>>>          +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>
>>>>          +}
>>>>
>>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>
>>>>          index e275ee7..6eb1da1 100644
>>>>
>>>>          --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>
>>>>          +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>
>>>>          @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>>
>>>>             pid_t   tgid;
>>>>
>>>>           };
>>>>
>>>>
>>>>          +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>
>>>>          +struct amdgpu_retryfault_hashtable {
>>>>
>>>>          +  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 
>>>> 8, 0);
>>>>
>>>>          +  spinlock_t     lock;
>>>>
>>>>          +  int            count;
>>>>
>>>>          +};
>>>>
>>>>          +
>>>>
>>>>           struct amdgpu_vm {
>>>>
>>>>             /* tree of virtual addresses mapped */
>>>>
>>>>             struct rb_root_cached  va;
>>>>
>>>>          @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>>
>>>>             struct ttm_lru_bulk_move lru_bulk_move;
>>>>
>>>>             /* mark whether can do the bulk move */
>>>>
>>>>             bool                   bulk_moveable;
>>>>
>>>>          +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>
>>>>           };
>>>>
>>>>
>>>>           struct amdgpu_vm_manager {
>>>>
>>>>          @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct 
>>>> amdgpu_vm *vm);
>>>>
>>>>           void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>>>>
>>>>                                    struct amdgpu_vm *vm);
>>>>
>>>>
>>>>          +int amdgpu_vm_add_fault(struct 
>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>
>>>>          +
>>>>
>>>>          +void amdgpu_vm_clear_fault(struct 
>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>
>>>>          +
>>>>
>>>>           #endif
>>>>
>>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>
>>>>          index 5ae5ed2..acbe5a7 100644
>>>>
>>>>          --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>
>>>>          +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>
>>>>          @@ -265,35 +265,36 @@ static bool 
>>>> vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>>
>>>>                     return true;
>>>>
>>>>             }
>>>>
>>>>
>>>>          -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>
>>>>          -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>
>>>>          -  r = amdgpu_ih_add_fault(adev, key);
>>>>
>>>>          -
>>>>
>>>>          -  /* Hash table is full or the fault is already being 
>>>> processed,
>>>>
>>>>          -   * ignore further page faults
>>>>
>>>>          -   */
>>>>
>>>>          -  if (r != 0)
>>>>
>>>>          -          goto ignore_iv;
>>>>
>>>>          -
>>>>
>>>>             /* Track retry faults in per-VM fault FIFO. */
>>>>
>>>>             spin_lock(&adev->vm_manager.pasid_lock);
>>>>
>>>>             vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>>
>>>>          +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>
>>>>          +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>
>>>>             if (!vm) {
>>>>
>>>>                     /* VM not found, process it normally */
>>>>
>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>
>>>>          -          amdgpu_ih_clear_fault(adev, key);
>>>>
>>>>                     return true;
>>>>
>>>>          +  } else {
>>>>
>>>>          +          r = amdgpu_vm_add_fault(vm->fault_hash, key);
>>>>
>>>>          +
>>>>
>>>>          +          /* Hash table is full or the fault is already 
>>>> being processed,
>>>>
>>>>          +           * ignore further page faults
>>>>
>>>>          +           */
>>>>
>>>>          +          if (r != 0) {
>>>>
>>>>          + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>
>>>>          +                  goto ignore_iv;
>>>>
>>>>          +          }
>>>>
>>>>             }
>>>>
>>>>             /* No locking required with single writer and single 
>>>> reader */
>>>>
>>>>             r = kfifo_put(&vm->faults, key);
>>>>
>>>>             if (!r) {
>>>>
>>>>                     /* FIFO is full. Ignore it until there is space */
>>>>
>>>>          +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>>>>
>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>
>>>>          -          amdgpu_ih_clear_fault(adev, key);
>>>>
>>>>                     goto ignore_iv;
>>>>
>>>>             }
>>>>
>>>>          - spin_unlock(&adev->vm_manager.pasid_lock);
>>>>
>>>>
>>>>          + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>
>>>>             /* It's the first fault for this address, process it 
>>>> normally */
>>>>
>>>>             return true;
>>>>
>>>>
>>>>          @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void 
>>>> *handle)
>>>>
>>>>             adev->irq.ih.use_doorbell = true;
>>>>
>>>>             adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;
>>>>
>>>>
>>>>          -  adev->irq.ih.faults = 
>>>> kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>>
>>>>          -  if (!adev->irq.ih.faults)
>>>>
>>>>          -          return -ENOMEM;
>>>>
>>>>          - INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>>
>>>>          -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>
>>>>          - spin_lock_init(&adev->irq.ih.faults->lock);
>>>>
>>>>          -  adev->irq.ih.faults->count = 0;
>>>>
>>>>          -
>>>>
>>>>             r = amdgpu_irq_init(adev);
>>>>
>>>>
>>>>             return r;
>>>>
>>>>          @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void 
>>>> *handle)
>>>>
>>>>             amdgpu_irq_fini(adev);
>>>>
>>>>             amdgpu_ih_ring_fini(adev);
>>>>
>>>>
>>>>          -  kfree(adev->irq.ih.faults);
>>>>
>>>>          -  adev->irq.ih.faults = NULL;
>>>>
>>>>          -
>>>>
>>>>             return 0;
>>>>
>>>>           }
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>      _______________________________________________
>>>>
>>>>      amd-gfx mailing list
>>>>
>>>>      amd-gfx@lists.freedesktop.org 
>>>> <mailto:amd-gfx@lists.freedesktop.org>
>>>>
>>>>      https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>
>>>>
>>>
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                             ` <143b57c2-12d1-8cb1-ddea-a8fe7226e1c4-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2018-09-10 19:10                               ` Felix Kuehling
       [not found]                                 ` <8cee8563-2df9-1075-3113-66135cbd14fd-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2018-09-10 19:10 UTC (permalink / raw)
  To: christian.koenig-5C7GfCeVMHo, Zeng, Oak, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

I'm not sure why you need to distinguish current and future state when
dealing with page faults. When you get a page fault, you know that the
GPU is trying to access memory right now, in the present. So you're
always working with the current state. When the CPU page table changes,
you get an MMU notifier that allows you to invalidate the corresponding
GPU PTEs. If you have a valid GPU PTE, it always represents the current
state and is in sync with the CPU page table. If the GPU page table is
ever outdated, it should have an invalid entry in that place.

If SDMA is used to update the GPU PTEs, there is a delay. The MMU
notifier is synchronous, so it shouldn't be a problem. You just wait for
the SDMA job to complete before returning. When updating PTEs with new
valid addresses, it's asynchronous. But the GPU will continue retrying
on the invalid entry until SDMA finishes. So it's also implicitly
synchronized on the GPU side.

Regards,
  Felix


On 2018-09-10 05:42 AM, Christian König wrote:
> Hi Felix & Oak,
>
> over the weekend I had the idea that we could just use the shadow BOs
> to have the current state in a page fault. They are GTT BOs and CPU
> accessible anyway.
>
> Regards,
> Christian.
>
> Am 08.09.2018 um 09:34 schrieb Christian König:
>> Hi Felix,
>>
>>> But why do you want to update page tables when there is no more user
>>> mode context that cares about them? Is this just to allow pending work
>>> to complete after the user mode context is gone? To prevent hangs?
>>
>> The problem I see is that we still don't have a reliable way of
>> killing a command submission while making sure that no more faults of
>> it are in the flight.
>>
>> E.g. the HWS has a function to kill a running job on the hardware,
>> but that only works for compute and there is no way of telling when
>> that operation has finished.
>>
>> But Oak has already convinced me that we should probably work on that
>> instead of trying to keep the VM alive as long as possible.
>>
>> Regards,
>> Christian.
>>
>> Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
>>> Hi Christian,
>>>
>>> I'm not sure I get your point here.
>>>
>>> As I understand it, the amdgpu_vm structure represents the user mode VM
>>> context. It has the pointers to the root page directory and the rest of
>>> the page table hierarchy. If there is no amdgpu_vm, there is no user
>>> mode context that cares about the page tables.
>>>
>>> If we leave the page table pointers in the amdgpu_vm, then handling
>>> faults after the VM is destroyed is pointless. We can't find the page
>>> tables and we can't update them, so there is nothing we can do in
>>> response to VM faults.
>>>
>>> So I'm guessing that you want to move the page table pointers into a
>>> different structure that exists half-independently of the VM. It would
>>> be created when the VM is created (that's where we currently allocate
>>> the PASID) but can be destroyed slightly later.
>>>
>>> But why do you want to update page tables when there is no more user
>>> mode context that cares about them? Is this just to allow pending work
>>> to complete after the user mode context is gone? To prevent hangs?
>>>
>>> Regards,
>>>    Felix
>>>
>>> On 2018-09-10 07:20 AM, Christian König wrote:
>>>>> Am I right that the handling of page fault need a valid VM?
>>>> No, exactly that view is incorrect.
>>>>
>>>> The VM is meant to be a memory management structure of page tables and
>>>> is completely pointless for fault processing because it represents the
>>>> future state of the page tables and not the current one.
>>>>
>>>> What we need for fault processing is a new structure built around
>>>> PASIDs which is fed with addresses when page tables are moved
>>>> around.
>>>>
>>>> Alternatively I hope that we can use the SDMA to walk the page tables
>>>> and update the required entries by just using the address.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>>>> Hi Christian,
>>>>>
>>>>>
>>>>> What is the value of delaying the destroy of hash to after vm is
>>>>> destroyed? Since vm is destroyed, you basically don’t have enough
>>>>> information to page in the correct page to gpuvm. Am I right that
>>>>> the handling of page fault need a valid VM? For example, you need the
>>>>> VM to get the corresponding physical address of the faulty page.
>>>>>
>>>>>
>>>>> After vm is destroyed, the retry interrupt can be directly discard as
>>>>> you can’t find vm through pasid. You can think this is also one kind
>>>>> of prescreen.
>>>>>
>>>>>
>>>>> So I am stand for the point that, there is no need to delay the
>>>>> destroy of hash to after vm is destroyed. Prescreening hash can be
>>>>> destroyed at the time of vm_fini.
>>>>>
>>>>>
>>>>> Thanks,
>>>>>
>>>>> Oak
>>>>>
>>>>>
>>>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com>;
>>>>> amd-gfx@lists.freedesktop.org
>>>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>>>>> amdgpu vm
>>>>>
>>>>>
>>>>> Hi Oak,
>>>>>
>>>>> yeah, but this is still a NAK. Making the hash per PASID was not a
>>>>> suggestion but rather a must have.
>>>>>
>>>>> The VM structures must be destroyed while the processing is still
>>>>> ongoing, or otherwise we would not have a clean OOM handling.
>>>>>
>>>>> The IDR for PASID lookup can be put into amdgpu_ids.c, you can just
>>>>> replace the IDA already used there with an IDR.
>>>>>
>>>>> Since the PASID is already freed up delayed we should have the grace
>>>>> period for interrupt processing included. If that still isn't
>>>>> sufficient we can still add some delayed work for this.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>>>
>>>>>      Hi Christian,
>>>>>
>>>>>      In this implementation, fault hash is made per vm, not per pasid
>>>>>      as suggested, based on below considerations:
>>>>>
>>>>>        * Delay the destroy of hash introduces more effort like how to
>>>>>          set the proper grace period after which no retry interrupt
>>>>>          will be generated. We want to avoid those complication if we
>>>>>          can.
>>>>>        * The  problem of the per vm implementation is that it is hard
>>>>>          to delay the destroy of fault hash, because when vm is
>>>>>          destroyed, prescreen function won't be able to retrieve the
>>>>>          fault hash. But in this case, the prescreen function can
>>>>>          either let the interrupt go through (to gmc) or ignore it.
>>>>>          From this perspective, it is not very necessary to delay the
>>>>>          destroy of hash table.
>>>>>        * On the other hand, since ASICs after vega10 have the HW CAM
>>>>>          feature. So the SW IV prescreen is only useful for vega10.
>>>>>          For all the HW implemented CAM, we can consider the delayed
>>>>>          CAM acknowledgment.
>>>>>        * Making it per pasid need to introduce another idr to
>>>>>          correlate pasid and hash table - where to put the idr? You
>>>>>          will have to make it a global variable which is not very
>>>>>          desirable.
>>>>>
>>>>>      The main purpose of this patch is to solve the inter-process
>>>>> lock
>>>>>      issue (when hash table is full), while still keep codes simple.
>>>>>
>>>>>      Also with this patch, the faults kfifo is not necessary any
>>>>> more.
>>>>>      See patch 2.
>>>>>
>>>>>      Regards,
>>>>>
>>>>>      Oak
>>>>>
>>>>>
>>>>>      On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>>>
>>>>>          In stead of share one fault hash table per device, make it
>>>>>
>>>>>          per vm. This can avoid inter-process lock issue when fault
>>>>>
>>>>>          hash table is full.
>>>>>
>>>>>
>>>>>          Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>>>
>>>>>          Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>>>>> <mailto:Oak.Zeng@amd.com>
>>>>>
>>>>>          Suggested-by: Christian Konig <Christian.Koenig@amd.com>
>>>>> <mailto:Christian.Koenig@amd.com>
>>>>>
>>>>>          Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>> <mailto:Felix.Kuehling@amd.com>
>>>>>
>>>>>          ---
>>>>>
>>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75
>>>>> ------------------------
>>>>>
>>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>>>
>>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102
>>>>> ++++++++++++++++++++++++++++++++-
>>>>>
>>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>>>
>>>>>           drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------
>>>>>
>>>>>           5 files changed, 127 insertions(+), 110 deletions(-)
>>>>>
>>>>>
>>>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>
>>>>>          index 06373d4..4ed8621 100644
>>>>>
>>>>>          --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>
>>>>>          +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>
>>>>>          @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct
>>>>> amdgpu_device *adev)
>>>>>
>>>>>             return IRQ_HANDLED;
>>>>>
>>>>>           }
>>>>>
>>>>>
>>>>>          -/**
>>>>>
>>>>>          - * amdgpu_ih_add_fault - Add a page fault record
>>>>>
>>>>>          - *
>>>>>
>>>>>          - * @adev: amdgpu device pointer
>>>>>
>>>>>          - * @key: 64-bit encoding of PASID and address
>>>>>
>>>>>          - *
>>>>>
>>>>>          - * This should be called when a retry page fault
>>>>> interrupt is
>>>>>
>>>>>          - * received. If this is a new page fault, it will be
>>>>> added to a hash
>>>>>
>>>>>          - * table. The return value indicates whether this is a
>>>>> new fault, or
>>>>>
>>>>>          - * a fault that was already known and is already being
>>>>> handled.
>>>>>
>>>>>          - *
>>>>>
>>>>>          - * If there are too many pending page faults, this will
>>>>> fail. Retry
>>>>>
>>>>>          - * interrupts should be ignored in this case until there
>>>>> is enough
>>>>>
>>>>>          - * free space.
>>>>>
>>>>>          - *
>>>>>
>>>>>          - * Returns 0 if the fault was added, 1 if the fault was
>>>>> already known,
>>>>>
>>>>>          - * -ENOSPC if there are too many pending faults.
>>>>>
>>>>>          - */
>>>>>
>>>>>          -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64
>>>>> key)
>>>>>
>>>>>          -{
>>>>>
>>>>>          -  unsigned long flags;
>>>>>
>>>>>          -  int r = -ENOSPC;
>>>>>
>>>>>          -
>>>>>
>>>>>          -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>>>
>>>>>          -          /* Should be allocated in <IP>_ih_sw_init on
>>>>> GPUs that
>>>>>
>>>>>          -           * support retry faults and require retry
>>>>> filtering.
>>>>>
>>>>>          -           */
>>>>>
>>>>>          -          return r;
>>>>>
>>>>>          -
>>>>>
>>>>>          - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>>
>>>>>          -
>>>>>
>>>>>          -  /* Only let the hash table fill up to 50% for best
>>>>> performance */
>>>>>
>>>>>          -  if (adev->irq.ih.faults->count >= (1 <<
>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>
>>>>>          -          goto unlock_out;
>>>>>
>>>>>          -
>>>>>
>>>>>          -  r = chash_table_copy_in(&adev->irq.ih.faults->hash,
>>>>> key, NULL);
>>>>>
>>>>>          -  if (!r)
>>>>>
>>>>>          -          adev->irq.ih.faults->count++;
>>>>>
>>>>>          -
>>>>>
>>>>>          -  /* chash_table_copy_in should never fail unless we're
>>>>> losing count */
>>>>>
>>>>>          -  WARN_ON_ONCE(r < 0);
>>>>>
>>>>>          -
>>>>>
>>>>>          -unlock_out:
>>>>>
>>>>>          - spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>>>>>
>>>>>          -  return r;
>>>>>
>>>>>          -}
>>>>>
>>>>>          -
>>>>>
>>>>>          -/**
>>>>>
>>>>>          - * amdgpu_ih_clear_fault - Remove a page fault record
>>>>>
>>>>>          - *
>>>>>
>>>>>          - * @adev: amdgpu device pointer
>>>>>
>>>>>          - * @key: 64-bit encoding of PASID and address
>>>>>
>>>>>          - *
>>>>>
>>>>>          - * This should be called when a page fault has been
>>>>> handled. Any
>>>>>
>>>>>          - * future interrupt with this key will be processed as a
>>>>> new
>>>>>
>>>>>          - * page fault.
>>>>>
>>>>>          - */
>>>>>
>>>>>          -void amdgpu_ih_clear_fault(struct amdgpu_device *adev,
>>>>> u64 key)
>>>>>
>>>>>          -{
>>>>>
>>>>>          -  unsigned long flags;
>>>>>
>>>>>          -  int r;
>>>>>
>>>>>          -
>>>>>
>>>>>          -  if (!adev->irq.ih.faults)
>>>>>
>>>>>          -          return;
>>>>>
>>>>>          -
>>>>>
>>>>>          - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>>
>>>>>          -
>>>>>
>>>>>          -  r = chash_table_remove(&adev->irq.ih.faults->hash,
>>>>> key, NULL);
>>>>>
>>>>>          -  if (!WARN_ON_ONCE(r < 0)) {
>>>>>
>>>>>          -          adev->irq.ih.faults->count--;
>>>>>
>>>>>          - WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>>>
>>>>>          -  }
>>>>>
>>>>>          -
>>>>>
>>>>>          - spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>>>>>
>>>>>          -}
>>>>>
>>>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>
>>>>>          index a23e1c0..f411ffb 100644
>>>>>
>>>>>          --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>
>>>>>          +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>
>>>>>          @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>>>
>>>>>           #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>>>
>>>>>           #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>>>
>>>>>
>>>>>          -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>
>>>>>          -struct amdgpu_retryfault_hashtable {
>>>>>
>>>>>          -  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS,
>>>>> 8, 0);
>>>>>
>>>>>          -  spinlock_t     lock;
>>>>>
>>>>>          -  int            count;
>>>>>
>>>>>          -};
>>>>>
>>>>>          -
>>>>>
>>>>>           /*
>>>>>
>>>>>            * R6xx+ IH ring
>>>>>
>>>>>            */
>>>>>
>>>>>          @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>>>
>>>>>             bool                   use_doorbell;
>>>>>
>>>>>             bool                   use_bus_addr;
>>>>>
>>>>>             dma_addr_t             rb_dma_addr; /* only used when
>>>>> use_bus_addr = true */
>>>>>
>>>>>          -  struct amdgpu_retryfault_hashtable *faults;
>>>>>
>>>>>           };
>>>>>
>>>>>
>>>>>           #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>>>
>>>>>          @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct
>>>>> amdgpu_device *adev, unsigned ring_size,
>>>>>
>>>>>                             bool use_bus_addr);
>>>>>
>>>>>           void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>>>>
>>>>>           int amdgpu_ih_process(struct amdgpu_device *adev);
>>>>>
>>>>>          -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64
>>>>> key);
>>>>>
>>>>>          -void amdgpu_ih_clear_fault(struct amdgpu_device *adev,
>>>>> u64 key);
>>>>>
>>>>>
>>>>>           #endif
>>>>>
>>>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>
>>>>>          index 1d7e3c1..8b220e9 100644
>>>>>
>>>>>          --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>
>>>>>          +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>
>>>>>          @@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct
>>>>> amdgpu_device *adev, uint32_t min_vm_size,
>>>>>
>>>>>                      adev->vm_manager.fragment_size);
>>>>>
>>>>>           }
>>>>>
>>>>>
>>>>>          +static struct amdgpu_retryfault_hashtable
>>>>> *init_fault_hash(void)
>>>>>
>>>>>          +{
>>>>>
>>>>>          +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>
>>>>>          +
>>>>>
>>>>>          +  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
>>>>>
>>>>>          +  if (!fault_hash)
>>>>>
>>>>>          +          return fault_hash;
>>>>>
>>>>>          +
>>>>>
>>>>>          +  INIT_CHASH_TABLE(fault_hash->hash,
>>>>>
>>>>>          +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>
>>>>>          +  spin_lock_init(&fault_hash->lock);
>>>>>
>>>>>          +  fault_hash->count = 0;
>>>>>
>>>>>          +
>>>>>
>>>>>          +  return fault_hash;
>>>>>
>>>>>          +}
>>>>>
>>>>>          +
>>>>>
>>>>>           /**
>>>>>
>>>>>            * amdgpu_vm_init - initialize a vm instance
>>>>>
>>>>>            *
>>>>>
>>>>>          @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct
>>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>>
>>>>>                     vm->pasid = pasid;
>>>>>
>>>>>             }
>>>>>
>>>>>
>>>>>          +  vm->fault_hash = init_fault_hash();
>>>>>
>>>>>          +  if (!vm->fault_hash) {
>>>>>
>>>>>          +          r = -ENOMEM;
>>>>>
>>>>>          +          goto error_free_root;
>>>>>
>>>>>          +  }
>>>>>
>>>>>          +
>>>>>
>>>>>             INIT_KFIFO(vm->faults);
>>>>>
>>>>>             vm->fault_credit = 16;
>>>>>
>>>>>
>>>>>          @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct
>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>
>>>>>
>>>>>             /* Clear pending page faults from IH when the VM is
>>>>> destroyed */
>>>>>
>>>>>             while (kfifo_get(&vm->faults, &fault))
>>>>>
>>>>>          -          amdgpu_ih_clear_fault(adev, fault);
>>>>>
>>>>>          +          amdgpu_vm_clear_fault(vm->fault_hash, fault);
>>>>>
>>>>>
>>>>>             if (vm->pasid) {
>>>>>
>>>>>                     unsigned long flags;
>>>>>
>>>>>          @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct
>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>
>>>>> spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>>>
>>>>>             }
>>>>>
>>>>>
>>>>>          +  kfree(vm->fault_hash);
>>>>>
>>>>>          +  vm->fault_hash = NULL;
>>>>>
>>>>>          +
>>>>>
>>>>>             drm_sched_entity_destroy(&vm->entity);
>>>>>
>>>>>
>>>>>             if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>>>
>>>>>          @@ -3183,3 +3208,78 @@ void
>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>>>
>>>>>                     }
>>>>>
>>>>>             }
>>>>>
>>>>>           }
>>>>>
>>>>>          +
>>>>>
>>>>>          +/**
>>>>>
>>>>>          + * amdgpu_vm_add_fault - Add a page fault record to
>>>>> fault hash table
>>>>>
>>>>>          + *
>>>>>
>>>>>          + * @fault_hash: fault hash table
>>>>>
>>>>>          + * @key: 64-bit encoding of PASID and address
>>>>>
>>>>>          + *
>>>>>
>>>>>          + * This should be called when a retry page fault
>>>>> interrupt is
>>>>>
>>>>>          + * received. If this is a new page fault, it will be
>>>>> added to a hash
>>>>>
>>>>>          + * table. The return value indicates whether this is a
>>>>> new fault, or
>>>>>
>>>>>          + * a fault that was already known and is already being
>>>>> handled.
>>>>>
>>>>>          + *
>>>>>
>>>>>          + * If there are too many pending page faults, this will
>>>>> fail. Retry
>>>>>
>>>>>          + * interrupts should be ignored in this case until there
>>>>> is enough
>>>>>
>>>>>          + * free space.
>>>>>
>>>>>          + *
>>>>>
>>>>>          + * Returns 0 if the fault was added, 1 if the fault was
>>>>> already known,
>>>>>
>>>>>          + * -ENOSPC if there are too many pending faults.
>>>>>
>>>>>          + */
>>>>>
>>>>>          +int amdgpu_vm_add_fault(struct
>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>
>>>>>          +{
>>>>>
>>>>>          +  unsigned long flags;
>>>>>
>>>>>          +  int r = -ENOSPC;
>>>>>
>>>>>          +
>>>>>
>>>>>          +  if (WARN_ON_ONCE(!fault_hash))
>>>>>
>>>>>          +          /* Should be allocated in amdgpu_vm_init
>>>>>
>>>>>          +           */
>>>>>
>>>>>          +          return r;
>>>>>
>>>>>          +
>>>>>
>>>>>          +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>
>>>>>          +
>>>>>
>>>>>          +  /* Only let the hash table fill up to 50% for best
>>>>> performance */
>>>>>
>>>>>          +  if (fault_hash->count >= (1 <<
>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>
>>>>>          +          goto unlock_out;
>>>>>
>>>>>          +
>>>>>
>>>>>          +  r = chash_table_copy_in(&fault_hash->hash, key, NULL);
>>>>>
>>>>>          +  if (!r)
>>>>>
>>>>>          +          fault_hash->count++;
>>>>>
>>>>>          +
>>>>>
>>>>>          +  /* chash_table_copy_in should never fail unless we're
>>>>> losing count */
>>>>>
>>>>>          +  WARN_ON_ONCE(r < 0);
>>>>>
>>>>>          +
>>>>>
>>>>>          +unlock_out:
>>>>>
>>>>>          +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>
>>>>>          +  return r;
>>>>>
>>>>>          +}
>>>>>
>>>>>          +
>>>>>
>>>>>          +/**
>>>>>
>>>>>          + * amdgpu_vm_clear_fault - Remove a page fault record
>>>>>
>>>>>          + *
>>>>>
>>>>>          + * @fault_hash: fault hash table
>>>>>
>>>>>          + * @key: 64-bit encoding of PASID and address
>>>>>
>>>>>          + *
>>>>>
>>>>>          + * This should be called when a page fault has been
>>>>> handled. Any
>>>>>
>>>>>          + * future interrupt with this key will be processed as a
>>>>> new
>>>>>
>>>>>          + * page fault.
>>>>>
>>>>>          + */
>>>>>
>>>>>          +void amdgpu_vm_clear_fault(struct
>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>
>>>>>          +{
>>>>>
>>>>>          +  unsigned long flags;
>>>>>
>>>>>          +  int r;
>>>>>
>>>>>          +
>>>>>
>>>>>          +  if (!fault_hash)
>>>>>
>>>>>          +          return;
>>>>>
>>>>>          +
>>>>>
>>>>>          +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>
>>>>>          +
>>>>>
>>>>>          +  r = chash_table_remove(&fault_hash->hash, key, NULL);
>>>>>
>>>>>          +  if (!WARN_ON_ONCE(r < 0)) {
>>>>>
>>>>>          +          fault_hash->count--;
>>>>>
>>>>>          +          WARN_ON_ONCE(fault_hash->count < 0);
>>>>>
>>>>>          +  }
>>>>>
>>>>>          +
>>>>>
>>>>>          +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>
>>>>>          +}
>>>>>
>>>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>
>>>>>          index e275ee7..6eb1da1 100644
>>>>>
>>>>>          --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>
>>>>>          +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>
>>>>>          @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>>>
>>>>>             pid_t   tgid;
>>>>>
>>>>>           };
>>>>>
>>>>>
>>>>>          +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>
>>>>>          +struct amdgpu_retryfault_hashtable {
>>>>>
>>>>>          +  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS,
>>>>> 8, 0);
>>>>>
>>>>>          +  spinlock_t     lock;
>>>>>
>>>>>          +  int            count;
>>>>>
>>>>>          +};
>>>>>
>>>>>          +
>>>>>
>>>>>           struct amdgpu_vm {
>>>>>
>>>>>             /* tree of virtual addresses mapped */
>>>>>
>>>>>             struct rb_root_cached  va;
>>>>>
>>>>>          @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>>>
>>>>>             struct ttm_lru_bulk_move lru_bulk_move;
>>>>>
>>>>>             /* mark whether can do the bulk move */
>>>>>
>>>>>             bool                   bulk_moveable;
>>>>>
>>>>>          +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>
>>>>>           };
>>>>>
>>>>>
>>>>>           struct amdgpu_vm_manager {
>>>>>
>>>>>          @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct
>>>>> amdgpu_vm *vm);
>>>>>
>>>>>           void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>>>>>
>>>>>                                    struct amdgpu_vm *vm);
>>>>>
>>>>>
>>>>>          +int amdgpu_vm_add_fault(struct
>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>
>>>>>          +
>>>>>
>>>>>          +void amdgpu_vm_clear_fault(struct
>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>
>>>>>          +
>>>>>
>>>>>           #endif
>>>>>
>>>>>          diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>
>>>>>          index 5ae5ed2..acbe5a7 100644
>>>>>
>>>>>          --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>
>>>>>          +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>
>>>>>          @@ -265,35 +265,36 @@ static bool
>>>>> vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>>>
>>>>>                     return true;
>>>>>
>>>>>             }
>>>>>
>>>>>
>>>>>          -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>
>>>>>          -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>
>>>>>          -  r = amdgpu_ih_add_fault(adev, key);
>>>>>
>>>>>          -
>>>>>
>>>>>          -  /* Hash table is full or the fault is already being
>>>>> processed,
>>>>>
>>>>>          -   * ignore further page faults
>>>>>
>>>>>          -   */
>>>>>
>>>>>          -  if (r != 0)
>>>>>
>>>>>          -          goto ignore_iv;
>>>>>
>>>>>          -
>>>>>
>>>>>             /* Track retry faults in per-VM fault FIFO. */
>>>>>
>>>>>             spin_lock(&adev->vm_manager.pasid_lock);
>>>>>
>>>>>             vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>>>
>>>>>          +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>
>>>>>          +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>
>>>>>             if (!vm) {
>>>>>
>>>>>                     /* VM not found, process it normally */
>>>>>
>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>
>>>>>          -          amdgpu_ih_clear_fault(adev, key);
>>>>>
>>>>>                     return true;
>>>>>
>>>>>          +  } else {
>>>>>
>>>>>          +          r = amdgpu_vm_add_fault(vm->fault_hash, key);
>>>>>
>>>>>          +
>>>>>
>>>>>          +          /* Hash table is full or the fault is already
>>>>> being processed,
>>>>>
>>>>>          +           * ignore further page faults
>>>>>
>>>>>          +           */
>>>>>
>>>>>          +          if (r != 0) {
>>>>>
>>>>>          + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>
>>>>>          +                  goto ignore_iv;
>>>>>
>>>>>          +          }
>>>>>
>>>>>             }
>>>>>
>>>>>             /* No locking required with single writer and single
>>>>> reader */
>>>>>
>>>>>             r = kfifo_put(&vm->faults, key);
>>>>>
>>>>>             if (!r) {
>>>>>
>>>>>                     /* FIFO is full. Ignore it until there is
>>>>> space */
>>>>>
>>>>>          +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>>>>>
>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>
>>>>>          -          amdgpu_ih_clear_fault(adev, key);
>>>>>
>>>>>                     goto ignore_iv;
>>>>>
>>>>>             }
>>>>>
>>>>>          - spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>
>>>>>
>>>>>          + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>
>>>>>             /* It's the first fault for this address, process it
>>>>> normally */
>>>>>
>>>>>             return true;
>>>>>
>>>>>
>>>>>          @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void
>>>>> *handle)
>>>>>
>>>>>             adev->irq.ih.use_doorbell = true;
>>>>>
>>>>>             adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;
>>>>>
>>>>>
>>>>>          -  adev->irq.ih.faults =
>>>>> kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>>>
>>>>>          -  if (!adev->irq.ih.faults)
>>>>>
>>>>>          -          return -ENOMEM;
>>>>>
>>>>>          - INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>>>
>>>>>          -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>
>>>>>          - spin_lock_init(&adev->irq.ih.faults->lock);
>>>>>
>>>>>          -  adev->irq.ih.faults->count = 0;
>>>>>
>>>>>          -
>>>>>
>>>>>             r = amdgpu_irq_init(adev);
>>>>>
>>>>>
>>>>>             return r;
>>>>>
>>>>>          @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void
>>>>> *handle)
>>>>>
>>>>>             amdgpu_irq_fini(adev);
>>>>>
>>>>>             amdgpu_ih_ring_fini(adev);
>>>>>
>>>>>
>>>>>          -  kfree(adev->irq.ih.faults);
>>>>>
>>>>>          -  adev->irq.ih.faults = NULL;
>>>>>
>>>>>          -
>>>>>
>>>>>             return 0;
>>>>>
>>>>>           }
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>      _______________________________________________
>>>>>
>>>>>      amd-gfx mailing list
>>>>>
>>>>>      amd-gfx@lists.freedesktop.org
>>>>> <mailto:amd-gfx@lists.freedesktop.org>
>>>>>
>>>>>      https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>
>>>>>
>>>>
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                                 ` <bf233da4-5cd9-d7e4-1e9a-fbef3613bfd6-5C7GfCeVMHo@public.gmane.org>
@ 2018-09-10 21:09                                   ` Zeng, Oak
  0 siblings, 0 replies; 27+ messages in thread
From: Zeng, Oak @ 2018-09-10 21:09 UTC (permalink / raw)
  To: Koenig, Christian, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip


[-- Attachment #1.1: Type: text/plain, Size: 24744 bytes --]

Hi Christian,

Ok, then can I get a reviewed-by for this change?

I will drop patch 2 for now, as Philip said offline that he will still need to use that kfifo. I will follow up with Philip anyway.

I will re-work on the translation retry patches, after this one.

Probably also follow up the idea of calling kfd interrupt handling from IP block interrupt handler.

Also see one more comments inline.

Thanks,
Oak

From: Koenig, Christian
Sent: Saturday, September 08, 2018 3:16 AM
To: Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org; Yang, Philip <Philip.Yang@amd.com>
Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm

Hi Oak,


>In other words when you request the address of a PT you get the address where memory management has moved it, not the address where the hardware is processing the fault from.

Do you mean the address in the PT is what the vm has mapped, and the faulty address needs to be mapped in the PT?
No, the memory management is pipelined to avoid stalls during eviction.

This means that the state we manage is always the future state, e.g. when you walk the page tables in the VM structure you see the state after all the running operations completed.

In other words when a page table is pipelined for eviction you see the evicted state and not the current executing one.

But what we need for page fault handling is the current state and not the future state.

[Oak]: We might think about the page fault handling differently. In my mind, there is not much need to differentiate between the current and future page table. When pages are moved, the corresponding page table entries should be marked as invalid. During the page fault handling, the driver should allocate new physical pages (and swap in the page contents — only necessary for evicted pages) and update the page table accordingly. In this process, there is only one page table state, the one that reflects the current mapping.



The amdgpu_vm and amdgpu_vm_pt data structure are invented for the purpose of management of page table BO. Why we need invent extra structure?
We will need something like a double entry accounting for the page tables where we both keep the future as well as the current state.


Think about the normal case of the page fault handling when the process and vm are still alive. In the normal case, the amdgpu_vm structure can meet exactly purpose of page table update.
No, the problem is not if the VM is still alive or dead but rather that in the case of an eviction you wouldn't be able to handle page faults any more.


The said extra data structure must look very similar to the amdgpu_vm/vm_pt structure. So the best way is to reuse the existing vm structure, either in normal case or in process termination case.
Completely agree that we will probably need the same structure as the VM hierarchy offers, just with another set of information.


Is it possible to delay the destroy of vm during process termination?
Yeah, thought about that as well. The problem with that is the OOM killer and that we are still not able to reliably kill running command submissions.

Ok you convinced me that we should probably work on that instead.

E.g. when a process has bound its address space to a VM and exits we need to find a way to kill all still executing jobs. We should let the UMD handle that case gracefully when they really need this.

Thanks,
Christian.

Am 07.09.2018 um 23:52 schrieb Zeng, Oak:
Hi Christian,

See comments inline

Thanks,
Oak

From: Koenig, Christian
Sent: Monday, September 10, 2018 7:44 AM
To: Zeng, Oak <Oak.Zeng@amd.com><mailto:Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com><mailto:zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; Yang, Philip <Philip.Yang@amd.com><mailto:Philip.Yang@amd.com>
Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm

>The VM doesn't know that information either.

Yes, the VM doesn’t know the information. More specifically, I meant we need the VM data structure to update PTs during page fault handling. For example, we walk the 4-level page table (the first level is pointed to by vm->root) to figure out the entries that need to be filled, allocate page table entry BOs if necessary, allocate a physical page for the faulty address, and set up the 4-level page table so the faulty virtual address points to the allocated physical page.

>In other words when you request the address of a PT you get the address where memory management has moved it, not the address where the hardware is processing the fault from.

Do you mean the address in the PT is what vm has mapped and the faulty address need to be mapped in the PT?


I see only a few possible solutions for that:
>1. We use an extra structure in the kernel driver which holds the current address of the PTs and is fed by memory management when buffers move around.
The amdgpu_vm and amdgpu_vm_pt data structure are invented for the purpose of management of page table BO. Why we need invent extra structure? Think about the normal case of the page fault handling when the process and vm are still alive. In the normal case, the amdgpu_vm structure can meet exactly purpose of page table update.

The said extra data structure must look very similar to the amdgpu_vm/vm_pt structure. So the best way is to reuse the existing vm structure, either in normal case or in process termination case. Is it possible to delay the destroy of vm during process termination?

>2. We change the SDMA firmware to do the walk for us and update the PTEs based on the information in the page directory.
As the page table BOs need to be freed later, you still need to manage them in the driver. The SDMA FW can walk and update, but it needs information from the driver side. The driver still needs a struct similar to amdgpu_vm.

>3. We use the UTCL walker to get us to the correct address which is then updated with the SDMA.
I don’t quite understand your point here. What do you mean by “correct address”? Do you mean the physical address to be filled to the page table? If this is your meaning, how UTCL2 knows this information – the physical address has to be allocated/validated through driver.


Regards,
Christian.

Am 07.09.2018 um 17:30 schrieb Zeng, Oak:
Without a VM, how can you get the physical address of the faulty virtual address? Where should this information be held?

Thanks,
Oak

From: Koenig, Christian
Sent: Monday, September 10, 2018 7:20 AM
To: Zeng, Oak <Oak.Zeng@amd.com><mailto:Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com><mailto:zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; Yang, Philip <Philip.Yang@amd.com><mailto:Philip.Yang@amd.com>
Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm

Am I right that the handling of page fault need a valid VM?
No, exactly that view is incorrect.

The VM is meant to be a memory management structure of page tables and is completely pointless for fault processing, because it represents the future state of the page tables and not the current one.

What we need for fault processing is a new structure built around PASIDs, which is fed with addresses when page tables are moved around.

Alternatively I hope that we can use the SDMA to walk the page tables and update the required entries by just using the address.

Regards,
Christian.

Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
Hi Christian,

What is the value of delay the destroy of hash to after vm is destroyed? Since vm is destroyed, you basically don’t have enough information to paging in the correct page to gpuvm. Am I right that the handling of page fault need a valid VM? For example, you need the VM to get the corresponding physical address of the faulty page.

After the vm is destroyed, the retry interrupt can be directly discarded, as you can’t find the vm through the pasid. You can think of this as also being one kind of prescreening.

So I stand by the point that there is no need to delay the destruction of the hash until after the vm is destroyed. The prescreening hash can be destroyed at the time of vm_fini.

Thanks,
Oak

From: Christian König <ckoenig.leichtzumerken@gmail.com><mailto:ckoenig.leichtzumerken@gmail.com>
Sent: Friday, September 07, 2018 4:39 AM
To: Zeng, Oak <Oak.Zeng@amd.com><mailto:Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com><mailto:zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Zeng, Oak <Oak.Zeng@amd.com><mailto:Oak.Zeng@amd.com>
Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm

Hi Oak,

yeah, but this is still a NAK. Making the hash per PASID was not a suggestion but rather a must have.

The VM structures must be destroyed while the processing is still ongoing, or otherwise we would not have a clean OOM handling.

The IDR for PASID lookup can be put into amdgpu_ids.c; you can just replace the IDA already used there with an IDR.

Since the PASID is already freed up delayed we should have the grace period for interrupt processing included. If that still isn't sufficient we can still add some delayed work for this.

Regards,
Christian.

Am 07.09.2018 um 06:16 schrieb ozeng:

Hi Christian,

In this implementation, fault hash is made per vm, not per pasid as suggested, based on below considerations:

  *   Delay the destroy of hash introduces more effort like how to set the proper grace period after which no retry interrupt will be generated. We want to avoid those complication if we can.
  *   The  problem of the per vm implementation is that it is hard to delay the destroy of fault hash, because when vm is destroyed, prescreen function won't be able to retrieve the fault hash. But in this case, the prescreen function can either let the interrupt go through (to gmc) or ignore it. From this perspective, it is not very necessary to delay the destroy of hash table.
  *   On the other hand, since ASICs after vega10 have the HW CAM feature. So the SW IV prescreen is only useful for vega10. For all the HW implemented CAM, we can consider the delayed CAM acknowledgment.
  *   Making it per pasid need to introduce another idr to correlate pasid and hash table - where to put the idr? You will have to make it a global variable which is not very desirable.

The main purpose of this patch is to solve the inter-process lock issue (when hash table is full), while still keep codes simple.

Also with this patch, the faults kfifo is not necessary any more. See patch 2.

Regards,

Oak

On 09/06/2018 11:28 PM, Oak Zeng wrote:

In stead of share one fault hash table per device, make it

per vm. This can avoid inter-process lock issue when fault

hash table is full.



Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708

Signed-off-by: Oak Zeng <Oak.Zeng@amd.com><mailto:Oak.Zeng@amd.com>

Suggested-by: Christian Konig <Christian.Koenig@amd.com><mailto:Christian.Koenig@amd.com>

Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com><mailto:Felix.Kuehling@amd.com>

---

 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 ------------------------

 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----

 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 ++++++++++++++++++++++++++++++++-

 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++

 drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------

 5 files changed, 127 insertions(+), 110 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c

index 06373d4..4ed8621 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c

@@ -197,78 +197,3 @@ int amdgpu_ih_process(struct amdgpu_device *adev)

   return IRQ_HANDLED;

 }



-/**

- * amdgpu_ih_add_fault - Add a page fault record

- *

- * @adev: amdgpu device pointer

- * @key: 64-bit encoding of PASID and address

- *

- * This should be called when a retry page fault interrupt is

- * received. If this is a new page fault, it will be added to a hash

- * table. The return value indicates whether this is a new fault, or

- * a fault that was already known and is already being handled.

- *

- * If there are too many pending page faults, this will fail. Retry

- * interrupts should be ignored in this case until there is enough

- * free space.

- *

- * Returns 0 if the fault was added, 1 if the fault was already known,

- * -ENOSPC if there are too many pending faults.

- */

-int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)

-{

-  unsigned long flags;

-  int r = -ENOSPC;

-

-  if (WARN_ON_ONCE(!adev->irq.ih.faults))

-          /* Should be allocated in <IP>_ih_sw_init on GPUs that

-           * support retry faults and require retry filtering.

-           */

-          return r;

-

-  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);

-

-  /* Only let the hash table fill up to 50% for best performance */

-  if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))

-          goto unlock_out;

-

-  r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);

-  if (!r)

-          adev->irq.ih.faults->count++;

-

-  /* chash_table_copy_in should never fail unless we're losing count */

-  WARN_ON_ONCE(r < 0);

-

-unlock_out:

-  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);

-  return r;

-}

-

-/**

- * amdgpu_ih_clear_fault - Remove a page fault record

- *

- * @adev: amdgpu device pointer

- * @key: 64-bit encoding of PASID and address

- *

- * This should be called when a page fault has been handled. Any

- * future interrupt with this key will be processed as a new

- * page fault.

- */

-void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)

-{

-  unsigned long flags;

-  int r;

-

-  if (!adev->irq.ih.faults)

-          return;

-

-  spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);

-

-  r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);

-  if (!WARN_ON_ONCE(r < 0)) {

-          adev->irq.ih.faults->count--;

-          WARN_ON_ONCE(adev->irq.ih.faults->count < 0);

-  }

-

-  spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);

-}

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h

index a23e1c0..f411ffb 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h

@@ -32,13 +32,6 @@ struct amdgpu_device;

 #define AMDGPU_IH_CLIENTID_LEGACY 0

 #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX



-#define AMDGPU_PAGEFAULT_HASH_BITS 8

-struct amdgpu_retryfault_hashtable {

-  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

-  spinlock_t     lock;

-  int            count;

-};

-

 /*

  * R6xx+ IH ring

  */

@@ -57,7 +50,6 @@ struct amdgpu_ih_ring {

   bool                   use_doorbell;

   bool                   use_bus_addr;

   dma_addr_t             rb_dma_addr; /* only used when use_bus_addr = true */

-  struct amdgpu_retryfault_hashtable *faults;

 };



 #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4

@@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, unsigned ring_size,

                   bool use_bus_addr);

 void amdgpu_ih_ring_fini(struct amdgpu_device *adev);

 int amdgpu_ih_process(struct amdgpu_device *adev);

-int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key);

-void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key);



 #endif

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

index 1d7e3c1..8b220e9 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

@@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,

            adev->vm_manager.fragment_size);

 }



+static struct amdgpu_retryfault_hashtable *init_fault_hash(void)

+{

+  struct amdgpu_retryfault_hashtable *fault_hash;

+

+  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);

+  if (!fault_hash)

+          return fault_hash;

+

+  INIT_CHASH_TABLE(fault_hash->hash,

+                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

+  spin_lock_init(&fault_hash->lock);

+  fault_hash->count = 0;

+

+  return fault_hash;

+}

+

 /**

  * amdgpu_vm_init - initialize a vm instance

  *

@@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,

           vm->pasid = pasid;

   }



+  vm->fault_hash = init_fault_hash();

+  if (!vm->fault_hash) {

+          r = -ENOMEM;

+          goto error_free_root;

+  }

+

   INIT_KFIFO(vm->faults);

   vm->fault_credit = 16;



@@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)



   /* Clear pending page faults from IH when the VM is destroyed */

   while (kfifo_get(&vm->faults, &fault))

-          amdgpu_ih_clear_fault(adev, fault);

+          amdgpu_vm_clear_fault(vm->fault_hash, fault);



   if (vm->pasid) {

           unsigned long flags;

@@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)

           spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);

   }



+  kfree(vm->fault_hash);

+  vm->fault_hash = NULL;

+

   drm_sched_entity_destroy(&vm->entity);



   if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {

@@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)

           }

   }

 }

+

+/**

+ * amdgpu_vm_add_fault - Add a page fault record to fault hash table

+ *

+ * @fault_hash: fault hash table

+ * @key: 64-bit encoding of PASID and address

+ *

+ * This should be called when a retry page fault interrupt is

+ * received. If this is a new page fault, it will be added to a hash

+ * table. The return value indicates whether this is a new fault, or

+ * a fault that was already known and is already being handled.

+ *

+ * If there are too many pending page faults, this will fail. Retry

+ * interrupts should be ignored in this case until there is enough

+ * free space.

+ *

+ * Returns 0 if the fault was added, 1 if the fault was already known,

+ * -ENOSPC if there are too many pending faults.

+ */

+int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)

+{

+  unsigned long flags;

+  int r = -ENOSPC;

+

+  if (WARN_ON_ONCE(!fault_hash))

+          /* Should be allocated in amdgpu_vm_init

+           */

+          return r;

+

+  spin_lock_irqsave(&fault_hash->lock, flags);

+

+  /* Only let the hash table fill up to 50% for best performance */

+  if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))

+          goto unlock_out;

+

+  r = chash_table_copy_in(&fault_hash->hash, key, NULL);

+  if (!r)

+          fault_hash->count++;

+

+  /* chash_table_copy_in should never fail unless we're losing count */

+  WARN_ON_ONCE(r < 0);

+

+unlock_out:

+  spin_unlock_irqrestore(&fault_hash->lock, flags);

+  return r;

+}

+

+/**

+ * amdgpu_vm_clear_fault - Remove a page fault record

+ *

+ * @fault_hash: fault hash table

+ * @key: 64-bit encoding of PASID and address

+ *

+ * This should be called when a page fault has been handled. Any

+ * future interrupt with this key will be processed as a new

+ * page fault.

+ */

+void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)

+{

+  unsigned long flags;

+  int r;

+

+  if (!fault_hash)

+          return;

+

+  spin_lock_irqsave(&fault_hash->lock, flags);

+

+  r = chash_table_remove(&fault_hash->hash, key, NULL);

+  if (!WARN_ON_ONCE(r < 0)) {

+          fault_hash->count--;

+          WARN_ON_ONCE(fault_hash->count < 0);

+  }

+

+  spin_unlock_irqrestore(&fault_hash->lock, flags);

+}

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

index e275ee7..6eb1da1 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

@@ -178,6 +178,13 @@ struct amdgpu_task_info {

   pid_t   tgid;

 };



+#define AMDGPU_PAGEFAULT_HASH_BITS 8

+struct amdgpu_retryfault_hashtable {

+  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

+  spinlock_t     lock;

+  int            count;

+};

+

 struct amdgpu_vm {

   /* tree of virtual addresses mapped */

   struct rb_root_cached  va;

@@ -240,6 +247,7 @@ struct amdgpu_vm {

   struct ttm_lru_bulk_move lru_bulk_move;

   /* mark whether can do the bulk move */

   bool                   bulk_moveable;

+  struct amdgpu_retryfault_hashtable *fault_hash;

 };



 struct amdgpu_vm_manager {

@@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);

 void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,

                          struct amdgpu_vm *vm);



+int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);

+

+void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);

+

 #endif

diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c

index 5ae5ed2..acbe5a7 100644

--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c

+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c

@@ -265,35 +265,36 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)

           return true;

   }



-  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);

-  key = AMDGPU_VM_FAULT(pasid, addr);

-  r = amdgpu_ih_add_fault(adev, key);

-

-  /* Hash table is full or the fault is already being processed,

-   * ignore further page faults

-   */

-  if (r != 0)

-          goto ignore_iv;

-

   /* Track retry faults in per-VM fault FIFO. */

   spin_lock(&adev->vm_manager.pasid_lock);

   vm = idr_find(&adev->vm_manager.pasid_idr, pasid);

+  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);

+  key = AMDGPU_VM_FAULT(pasid, addr);

   if (!vm) {

           /* VM not found, process it normally */

           spin_unlock(&adev->vm_manager.pasid_lock);

-          amdgpu_ih_clear_fault(adev, key);

           return true;

+  } else {

+          r = amdgpu_vm_add_fault(vm->fault_hash, key);

+

+          /* Hash table is full or the fault is already being processed,

+           * ignore further page faults

+           */

+          if (r != 0) {

+                  spin_unlock(&adev->vm_manager.pasid_lock);

+                  goto ignore_iv;

+          }

   }

   /* No locking required with single writer and single reader */

   r = kfifo_put(&vm->faults, key);

   if (!r) {

           /* FIFO is full. Ignore it until there is space */

+          amdgpu_vm_clear_fault(vm->fault_hash, key);

           spin_unlock(&adev->vm_manager.pasid_lock);

-          amdgpu_ih_clear_fault(adev, key);

           goto ignore_iv;

   }

-  spin_unlock(&adev->vm_manager.pasid_lock);



+  spin_unlock(&adev->vm_manager.pasid_lock);

   /* It's the first fault for this address, process it normally */

   return true;



@@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void *handle)

   adev->irq.ih.use_doorbell = true;

   adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;



-  adev->irq.ih.faults = kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);

-  if (!adev->irq.ih.faults)

-          return -ENOMEM;

-  INIT_CHASH_TABLE(adev->irq.ih.faults->hash,

-                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);

-  spin_lock_init(&adev->irq.ih.faults->lock);

-  adev->irq.ih.faults->count = 0;

-

   r = amdgpu_irq_init(adev);



   return r;

@@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void *handle)

   amdgpu_irq_fini(adev);

   amdgpu_ih_ring_fini(adev);



-  kfree(adev->irq.ih.faults);

-  adev->irq.ih.faults = NULL;

-

   return 0;

 }










_______________________________________________

amd-gfx mailing list

amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>

https://lists.freedesktop.org/mailman/listinfo/amd-gfx





[-- Attachment #1.2: Type: text/html, Size: 55732 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                                 ` <8cee8563-2df9-1075-3113-66135cbd14fd-5C7GfCeVMHo@public.gmane.org>
@ 2018-09-11  7:19                                   ` Christian König
       [not found]                                     ` <82fb54c4-8cf2-9033-9fb4-f76e747d7b87-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Christian König @ 2018-09-11  7:19 UTC (permalink / raw)
  To: Felix Kuehling, christian.koenig-5C7GfCeVMHo, Zeng, Oak,
	Oak Zeng, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

Hi Felix,

let me try to explain the problem on an example:

1. We have a running job which needs recoverable page faults for 
accessing the process address space.
2. We run into memory pressure on VRAM and start to evict things.
3. A page table of the running job is picked up for eviction.
4. We schedule a copy command to move the content of the page table to 
system memory.
5. The new system memory location of the page table is noted in its BO.
6. We get a fault from the job and swap in the page from the process 
address space.
7. Now we need to enter the new page address into the page table -> *BAM*

The problem is now that we don't know the address of the page table 
because the current location was replaced with the future location in 
step #5.

We could now argue that we could update the page tables on the fly for 
the evicted page and never wait for the current job to finish, but this 
won't work because not all engines can handle that.

I will circumvent this problem for now by blocking the eviction before 
step #5. The issue with that is that this pipelining is responsible for 
nearly 20% of the fps in some testcases.

I hope that it won't be that bad when I limit disabling the pipelining 
to only page tables, but it is certainly possible that we will get 
pushback on this.

If that happens we probably need to add some flags to limit this 
workaround even more to only the root PD and all the PDs/PTs which are 
involved in recoverable faults.

Regards,
Christian.

Am 10.09.2018 um 21:10 schrieb Felix Kuehling:
> I'm not sure why you need to distinguish current and future state when
> dealing with page faults. When you get a page fault, you know that the
> GPU is trying to access memory right now, in the present. So you're
> always working with the current state. When the CPU page table changes,
> you get an MMU notifier that allows you to invalidate the corresponding
> GPU PTEs. If you have a valid GPU PTE, it always represents the current
> state and is in sync with the CPU page table. If the GPU page table is
> ever outdated, it should have an invalid entry in that place.
>
> If SDMA is used to update the GPU PTEs, there is a delay. The MMU
> notifier is synchronous, so it shouldn't be a problem. You just wait for
> the SDMA job to complete before returning. When updating PTEs with new
> valid addresses, it's asynchronous. But the GPU will continue retrying
> on the invalid entry until SDMA finishes. So it's also implicitly
> synchronized on the GPU side.
>
> Regards,
>    Felix
>
>
> On 2018-09-10 05:42 AM, Christian König wrote:
>> Hi Felix & Oak,
>>
>> over the weekend I had the idea that we could just use the shadow BOs
>> to have the current state in a page fault. They are GTT BOs and CPU
>> accessible anyway.
>>
>> Regards,
>> Christian.
>>
>> Am 08.09.2018 um 09:34 schrieb Christian König:
>>> Hi Felix,
>>>
>>>> But why do you want to update page tables when there is no more user
>>>> mode context that cares about them? Is this just to allow pending work
>>>> to complete after the user mode context is gone? To prevent hangs?
>>> The problem I'm see is that we still don't have a reliable way of
>>> killing a command submission while making sure that no more faults of
>>> it are in the flight.
>>>
>>> E.g. the HWS has a function to kill a running job on the hardware,
>>> but that only works for compute and there is no way of telling when
>>> that operation has finished.
>>>
>>> But Oak has already convinced me that we should probably work on that
>>> instead of trying to keep the VM alive as long as possible.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
>>>> Hi Christian,
>>>>
>>>> I'm not sure I get your point here.
>>>>
>>>> As I understand it, the amdgpu_vm structure represents the user mode VM
>>>> context. It has the pointers to the root page directory and the rest of
>>>> the page table hierarchy. If there is no amdgpu_vm, there is no user
>>>> mode context that cares about the page tables.
>>>>
>>>> If we leave the page table pointers in the amdgpu_vm, then handling
>>>> faults after the VM is destroyed is pointless. We can't find the page
>>>> tables and we can't update them, so there is nothing we can do in
>>>> response to VM faults.
>>>>
>>>> So I'm guessing that you want to move the page table pointers into a
>>>> different structure that exists half-independently of the VM. It would
>>>> be created when the VM is created (that's where we currently allocate
>>>> the PASID) but can be destroyed slightly later.
>>>>
>>>> But why do you want to update page tables when there is no more user
>>>> mode context that cares about them? Is this just to allow pending work
>>>> to complete after the user mode context is gone? To prevent hangs?
>>>>
>>>> Regards,
>>>>     Felix
>>>>
>>>> On 2018-09-10 07:20 AM, Christian König wrote:
>>>>>> Am I right that the handling of page fault need a valid VM?
>>>>> No, exactly that view is incorrect.
>>>>>
>>>>> The VM is meant to be a memory management structure of page tables and
>>>>> is completely pointless for fault processing because it represents the
>>>>> future state of the page tables and not the current one.
>>>>>
>>>>> What we need for fault processing is a new structure built around
>>>>> PASIDs which is fed with addresses when page tables are moved
>>>>> around.
>>>>>
>>>>> Alternatively I hope that we can use the SDMA to walk the page tables
>>>>> and update the required entries by just using the address.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>>>>> Hi Christian,
>>>>>>
>>>>>>
>>>>>> What is the value of delay the destroy of hash to after vm is
>>>>>> destroyed? Since vm is destroyed, you basically don’t have enough
>>>>>> information to page in the correct page to gpuvm. Am I right that
>>>>>> the handling of page fault need a valid VM? For example, you need the
>>>>>> VM to get the corresponding physical address of the faulty page.
>>>>>>
>>>>>>
>>>>>> After vm is destroyed, the retry interrupt can be directly discard as
>>>>>> you can’t find vm through pasid. You can think this is also one kind
>>>>>> of prescreen.
>>>>>>
>>>>>>
>>>>>> So I stand by the point that there is no need to delay the
>>>>>> destroy of hash to after vm is destroyed. Prescreening hash can be
>>>>>> destroyed at the time of vm_fini.
>>>>>>
>>>>>>
>>>>>> Thanks,
>>>>>>
>>>>>> Oak
>>>>>>
>>>>>>
>>>>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>>>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com>;
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>>>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>>>>>> amdgpu vm
>>>>>>
>>>>>>
>>>>>> Hi Oak,
>>>>>>
>>>>>> yeah, but this is still a NAK. Making the hash per PASID was not a
>>>>>> suggestion but rather a must have.
>>>>>>
>>>>>> The VM structures must be destroyed while the processing is still
>>>>>> ongoing, or otherwise we would not have a clean OOM handling.
>>>>>>
>>>>>> The IDR for PASID lookup can be put into amdgpu_ids.c, you can just
>>>>>> replace the IDA already used there with an IDR.
>>>>>>
>>>>>> Since the PASID is already freed up delayed we should have the grace
>>>>>> period for interrupt processing included. If that still isn't
>>>>>> sufficient we can still add some delayed work for this.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>>>>
>>>>>>       Hi Christian,
>>>>>>
>>>>>>       In this implementation, fault hash is made per vm, not per pasid
>>>>>>       as suggested, based on below considerations:
>>>>>>
>>>>>>         * Delay the destroy of hash introduces more effort like how to
>>>>>>           set the proper grace period after which no retry interrupt
>>>>>>           will be generated. We want to avoid those complication if we
>>>>>>           can.
>>>>>>         * The problem of the per vm implementation is that it is hard
>>>>>>           to delay the destroy of fault hash, because when vm is
>>>>>>           destroyed, prescreen function won't be able to retrieve the
>>>>>>           fault hash. But in this case, the prescreen function can
>>>>>>           either let the interrupt go through (to gmc) or ignore it.
>>>>>>           From this perspective, it is not very necessary to delay the
>>>>>>           destroy of hash table.
>>>>>>         * On the other hand, since ASICs after vega10 have the HW CAM
>>>>>>           feature. So the SW IV prescreen is only useful for vega10.
>>>>>>           For all the HW implemented CAM, we can consider the delayed
>>>>>>           CAM acknowledgment.
>>>>>>         * Making it per pasid need to introduce another idr to
>>>>>>           correlate pasid and hash table - where to put the idr? You
>>>>>>           will have to make it a global variable which is not very
>>>>>>           desirable.
>>>>>>
>>>>>>       The main purpose of this patch is to solve the inter-process
>>>>>> lock
>>>>>>       issue (when hash table is full), while still keep codes simple.
>>>>>>
>>>>>>       Also with this patch, the faults kfifo is not necessary any
>>>>>> more.
>>>>>>       See patch 2.
>>>>>>
>>>>>>       Regards,
>>>>>>
>>>>>>       Oak
>>>>>>
>>>>>>
>>>>>>       On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>>>>
>>>>>>           Instead of sharing one fault hash table per device, make it
>>>>>>
>>>>>>           per vm. This can avoid inter-process lock issue when fault
>>>>>>
>>>>>>           hash table is full.
>>>>>>
>>>>>>
>>>>>>           Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>>>>
>>>>>>           Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>>>>>> <mailto:Oak.Zeng@amd.com>
>>>>>>
>>>>>>           Suggested-by: Christian Konig <Christian.Koenig@amd.com>
>>>>>> <mailto:Christian.Koenig@amd.com>
>>>>>>
>>>>>>           Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>>> <mailto:Felix.Kuehling@amd.com>
>>>>>>
>>>>>>           ---
>>>>>>
>>>>>>            drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75
>>>>>> ------------------------
>>>>>>
>>>>>>            drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>>>>
>>>>>>            drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102
>>>>>> ++++++++++++++++++++++++++++++++-
>>>>>>
>>>>>>            drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>>>>
>>>>>>            drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------
>>>>>>
>>>>>>            5 files changed, 127 insertions(+), 110 deletions(-)
>>>>>>
>>>>>>
>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>
>>>>>>           index 06373d4..4ed8621 100644
>>>>>>
>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>
>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>
>>>>>>           @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct
>>>>>> amdgpu_device *adev)
>>>>>>
>>>>>>              return IRQ_HANDLED;
>>>>>>
>>>>>>            }
>>>>>>
>>>>>>
>>>>>>           -/**
>>>>>>
>>>>>>           - * amdgpu_ih_add_fault - Add a page fault record
>>>>>>
>>>>>>           - *
>>>>>>
>>>>>>           - * @adev: amdgpu device pointer
>>>>>>
>>>>>>           - * @key: 64-bit encoding of PASID and address
>>>>>>
>>>>>>           - *
>>>>>>
>>>>>>           - * This should be called when a retry page fault
>>>>>> interrupt is
>>>>>>
>>>>>>           - * received. If this is a new page fault, it will be
>>>>>> added to a hash
>>>>>>
>>>>>>           - * table. The return value indicates whether this is a
>>>>>> new fault, or
>>>>>>
>>>>>>           - * a fault that was already known and is already being
>>>>>> handled.
>>>>>>
>>>>>>           - *
>>>>>>
>>>>>>           - * If there are too many pending page faults, this will
>>>>>> fail. Retry
>>>>>>
>>>>>>           - * interrupts should be ignored in this case until there
>>>>>> is enough
>>>>>>
>>>>>>           - * free space.
>>>>>>
>>>>>>           - *
>>>>>>
>>>>>>           - * Returns 0 if the fault was added, 1 if the fault was
>>>>>> already known,
>>>>>>
>>>>>>           - * -ENOSPC if there are too many pending faults.
>>>>>>
>>>>>>           - */
>>>>>>
>>>>>>           -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64
>>>>>> key)
>>>>>>
>>>>>>           -{
>>>>>>
>>>>>>           -  unsigned long flags;
>>>>>>
>>>>>>           -  int r = -ENOSPC;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>>>>
>>>>>>           -          /* Should be allocated in <IP>_ih_sw_init on
>>>>>> GPUs that
>>>>>>
>>>>>>           -           * support retry faults and require retry
>>>>>> filtering.
>>>>>>
>>>>>>           -           */
>>>>>>
>>>>>>           -          return r;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  /* Only let the hash table fill up to 50% for best
>>>>>> performance */
>>>>>>
>>>>>>           -  if (adev->irq.ih.faults->count >= (1 <<
>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>
>>>>>>           -          goto unlock_out;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  r = chash_table_copy_in(&adev->irq.ih.faults->hash,
>>>>>> key, NULL);
>>>>>>
>>>>>>           -  if (!r)
>>>>>>
>>>>>>           -          adev->irq.ih.faults->count++;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  /* chash_table_copy_in should never fail unless we're
>>>>>> losing count */
>>>>>>
>>>>>>           -  WARN_ON_ONCE(r < 0);
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -unlock_out:
>>>>>>
>>>>>>           - spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>>>>>>
>>>>>>           -  return r;
>>>>>>
>>>>>>           -}
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -/**
>>>>>>
>>>>>>           - * amdgpu_ih_clear_fault - Remove a page fault record
>>>>>>
>>>>>>           - *
>>>>>>
>>>>>>           - * @adev: amdgpu device pointer
>>>>>>
>>>>>>           - * @key: 64-bit encoding of PASID and address
>>>>>>
>>>>>>           - *
>>>>>>
>>>>>>           - * This should be called when a page fault has been
>>>>>> handled. Any
>>>>>>
>>>>>>           - * future interrupt with this key will be processed as a
>>>>>> new
>>>>>>
>>>>>>           - * page fault.
>>>>>>
>>>>>>           - */
>>>>>>
>>>>>>           -void amdgpu_ih_clear_fault(struct amdgpu_device *adev,
>>>>>> u64 key)
>>>>>>
>>>>>>           -{
>>>>>>
>>>>>>           -  unsigned long flags;
>>>>>>
>>>>>>           -  int r;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  if (!adev->irq.ih.faults)
>>>>>>
>>>>>>           -          return;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  r = chash_table_remove(&adev->irq.ih.faults->hash,
>>>>>> key, NULL);
>>>>>>
>>>>>>           -  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>
>>>>>>           -          adev->irq.ih.faults->count--;
>>>>>>
>>>>>>           - WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>>>>
>>>>>>           -  }
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           - spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
>>>>>>
>>>>>>           -}
>>>>>>
>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>
>>>>>>           index a23e1c0..f411ffb 100644
>>>>>>
>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>
>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>
>>>>>>           @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>>>>
>>>>>>            #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>>>>
>>>>>>            #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>>>>
>>>>>>
>>>>>>           -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>
>>>>>>           -struct amdgpu_retryfault_hashtable {
>>>>>>
>>>>>>           -  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>> 8, 0);
>>>>>>
>>>>>>           -  spinlock_t     lock;
>>>>>>
>>>>>>           -  int            count;
>>>>>>
>>>>>>           -};
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>            /*
>>>>>>
>>>>>>             * R6xx+ IH ring
>>>>>>
>>>>>>             */
>>>>>>
>>>>>>           @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>>>>
>>>>>>              bool                   use_doorbell;
>>>>>>
>>>>>>              bool                   use_bus_addr;
>>>>>>
>>>>>>              dma_addr_t             rb_dma_addr; /* only used when
>>>>>> use_bus_addr = true */
>>>>>>
>>>>>>           -  struct amdgpu_retryfault_hashtable *faults;
>>>>>>
>>>>>>            };
>>>>>>
>>>>>>
>>>>>>            #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>>>>
>>>>>>           @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct
>>>>>> amdgpu_device *adev, unsigned ring_size,
>>>>>>
>>>>>>                              bool use_bus_addr);
>>>>>>
>>>>>>            void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>>>>>
>>>>>>            int amdgpu_ih_process(struct amdgpu_device *adev);
>>>>>>
>>>>>>           -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64
>>>>>> key);
>>>>>>
>>>>>>           -void amdgpu_ih_clear_fault(struct amdgpu_device *adev,
>>>>>> u64 key);
>>>>>>
>>>>>>
>>>>>>            #endif
>>>>>>
>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>
>>>>>>           index 1d7e3c1..8b220e9 100644
>>>>>>
>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>
>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>
>>>>>>           @@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct
>>>>>> amdgpu_device *adev, uint32_t min_vm_size,
>>>>>>
>>>>>>                       adev->vm_manager.fragment_size);
>>>>>>
>>>>>>            }
>>>>>>
>>>>>>
>>>>>>           +static struct amdgpu_retryfault_hashtable
>>>>>> *init_fault_hash(void)
>>>>>>
>>>>>>           +{
>>>>>>
>>>>>>           +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
>>>>>>
>>>>>>           +  if (!fault_hash)
>>>>>>
>>>>>>           +          return fault_hash;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  INIT_CHASH_TABLE(fault_hash->hash,
>>>>>>
>>>>>>           +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>
>>>>>>           +  spin_lock_init(&fault_hash->lock);
>>>>>>
>>>>>>           +  fault_hash->count = 0;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  return fault_hash;
>>>>>>
>>>>>>           +}
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>            /**
>>>>>>
>>>>>>             * amdgpu_vm_init - initialize a vm instance
>>>>>>
>>>>>>             *
>>>>>>
>>>>>>           @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct
>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>>>
>>>>>>                      vm->pasid = pasid;
>>>>>>
>>>>>>              }
>>>>>>
>>>>>>
>>>>>>           +  vm->fault_hash = init_fault_hash();
>>>>>>
>>>>>>           +  if (!vm->fault_hash) {
>>>>>>
>>>>>>           +          r = -ENOMEM;
>>>>>>
>>>>>>           +          goto error_free_root;
>>>>>>
>>>>>>           +  }
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>              INIT_KFIFO(vm->faults);
>>>>>>
>>>>>>              vm->fault_credit = 16;
>>>>>>
>>>>>>
>>>>>>           @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct
>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>
>>>>>>
>>>>>>              /* Clear pending page faults from IH when the VM is
>>>>>> destroyed */
>>>>>>
>>>>>>              while (kfifo_get(&vm->faults, &fault))
>>>>>>
>>>>>>           -          amdgpu_ih_clear_fault(adev, fault);
>>>>>>
>>>>>>           +          amdgpu_vm_clear_fault(vm->fault_hash, fault);
>>>>>>
>>>>>>
>>>>>>              if (vm->pasid) {
>>>>>>
>>>>>>                      unsigned long flags;
>>>>>>
>>>>>>           @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct
>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>
>>>>>> spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>>>>
>>>>>>              }
>>>>>>
>>>>>>
>>>>>>           +  kfree(vm->fault_hash);
>>>>>>
>>>>>>           +  vm->fault_hash = NULL;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>              drm_sched_entity_destroy(&vm->entity);
>>>>>>
>>>>>>
>>>>>>              if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>>>>
>>>>>>           @@ -3183,3 +3208,78 @@ void
>>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>>>>
>>>>>>                      }
>>>>>>
>>>>>>              }
>>>>>>
>>>>>>            }
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +/**
>>>>>>
>>>>>>           + * amdgpu_vm_add_fault - Add a page fault record to
>>>>>> fault hash table
>>>>>>
>>>>>>           + *
>>>>>>
>>>>>>           + * @fault_hash: fault hash table
>>>>>>
>>>>>>           + * @key: 64-bit encoding of PASID and address
>>>>>>
>>>>>>           + *
>>>>>>
>>>>>>           + * This should be called when a retry page fault
>>>>>> interrupt is
>>>>>>
>>>>>>           + * received. If this is a new page fault, it will be
>>>>>> added to a hash
>>>>>>
>>>>>>           + * table. The return value indicates whether this is a
>>>>>> new fault, or
>>>>>>
>>>>>>           + * a fault that was already known and is already being
>>>>>> handled.
>>>>>>
>>>>>>           + *
>>>>>>
>>>>>>           + * If there are too many pending page faults, this will
>>>>>> fail. Retry
>>>>>>
>>>>>>           + * interrupts should be ignored in this case until there
>>>>>> is enough
>>>>>>
>>>>>>           + * free space.
>>>>>>
>>>>>>           + *
>>>>>>
>>>>>>           + * Returns 0 if the fault was added, 1 if the fault was
>>>>>> already known,
>>>>>>
>>>>>>           + * -ENOSPC if there are too many pending faults.
>>>>>>
>>>>>>           + */
>>>>>>
>>>>>>           +int amdgpu_vm_add_fault(struct
>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>
>>>>>>           +{
>>>>>>
>>>>>>           +  unsigned long flags;
>>>>>>
>>>>>>           +  int r = -ENOSPC;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  if (WARN_ON_ONCE(!fault_hash))
>>>>>>
>>>>>>           +          /* Should be allocated in amdgpu_vm_init
>>>>>>
>>>>>>           +           */
>>>>>>
>>>>>>           +          return r;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  /* Only let the hash table fill up to 50% for best
>>>>>> performance */
>>>>>>
>>>>>>           +  if (fault_hash->count >= (1 <<
>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>
>>>>>>           +          goto unlock_out;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  r = chash_table_copy_in(&fault_hash->hash, key, NULL);
>>>>>>
>>>>>>           +  if (!r)
>>>>>>
>>>>>>           +          fault_hash->count++;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  /* chash_table_copy_in should never fail unless we're
>>>>>> losing count */
>>>>>>
>>>>>>           +  WARN_ON_ONCE(r < 0);
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +unlock_out:
>>>>>>
>>>>>>           +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>
>>>>>>           +  return r;
>>>>>>
>>>>>>           +}
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +/**
>>>>>>
>>>>>>           + * amdgpu_vm_clear_fault - Remove a page fault record
>>>>>>
>>>>>>           + *
>>>>>>
>>>>>>           + * @fault_hash: fault hash table
>>>>>>
>>>>>>           + * @key: 64-bit encoding of PASID and address
>>>>>>
>>>>>>           + *
>>>>>>
>>>>>>           + * This should be called when a page fault has been
>>>>>> handled. Any
>>>>>>
>>>>>>           + * future interrupt with this key will be processed as a
>>>>>> new
>>>>>>
>>>>>>           + * page fault.
>>>>>>
>>>>>>           + */
>>>>>>
>>>>>>           +void amdgpu_vm_clear_fault(struct
>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>
>>>>>>           +{
>>>>>>
>>>>>>           +  unsigned long flags;
>>>>>>
>>>>>>           +  int r;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  if (!fault_hash)
>>>>>>
>>>>>>           +          return;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  r = chash_table_remove(&fault_hash->hash, key, NULL);
>>>>>>
>>>>>>           +  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>
>>>>>>           +          fault_hash->count--;
>>>>>>
>>>>>>           +          WARN_ON_ONCE(fault_hash->count < 0);
>>>>>>
>>>>>>           +  }
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>
>>>>>>           +}
>>>>>>
>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>
>>>>>>           index e275ee7..6eb1da1 100644
>>>>>>
>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>
>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>
>>>>>>           @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>>>>
>>>>>>              pid_t   tgid;
>>>>>>
>>>>>>            };
>>>>>>
>>>>>>
>>>>>>           +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>
>>>>>>           +struct amdgpu_retryfault_hashtable {
>>>>>>
>>>>>>           +  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>> 8, 0);
>>>>>>
>>>>>>           +  spinlock_t     lock;
>>>>>>
>>>>>>           +  int            count;
>>>>>>
>>>>>>           +};
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>            struct amdgpu_vm {
>>>>>>
>>>>>>              /* tree of virtual addresses mapped */
>>>>>>
>>>>>>              struct rb_root_cached  va;
>>>>>>
>>>>>>           @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>>>>
>>>>>>              struct ttm_lru_bulk_move lru_bulk_move;
>>>>>>
>>>>>>              /* mark whether can do the bulk move */
>>>>>>
>>>>>>              bool                   bulk_moveable;
>>>>>>
>>>>>>           +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>
>>>>>>            };
>>>>>>
>>>>>>
>>>>>>            struct amdgpu_vm_manager {
>>>>>>
>>>>>>           @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct
>>>>>> amdgpu_vm *vm);
>>>>>>
>>>>>>            void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>>>>>>
>>>>>>                                     struct amdgpu_vm *vm);
>>>>>>
>>>>>>
>>>>>>           +int amdgpu_vm_add_fault(struct
>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +void amdgpu_vm_clear_fault(struct
>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>            #endif
>>>>>>
>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>
>>>>>>           index 5ae5ed2..acbe5a7 100644
>>>>>>
>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>
>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>
>>>>>>           @@ -265,35 +265,36 @@ static bool
>>>>>> vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>>>>
>>>>>>                      return true;
>>>>>>
>>>>>>              }
>>>>>>
>>>>>>
>>>>>>           -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>
>>>>>>           -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>
>>>>>>           -  r = amdgpu_ih_add_fault(adev, key);
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  /* Hash table is full or the fault is already being
>>>>>> processed,
>>>>>>
>>>>>>           -   * ignore further page faults
>>>>>>
>>>>>>           -   */
>>>>>>
>>>>>>           -  if (r != 0)
>>>>>>
>>>>>>           -          goto ignore_iv;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>              /* Track retry faults in per-VM fault FIFO. */
>>>>>>
>>>>>>              spin_lock(&adev->vm_manager.pasid_lock);
>>>>>>
>>>>>>              vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>>>>
>>>>>>           +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>
>>>>>>           +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>
>>>>>>              if (!vm) {
>>>>>>
>>>>>>                      /* VM not found, process it normally */
>>>>>>
>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>
>>>>>>           -          amdgpu_ih_clear_fault(adev, key);
>>>>>>
>>>>>>                      return true;
>>>>>>
>>>>>>           +  } else {
>>>>>>
>>>>>>           +          r = amdgpu_vm_add_fault(vm->fault_hash, key);
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +          /* Hash table is full or the fault is already
>>>>>> being processed,
>>>>>>
>>>>>>           +           * ignore further page faults
>>>>>>
>>>>>>           +           */
>>>>>>
>>>>>>           +          if (r != 0) {
>>>>>>
>>>>>>           + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>
>>>>>>           +                  goto ignore_iv;
>>>>>>
>>>>>>           +          }
>>>>>>
>>>>>>              }
>>>>>>
>>>>>>              /* No locking required with single writer and single
>>>>>> reader */
>>>>>>
>>>>>>              r = kfifo_put(&vm->faults, key);
>>>>>>
>>>>>>              if (!r) {
>>>>>>
>>>>>>                      /* FIFO is full. Ignore it until there is
>>>>>> space */
>>>>>>
>>>>>>           +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>>>>>>
>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>
>>>>>>           -          amdgpu_ih_clear_fault(adev, key);
>>>>>>
>>>>>>                      goto ignore_iv;
>>>>>>
>>>>>>              }
>>>>>>
>>>>>>           - spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>
>>>>>>
>>>>>>           + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>
>>>>>>              /* It's the first fault for this address, process it
>>>>>> normally */
>>>>>>
>>>>>>              return true;
>>>>>>
>>>>>>
>>>>>>           @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void
>>>>>> *handle)
>>>>>>
>>>>>>              adev->irq.ih.use_doorbell = true;
>>>>>>
>>>>>>              adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;
>>>>>>
>>>>>>
>>>>>>           -  adev->irq.ih.faults =
>>>>>> kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>>>>
>>>>>>           -  if (!adev->irq.ih.faults)
>>>>>>
>>>>>>           -          return -ENOMEM;
>>>>>>
>>>>>>           - INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>>>>
>>>>>>           -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>
>>>>>>           - spin_lock_init(&adev->irq.ih.faults->lock);
>>>>>>
>>>>>>           -  adev->irq.ih.faults->count = 0;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>              r = amdgpu_irq_init(adev);
>>>>>>
>>>>>>
>>>>>>              return r;
>>>>>>
>>>>>>           @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void
>>>>>> *handle)
>>>>>>
>>>>>>              amdgpu_irq_fini(adev);
>>>>>>
>>>>>>              amdgpu_ih_ring_fini(adev);
>>>>>>
>>>>>>
>>>>>>           -  kfree(adev->irq.ih.faults);
>>>>>>
>>>>>>           -  adev->irq.ih.faults = NULL;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>              return 0;
>>>>>>
>>>>>>            }
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>       _______________________________________________
>>>>>>
>>>>>>       amd-gfx mailing list
>>>>>>
>>>>>>       amd-gfx@lists.freedesktop.org
>>>>>> <mailto:amd-gfx@lists.freedesktop.org>
>>>>>>
>>>>>>       https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>
>>>>>>
>>>>>
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                                     ` <82fb54c4-8cf2-9033-9fb4-f76e747d7b87-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2018-09-11 14:09                                       ` Zeng, Oak
       [not found]                                         ` <DM5PR12MB1660F46176F685755F5BC48880040-2J9CzHegvk9mP71xM+BsFwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2018-09-11 22:00                                       ` Felix Kuehling
  1 sibling, 1 reply; 27+ messages in thread
From: Zeng, Oak @ 2018-09-11 14:09 UTC (permalink / raw)
  To: Koenig, Christian, Kuehling, Felix, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

Hi Christian,

I am not sure I understand exactly your thought but see 2 comments inline

Thanks,
Oak

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Tuesday, September 11, 2018 3:19 AM
To: Kuehling, Felix <Felix.Kuehling@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org; Yang, Philip <Philip.Yang@amd.com>
Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm

Hi Felix,

let me try to explain the problem on an example:

1. We have a running job which needs recoverable page faults for accessing the process address space.
2. We run into memory pressure on VRAM and start to evict things.
3. A page table of the running job is picked up for eviction.
4. We schedule a copy command to move the content of the page table to system memory.
5. The new system memory location of the page table is noted in its BO.
6. We get a fault from the job and swap in the page from the process address space.
7. Now we need to enter the new page address into the page table -> *BAM*

The problem is now that we don't know the address of the page table because the current location was replaced with the future location in step #5.
[Oak]: Step 5, We also mark the corresponding entry of parent page table of the evicted page table to invalid. This invalid bit will cause the utcl2 walker to report a page fault. Depending on implementation, for the evicted page table, its amdgpu_vm_pt.amdgpu_vm_bo_base will point to a new location in system memory (amdgpu_vm_pt itself is in system memory). In the page fault handler, we allocate a *new* physical page in vram (through ttm), swap in the page content from system memory (using sdma) to this newly allocated page, modify parent page table's corresponding entry to point to this *newly allocated* page and set valid bit to 1.

If all the swap-in, swap-out work is done by sdma, we don't need to worry about the case that swap-in happens earlier than swap-out - the gpu scheduler will take care of the sequence.

If ttm can't turn around for the new page allocation, it can optionally trigger things like OOM. This should be something that needs to be considered at the ttm physical memory management layer.



We could now argue that we could update the page tables on the fly for the evicted page and never wait for the current job to finish, but this won't work because not all engines can handle that.

I will circumvent this problem for now by blocking the eviction before step #5. 
[Oak] How do you block it? Pin the memory? I agree that swapping out page tables should be the last resort during memory pressure handling. 

The issue with that is that this pipelining is responsible for nearly 20% of the fps in some testcases.

I hope that it won't be that bad when I limit disabling the pipelining to only page tables, but it is certainly possible that we will get pushback on this.

If that happens we probably need to add some flags to limit this workaround even more to only the root PD and all the PDs/PTs which are involved in recoverable faults.

Regards,
Christian.

Am 10.09.2018 um 21:10 schrieb Felix Kuehling:
> I'm not sure why you need to distinguish current and future state when 
> dealing with page faults. When you get a page fault, you know that the 
> GPU is trying to access memory right now, in the present. So you're 
> always working with the current state. When the CPU page table 
> changes, you get an MMU notifier that allows you to invalidate the 
> corresponding GPU PTEs. If you have a valid GPU PTE, it always 
> represents the current states and is in sync with the CPU page table. 
> If the GPU page table is ever outdated, it should have an invalid entry in that place.
>
> If SDMA is used to update the GPU PTEs, there is a delay. The MMU 
> notifier is synchronous, so it shouldn't be a problem. You just wait 
> for the SDMA job to complete before returning. When updating PTEs with 
> new valid addresses, it's asynchronous. But the GPU will continue 
> retrying on the invalid entry until SDMA finishes. So it's also 
> implicitly synchronized on the GPU side.
>
> Regards,
>    Felix
>
>
> On 2018-09-10 05:42 AM, Christian König wrote:
>> Hi Felix & Oak,
>>
>> over the weekend I had the idea that we could just use the shadow BOs 
>> to have the current state in a page fault. They are GTT BOs and CPU 
>> accessible anyway.
>>
>> Regards,
>> Christian.
>>
>> Am 08.09.2018 um 09:34 schrieb Christian König:
>>> Hi Felix,
>>>
>>>> But why do you want to update page tables when there is no more 
>>>> user mode context that cares about them? Is this just to allow 
>>>> pending work to complete after the user mode context is gone? To prevent hangs?
>>> The problem I'm see is that we still don't have a reliable way of 
>>> killing a command submission while making sure that no more faults 
>>> of it are in the flight.
>>>
>>> E.g. the HWS has a function to kill a running job on the hardware, 
>>> but that only works for compute and there is no way of telling when 
>>> that operation has finished.
>>>
>>> But Oak has already convinced me that we should probably work on 
>>> that instead of trying to keep the VM alive as long as possible.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
>>>> Hi Christian,
>>>>
>>>> I'm not sure I get your point here.
>>>>
>>>> As I understand it, the amdgpu_vm structure represents the user 
>>>> mode VM context. It has the pointers to the root page directory and 
>>>> the rest of the page table hierarchy. If there is no amdgpu_vm, 
>>>> there is no user mode context that cares about the page tables.
>>>>
>>>> If we leave the page table pointers in the amdgpu_vm, then handling 
>>>> faults after the VM is destroyed is pointless. We can't find the 
>>>> page tables and we can't update them, so there is nothing we can do 
>>>> in response to VM faults.
>>>>
>>>> So I'm guessing that you want to move the page table pointers into 
>>>> a different structure that exists half-independently of the VM. It 
>>>> would be created when the VM is created (that's where we currently 
>>>> allocate the PASID) but can be destroyed slightly later.
>>>>
>>>> But why do you want to update page tables when there is no more 
>>>> user mode context that cares about them? Is this just to allow 
>>>> pending work to complete after the user mode context is gone? To prevent hangs?
>>>>
>>>> Regards,
>>>>     Felix
>>>>
>>>> On 2018-09-10 07:20 AM, Christian König wrote:
>>>>>> Am I right that the handling of page fault need a valid VM?
>>>>> No, exactly that view is incorrect.
>>>>>
>>>>> The VM is meant to be a memory management structure of page tables 
>>>>> and is completely pointless fault processing because it represent 
>>>>> the future state of the page tables and not the current one.
>>>>>
>>>>> What we need for fault processing is a new structure built around 
>>>>> PASIDs which is fed with addresses when page tables are 
>>>>> moved around.
>>>>>
>>>>> Alternatively I hope that we can use the SDMA to walk the page 
>>>>> tables and update the required entries by just using the address.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>>>>> Hi Christian,
>>>>>>
>>>>>>
>>>>>> What is the value of delay the destroy of hash to after vm is 
>>>>>> destroyed? Since vm is destroyed, you basically don’t have enough 
>>>>>> information to paging in the correct page to gpuvm. Am I right 
>>>>>> that the handling of page fault need a valid VM? For example, you 
>>>>>> need the VM to get the corresponding physical address of the faulty page.
>>>>>>
>>>>>>
>>>>>> After vm is destroyed, the retry interrupt can be directly 
>>>>>> discard as you can’t find vm through pasid. You can think this is 
>>>>>> also one kind of prescreen.
>>>>>>
>>>>>>
>>>>>> So I am stand for the point that, there is no need to delay the 
>>>>>> destroy of hash to after vm is destroyed. Prescreening hash can 
>>>>>> be destroyed at the time of vm_fini.
>>>>>>
>>>>>>
>>>>>> Thanks,
>>>>>>
>>>>>> Oak
>>>>>>
>>>>>>
>>>>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>>>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng 
>>>>>> <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org
>>>>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>>>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to 
>>>>>> amdgpu vm
>>>>>>
>>>>>>
>>>>>> Hi Oak,
>>>>>>
>>>>>> yeah, but this is still a NAK. Making the hash per PASID was not 
>>>>>> a suggestion but rather a must have.
>>>>>>
>>>>>> The VM structures must be destroyed while the processing is still 
>>>>>> ongoing, or otherwise we would not have a clean OOM handling.
>>>>>>
>>>>>> The IDR for PASID lockup can be put into amdgpu_ids.c, you can 
>>>>>> just replace the IDA already used there with an IDR.
>>>>>>
>>>>>> Since the PASID is already freed up delayed we should have the 
>>>>>> grace period for interrupt processing included. If that still 
>>>>>> isn't sufficient we can still add some delayed work for this.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>>>>
>>>>>>       Hi Christian,
>>>>>>
>>>>>>       In this implementation, fault hash is made per vm, not per 
>>>>>> pasid
>>>>>>       as suggested, based on below considerations:
>>>>>>
>>>>>>         * Delay the destroy of hash introduces more effort like 
>>>>>> how to
>>>>>>           set the proper grace period after which no retry 
>>>>>> interrupt
>>>>>>           will be generated. We want to avoid those complication 
>>>>>> if we
>>>>>>           can.
>>>>>>         * The  problem of the per vm implementation is that it is 
>>>>>> hard
>>>>>>           to delay the destroy of fault hash, because when vm is
>>>>>>           destroyed, prescreen function won't be able to retrieve 
>>>>>> the
>>>>>>           fault hash. But in this case, the prescreen function 
>>>>>> can
>>>>>>           either let the interrupt go through (to gmc) or ignore it.
>>>>>>           From this perspective, it is not very necessary to 
>>>>>> delay the
>>>>>>           destroy of hash table.
>>>>>>         * On the other hand, since ASICs after vega10 have the HW 
>>>>>> CAM
>>>>>>           feature. So the SW IV prescreen is only useful for vega10.
>>>>>>           For all the HW implemented CAM, we can consider the 
>>>>>> delayed
>>>>>>           CAM acknowledgment.
>>>>>>         * Making it per pasid need to introduce another idr to
>>>>>>           correlate pasid and hash table - where to put the idr? 
>>>>>> You
>>>>>>           will have to make it a global variable which is not 
>>>>>> very
>>>>>>           desirable.
>>>>>>
>>>>>>       The main purpose of this patch is to solve the 
>>>>>> inter-process lock
>>>>>>       issue (when hash table is full), while still keep codes simple.
>>>>>>
>>>>>>       Also with this patch, the faults kfifo is not necessary any 
>>>>>> more.
>>>>>>       See patch 2.
>>>>>>
>>>>>>       Regards,
>>>>>>
>>>>>>       Oak
>>>>>>
>>>>>>
>>>>>>       On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>>>>
>>>>>>           In stead of share one fault hash table per device, make 
>>>>>> it
>>>>>>
>>>>>>           per vm. This can avoid inter-process lock issue when 
>>>>>> fault
>>>>>>
>>>>>>           hash table is full.
>>>>>>
>>>>>>
>>>>>>           Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>>>>
>>>>>>           Signed-off-by: Oak Zeng <Oak.Zeng@amd.com> 
>>>>>> <mailto:Oak.Zeng@amd.com>
>>>>>>
>>>>>>           Suggested-by: Christian Konig 
>>>>>> <Christian.Koenig@amd.com> <mailto:Christian.Koenig@amd.com>
>>>>>>
>>>>>>           Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com> 
>>>>>> <mailto:Felix.Kuehling@amd.com>
>>>>>>
>>>>>>           ---
>>>>>>
>>>>>>            drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75
>>>>>> ------------------------
>>>>>>
>>>>>>            drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>>>>
>>>>>>            drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102
>>>>>> ++++++++++++++++++++++++++++++++-
>>>>>>
>>>>>>            drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>>>>
>>>>>>            drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 
>>>>>> +++++-------
>>>>>>
>>>>>>            5 files changed, 127 insertions(+), 110 deletions(-)
>>>>>>
>>>>>>
>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>
>>>>>>           index 06373d4..4ed8621 100644
>>>>>>
>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>
>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>
>>>>>>           @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct 
>>>>>> amdgpu_device *adev)
>>>>>>
>>>>>>              return IRQ_HANDLED;
>>>>>>
>>>>>>            }
>>>>>>
>>>>>>
>>>>>>           -/**
>>>>>>
>>>>>>           - * amdgpu_ih_add_fault - Add a page fault record
>>>>>>
>>>>>>           - *
>>>>>>
>>>>>>           - * @adev: amdgpu device pointer
>>>>>>
>>>>>>           - * @key: 64-bit encoding of PASID and address
>>>>>>
>>>>>>           - *
>>>>>>
>>>>>>           - * This should be called when a retry page fault 
>>>>>> interrupt is
>>>>>>
>>>>>>           - * received. If this is a new page fault, it will be 
>>>>>> added to a hash
>>>>>>
>>>>>>           - * table. The return value indicates whether this is a 
>>>>>> new fault, or
>>>>>>
>>>>>>           - * a fault that was already known and is already being 
>>>>>> handled.
>>>>>>
>>>>>>           - *
>>>>>>
>>>>>>           - * If there are too many pending page faults, this 
>>>>>> will fail. Retry
>>>>>>
>>>>>>           - * interrupts should be ignored in this case until 
>>>>>> there is enough
>>>>>>
>>>>>>           - * free space.
>>>>>>
>>>>>>           - *
>>>>>>
>>>>>>           - * Returns 0 if the fault was added, 1 if the fault 
>>>>>> was already known,
>>>>>>
>>>>>>           - * -ENOSPC if there are too many pending faults.
>>>>>>
>>>>>>           - */
>>>>>>
>>>>>>           -int amdgpu_ih_add_fault(struct amdgpu_device *adev, 
>>>>>> u64
>>>>>> key)
>>>>>>
>>>>>>           -{
>>>>>>
>>>>>>           -  unsigned long flags;
>>>>>>
>>>>>>           -  int r = -ENOSPC;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>>>>
>>>>>>           -          /* Should be allocated in <IP>_ih_sw_init on 
>>>>>> GPUs that
>>>>>>
>>>>>>           -           * support retry faults and require retry 
>>>>>> filtering.
>>>>>>
>>>>>>           -           */
>>>>>>
>>>>>>           -          return r;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  /* Only let the hash table fill up to 50% for best 
>>>>>> performance */
>>>>>>
>>>>>>           -  if (adev->irq.ih.faults->count >= (1 <<
>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>
>>>>>>           -          goto unlock_out;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  r = chash_table_copy_in(&adev->irq.ih.faults->hash,
>>>>>> key, NULL);
>>>>>>
>>>>>>           -  if (!r)
>>>>>>
>>>>>>           -          adev->irq.ih.faults->count++;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  /* chash_table_copy_in should never fail unless 
>>>>>> we're losing count */
>>>>>>
>>>>>>           -  WARN_ON_ONCE(r < 0);
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -unlock_out:
>>>>>>
>>>>>>           - spin_unlock_irqrestore(&adev->irq.ih.faults->lock, 
>>>>>> flags);
>>>>>>
>>>>>>           -  return r;
>>>>>>
>>>>>>           -}
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -/**
>>>>>>
>>>>>>           - * amdgpu_ih_clear_fault - Remove a page fault record
>>>>>>
>>>>>>           - *
>>>>>>
>>>>>>           - * @adev: amdgpu device pointer
>>>>>>
>>>>>>           - * @key: 64-bit encoding of PASID and address
>>>>>>
>>>>>>           - *
>>>>>>
>>>>>>           - * This should be called when a page fault has been 
>>>>>> handled. Any
>>>>>>
>>>>>>           - * future interrupt with this key will be processed as 
>>>>>> a new
>>>>>>
>>>>>>           - * page fault.
>>>>>>
>>>>>>           - */
>>>>>>
>>>>>>           -void amdgpu_ih_clear_fault(struct amdgpu_device *adev,
>>>>>> u64 key)
>>>>>>
>>>>>>           -{
>>>>>>
>>>>>>           -  unsigned long flags;
>>>>>>
>>>>>>           -  int r;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  if (!adev->irq.ih.faults)
>>>>>>
>>>>>>           -          return;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  r = chash_table_remove(&adev->irq.ih.faults->hash,
>>>>>> key, NULL);
>>>>>>
>>>>>>           -  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>
>>>>>>           -          adev->irq.ih.faults->count--;
>>>>>>
>>>>>>           - WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>>>>
>>>>>>           -  }
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           - spin_unlock_irqrestore(&adev->irq.ih.faults->lock, 
>>>>>> flags);
>>>>>>
>>>>>>           -}
>>>>>>
>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>
>>>>>>           index a23e1c0..f411ffb 100644
>>>>>>
>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>
>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>
>>>>>>           @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>>>>
>>>>>>            #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>>>>
>>>>>>            #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>>>>
>>>>>>
>>>>>>           -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>
>>>>>>           -struct amdgpu_retryfault_hashtable {
>>>>>>
>>>>>>           -  DECLARE_CHASH_TABLE(hash, 
>>>>>> AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>
>>>>>>           -  spinlock_t     lock;
>>>>>>
>>>>>>           -  int            count;
>>>>>>
>>>>>>           -};
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>            /*
>>>>>>
>>>>>>             * R6xx+ IH ring
>>>>>>
>>>>>>             */
>>>>>>
>>>>>>           @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>>>>
>>>>>>              bool                   use_doorbell;
>>>>>>
>>>>>>              bool                   use_bus_addr;
>>>>>>
>>>>>>              dma_addr_t             rb_dma_addr; /* only used 
>>>>>> when use_bus_addr = true */
>>>>>>
>>>>>>           -  struct amdgpu_retryfault_hashtable *faults;
>>>>>>
>>>>>>            };
>>>>>>
>>>>>>
>>>>>>            #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>>>>
>>>>>>           @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct 
>>>>>> amdgpu_device *adev, unsigned ring_size,
>>>>>>
>>>>>>                              bool use_bus_addr);
>>>>>>
>>>>>>            void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>>>>>
>>>>>>            int amdgpu_ih_process(struct amdgpu_device *adev);
>>>>>>
>>>>>>           -int amdgpu_ih_add_fault(struct amdgpu_device *adev, 
>>>>>> u64 key);
>>>>>>
>>>>>>           -void amdgpu_ih_clear_fault(struct amdgpu_device *adev,
>>>>>> u64 key);
>>>>>>
>>>>>>
>>>>>>            #endif
>>>>>>
>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>
>>>>>>           index 1d7e3c1..8b220e9 100644
>>>>>>
>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>
>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>
>>>>>>           @@ -2692,6 +2692,22 @@ void 
>>>>>> amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t 
>>>>>> min_vm_size,
>>>>>>
>>>>>>                       adev->vm_manager.fragment_size);
>>>>>>
>>>>>>            }
>>>>>>
>>>>>>
>>>>>>           +static struct amdgpu_retryfault_hashtable
>>>>>> *init_fault_hash(void)
>>>>>>
>>>>>>           +{
>>>>>>
>>>>>>           +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  fault_hash = kmalloc(sizeof(*fault_hash), 
>>>>>> GFP_KERNEL);
>>>>>>
>>>>>>           +  if (!fault_hash)
>>>>>>
>>>>>>           +          return fault_hash;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  INIT_CHASH_TABLE(fault_hash->hash,
>>>>>>
>>>>>>           +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>
>>>>>>           +  spin_lock_init(&fault_hash->lock);
>>>>>>
>>>>>>           +  fault_hash->count = 0;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  return fault_hash;
>>>>>>
>>>>>>           +}
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>            /**
>>>>>>
>>>>>>             * amdgpu_vm_init - initialize a vm instance
>>>>>>
>>>>>>             *
>>>>>>
>>>>>>           @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct 
>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>>>
>>>>>>                      vm->pasid = pasid;
>>>>>>
>>>>>>              }
>>>>>>
>>>>>>
>>>>>>           +  vm->fault_hash = init_fault_hash();
>>>>>>
>>>>>>           +  if (!vm->fault_hash) {
>>>>>>
>>>>>>           +          r = -ENOMEM;
>>>>>>
>>>>>>           +          goto error_free_root;
>>>>>>
>>>>>>           +  }
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>              INIT_KFIFO(vm->faults);
>>>>>>
>>>>>>              vm->fault_credit = 16;
>>>>>>
>>>>>>
>>>>>>           @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct 
>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>
>>>>>>
>>>>>>              /* Clear pending page faults from IH when the VM is 
>>>>>> destroyed */
>>>>>>
>>>>>>              while (kfifo_get(&vm->faults, &fault))
>>>>>>
>>>>>>           -          amdgpu_ih_clear_fault(adev, fault);
>>>>>>
>>>>>>           +          amdgpu_vm_clear_fault(vm->fault_hash, 
>>>>>> fault);
>>>>>>
>>>>>>
>>>>>>              if (vm->pasid) {
>>>>>>
>>>>>>                      unsigned long flags;
>>>>>>
>>>>>>           @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct 
>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>
>>>>>> spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>>>>
>>>>>>              }
>>>>>>
>>>>>>
>>>>>>           +  kfree(vm->fault_hash);
>>>>>>
>>>>>>           +  vm->fault_hash = NULL;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>              drm_sched_entity_destroy(&vm->entity);
>>>>>>
>>>>>>
>>>>>>              if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>>>>
>>>>>>           @@ -3183,3 +3208,78 @@ void 
>>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>>>>
>>>>>>                      }
>>>>>>
>>>>>>              }
>>>>>>
>>>>>>            }
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +/**
>>>>>>
>>>>>>           + * amdgpu_vm_add_fault - Add a page fault record to 
>>>>>> fault hash table
>>>>>>
>>>>>>           + *
>>>>>>
>>>>>>           + * @fault_hash: fault hash table
>>>>>>
>>>>>>           + * @key: 64-bit encoding of PASID and address
>>>>>>
>>>>>>           + *
>>>>>>
>>>>>>           + * This should be called when a retry page fault 
>>>>>> interrupt is
>>>>>>
>>>>>>           + * received. If this is a new page fault, it will be 
>>>>>> added to a hash
>>>>>>
>>>>>>           + * table. The return value indicates whether this is a 
>>>>>> new fault, or
>>>>>>
>>>>>>           + * a fault that was already known and is already being 
>>>>>> handled.
>>>>>>
>>>>>>           + *
>>>>>>
>>>>>>           + * If there are too many pending page faults, this 
>>>>>> will fail. Retry
>>>>>>
>>>>>>           + * interrupts should be ignored in this case until 
>>>>>> there is enough
>>>>>>
>>>>>>           + * free space.
>>>>>>
>>>>>>           + *
>>>>>>
>>>>>>           + * Returns 0 if the fault was added, 1 if the fault 
>>>>>> was already known,
>>>>>>
>>>>>>           + * -ENOSPC if there are too many pending faults.
>>>>>>
>>>>>>           + */
>>>>>>
>>>>>>           +int amdgpu_vm_add_fault(struct 
>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>
>>>>>>           +{
>>>>>>
>>>>>>           +  unsigned long flags;
>>>>>>
>>>>>>           +  int r = -ENOSPC;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  if (WARN_ON_ONCE(!fault_hash))
>>>>>>
>>>>>>           +          /* Should be allocated in amdgpu_vm_init
>>>>>>
>>>>>>           +           */
>>>>>>
>>>>>>           +          return r;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  /* Only let the hash table fill up to 50% for best 
>>>>>> performance */
>>>>>>
>>>>>>           +  if (fault_hash->count >= (1 <<
>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>
>>>>>>           +          goto unlock_out;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  r = chash_table_copy_in(&fault_hash->hash, key, 
>>>>>> NULL);
>>>>>>
>>>>>>           +  if (!r)
>>>>>>
>>>>>>           +          fault_hash->count++;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  /* chash_table_copy_in should never fail unless 
>>>>>> we're losing count */
>>>>>>
>>>>>>           +  WARN_ON_ONCE(r < 0);
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +unlock_out:
>>>>>>
>>>>>>           +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>
>>>>>>           +  return r;
>>>>>>
>>>>>>           +}
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +/**
>>>>>>
>>>>>>           + * amdgpu_vm_clear_fault - Remove a page fault record
>>>>>>
>>>>>>           + *
>>>>>>
>>>>>>           + * @fault_hash: fault hash table
>>>>>>
>>>>>>           + * @key: 64-bit encoding of PASID and address
>>>>>>
>>>>>>           + *
>>>>>>
>>>>>>           + * This should be called when a page fault has been 
>>>>>> handled. Any
>>>>>>
>>>>>>           + * future interrupt with this key will be processed as 
>>>>>> a new
>>>>>>
>>>>>>           + * page fault.
>>>>>>
>>>>>>           + */
>>>>>>
>>>>>>           +void amdgpu_vm_clear_fault(struct 
>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>
>>>>>>           +{
>>>>>>
>>>>>>           +  unsigned long flags;
>>>>>>
>>>>>>           +  int r;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  if (!fault_hash)
>>>>>>
>>>>>>           +          return;
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  r = chash_table_remove(&fault_hash->hash, key, 
>>>>>> NULL);
>>>>>>
>>>>>>           +  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>
>>>>>>           +          fault_hash->count--;
>>>>>>
>>>>>>           +          WARN_ON_ONCE(fault_hash->count < 0);
>>>>>>
>>>>>>           +  }
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>
>>>>>>           +}
>>>>>>
>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>
>>>>>>           index e275ee7..6eb1da1 100644
>>>>>>
>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>
>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>
>>>>>>           @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>>>>
>>>>>>              pid_t   tgid;
>>>>>>
>>>>>>            };
>>>>>>
>>>>>>
>>>>>>           +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>
>>>>>>           +struct amdgpu_retryfault_hashtable {
>>>>>>
>>>>>>           +  DECLARE_CHASH_TABLE(hash, 
>>>>>> AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>
>>>>>>           +  spinlock_t     lock;
>>>>>>
>>>>>>           +  int            count;
>>>>>>
>>>>>>           +};
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>            struct amdgpu_vm {
>>>>>>
>>>>>>              /* tree of virtual addresses mapped */
>>>>>>
>>>>>>              struct rb_root_cached  va;
>>>>>>
>>>>>>           @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>>>>
>>>>>>              struct ttm_lru_bulk_move lru_bulk_move;
>>>>>>
>>>>>>              /* mark whether can do the bulk move */
>>>>>>
>>>>>>              bool                   bulk_moveable;
>>>>>>
>>>>>>           +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>
>>>>>>            };
>>>>>>
>>>>>>
>>>>>>            struct amdgpu_vm_manager {
>>>>>>
>>>>>>           @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct 
>>>>>> amdgpu_vm *vm);
>>>>>>
>>>>>>            void amdgpu_vm_move_to_lru_tail(struct amdgpu_device 
>>>>>> *adev,
>>>>>>
>>>>>>                                     struct amdgpu_vm *vm);
>>>>>>
>>>>>>
>>>>>>           +int amdgpu_vm_add_fault(struct 
>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +void amdgpu_vm_clear_fault(struct 
>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>            #endif
>>>>>>
>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>
>>>>>>           index 5ae5ed2..acbe5a7 100644
>>>>>>
>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>
>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>
>>>>>>           @@ -265,35 +265,36 @@ static bool 
>>>>>> vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>>>>
>>>>>>                      return true;
>>>>>>
>>>>>>              }
>>>>>>
>>>>>>
>>>>>>           -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>
>>>>>>           -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>
>>>>>>           -  r = amdgpu_ih_add_fault(adev, key);
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>           -  /* Hash table is full or the fault is already being 
>>>>>> processed,
>>>>>>
>>>>>>           -   * ignore further page faults
>>>>>>
>>>>>>           -   */
>>>>>>
>>>>>>           -  if (r != 0)
>>>>>>
>>>>>>           -          goto ignore_iv;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>              /* Track retry faults in per-VM fault FIFO. */
>>>>>>
>>>>>>              spin_lock(&adev->vm_manager.pasid_lock);
>>>>>>
>>>>>>              vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>>>>
>>>>>>           +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>
>>>>>>           +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>
>>>>>>              if (!vm) {
>>>>>>
>>>>>>                      /* VM not found, process it normally */
>>>>>>
>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>
>>>>>>           -          amdgpu_ih_clear_fault(adev, key);
>>>>>>
>>>>>>                      return true;
>>>>>>
>>>>>>           +  } else {
>>>>>>
>>>>>>           +          r = amdgpu_vm_add_fault(vm->fault_hash, 
>>>>>> key);
>>>>>>
>>>>>>           +
>>>>>>
>>>>>>           +          /* Hash table is full or the fault is 
>>>>>> already being processed,
>>>>>>
>>>>>>           +           * ignore further page faults
>>>>>>
>>>>>>           +           */
>>>>>>
>>>>>>           +          if (r != 0) {
>>>>>>
>>>>>>           + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>
>>>>>>           +                  goto ignore_iv;
>>>>>>
>>>>>>           +          }
>>>>>>
>>>>>>              }
>>>>>>
>>>>>>              /* No locking required with single writer and single 
>>>>>> reader */
>>>>>>
>>>>>>              r = kfifo_put(&vm->faults, key);
>>>>>>
>>>>>>              if (!r) {
>>>>>>
>>>>>>                      /* FIFO is full. Ignore it until there is 
>>>>>> space */
>>>>>>
>>>>>>           +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>>>>>>
>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>
>>>>>>           -          amdgpu_ih_clear_fault(adev, key);
>>>>>>
>>>>>>                      goto ignore_iv;
>>>>>>
>>>>>>              }
>>>>>>
>>>>>>           - spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>
>>>>>>
>>>>>>           + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>
>>>>>>              /* It's the first fault for this address, process it 
>>>>>> normally */
>>>>>>
>>>>>>              return true;
>>>>>>
>>>>>>
>>>>>>           @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void
>>>>>> *handle)
>>>>>>
>>>>>>              adev->irq.ih.use_doorbell = true;
>>>>>>
>>>>>>              adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH 
>>>>>> << 1;
>>>>>>
>>>>>>
>>>>>>           -  adev->irq.ih.faults = 
>>>>>> kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>>>>
>>>>>>           -  if (!adev->irq.ih.faults)
>>>>>>
>>>>>>           -          return -ENOMEM;
>>>>>>
>>>>>>           - INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>>>>
>>>>>>           -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>
>>>>>>           - spin_lock_init(&adev->irq.ih.faults->lock);
>>>>>>
>>>>>>           -  adev->irq.ih.faults->count = 0;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>              r = amdgpu_irq_init(adev);
>>>>>>
>>>>>>
>>>>>>              return r;
>>>>>>
>>>>>>           @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void
>>>>>> *handle)
>>>>>>
>>>>>>              amdgpu_irq_fini(adev);
>>>>>>
>>>>>>              amdgpu_ih_ring_fini(adev);
>>>>>>
>>>>>>
>>>>>>           -  kfree(adev->irq.ih.faults);
>>>>>>
>>>>>>           -  adev->irq.ih.faults = NULL;
>>>>>>
>>>>>>           -
>>>>>>
>>>>>>              return 0;
>>>>>>
>>>>>>            }
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>       _______________________________________________
>>>>>>
>>>>>>       amd-gfx mailing list
>>>>>>
>>>>>>       amd-gfx@lists.freedesktop.org 
>>>>>> <mailto:amd-gfx@lists.freedesktop.org>
>>>>>>
>>>>>>       https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>
>>>>>>
>>>>>
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                                         ` <DM5PR12MB1660F46176F685755F5BC48880040-2J9CzHegvk9mP71xM+BsFwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2018-09-11 14:23                                           ` Christian König
       [not found]                                             ` <375d7ba0-1da4-b5b3-6eb9-0d947290ee74-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Christian König @ 2018-09-11 14:23 UTC (permalink / raw)
  To: Zeng, Oak, Koenig, Christian, Kuehling, Felix, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

> [Oak]: Step 5, We also mark the corresponding entry of parent page table of the evicted page table to invalid.
Exactly that is what won't work. See we can't mark the PDE invalid 
because we still have some engines which can't deal with retry faults.

> [Oak] How do you block it?
Just waiting for all execution to finish.

Regards,
Christian.

Am 11.09.2018 um 16:09 schrieb Zeng, Oak:
> Hi Christian,
>
> I am not sure I understand exactly your thought but see 2 comments inline
>
> Thanks,
> Oak
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Tuesday, September 11, 2018 3:19 AM
> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org; Yang, Philip <Philip.Yang@amd.com>
> Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
>
> Hi Felix,
>
> let me try to explain the problem on an example:
>
> 1. We have a running job which needs recoverable page faults for accessing the process address space.
> 2. We run into memory pressure on VRAM and start to evict things.
> 3. A page tables of the running job is picked up for eviction.
> 4. We schedule a copy command to move the content of the page table to system memory.
> 5. The new system memory location of the page table is noted in its BO.
> 6. We get a fault from the job and swap in the page from the process address space.
> 7. Now we need to enter the new page address into the page table -> *BAM*
>
> The problem is now that we don't know the address of the page table because the current location was replaced with the future location in step #5.
> [Oak]: Step 5, We also mark the corresponding entry of the parent page table of the evicted page table as invalid. This invalid bit will cause the utcl2 walker to report a page fault. Depending on implementation, for the evicted page table, its amdgpu_vm_pt.amdgpu_vm_bo_base will point to a new location in system memory (amdgpu_vm_pt itself is in system memory). In the page fault handler, we allocate a *new* physical page in vram (through ttm), swap in the page content from system memory (using sdma) to this newly allocated page, modify the parent page table's corresponding entry to point to this *newly allocated* page and set the valid bit to 1.
>
> If all the swap-in, swap-out work is done by sdma, we don't need to worry about the case that swap-in happens earlier than swap-out - gpu scheduler will take care the sequence.
>
> If ttm can't turn around for the new page allocation, it can optionally trigger things like OOM. This should be something that need to be considered at ttm physical memory management layer.
>
>
>
> We could now argue that we could update the page tables on the fly for the evicted page and never wait for the current job to finish, but this won't work because not all engines can handle that.
>
> I will circumvent this problem for now by blocking the eviction before step #5.
> [Oak] How do you block it? Pin the memory? I agree that swapping out page table should be the last resort during a memory pressure handling.
>
> The issue with that is that this pipelining is responsible for nearly 20% of the fps in some testcases.
>
> I hope that it won't be that bad when I limit disabling the pipelining to only page tables, but it is certainly possible that we will get pushback on this.
>
> If that happens we probably need to add some flags to limit this workaround even more to only the root PD and all the PDs/PTs which are involved in recoverable faults.
>
> Regards,
> Christian.
>
> Am 10.09.2018 um 21:10 schrieb Felix Kuehling:
>> I'm not sure why you need to distinguish current and future state when
>> dealing with page faults. When you get a page fault, you know that the
>> GPU is trying to access memory right now, in the present. So you're
>> always working with the current state. When the CPU page table
>> changes, you get an MMU notifier that allows you to invalidate the
>> corresponding GPU PTEs. If you have a valid GPU PTE, it always
>> represents the current states and is in sync with the CPU page table.
>> If the GPU page table is ever outdated, it should have an invalid entry in that place.
>>
>> If SDMA is used to update the GPU PTEs, there is a delay. The MMU
>> notifier is synchronous, so it shouldn't be a problem. You just wait
>> for the SDMA job to complete before returning. When updating PTEs with
>> new valid addresses, it's asynchronous. But the GPU will continue
>> retrying on the invalid entry until SDMA finishes. So it's also
>> implicitly synchronized on the GPU side.
>>
>> Regards,
>>     Felix
>>
>>
>> On 2018-09-10 05:42 AM, Christian König wrote:
>>> Hi Felix & Oak,
>>>
>>> over the weekend I had the idea that we could just use the shadow BOs
>>> to have the current state in a page fault. They are GTT BOs and CPU
>>> accessible anyway.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 08.09.2018 um 09:34 schrieb Christian König:
>>>> Hi Felix,
>>>>
>>>>> But why do you want to update page tables when there is no more
>>>>> user mode context that cares about them? Is this just to allow
>>>>> pending work to complete after the user mode context is gone? To prevent hangs?
>>>> The problem I'm see is that we still don't have a reliable way of
>>>> killing a command submission while making sure that no more faults
>>>> of it are in the flight.
>>>>
>>>> E.g. the HWS has a function to kill a running job on the hardware,
>>>> but that only works for compute and there is no way of telling when
>>>> that operation has finished.
>>>>
>>>> But Oak has already convinced me that we should probably work on
>>>> that instead of trying to keep the VM alive as long as possible.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
>>>>> Hi Christian,
>>>>>
>>>>> I'm not sure I get your point here.
>>>>>
>>>>> As I understand it, the amdgpu_vm structure represents the user
>>>>> mode VM context. It has the pointers to the root page directory and
>>>>> the rest of the page table hierarchy. If there is no amdgpu_vm,
>>>>> there is no user mode context that cares about the page tables.
>>>>>
>>>>> If we leave the page table pointers in the amdgpu_vm, then handling
>>>>> faults after the VM is destroyed is pointless. We can't find the
>>>>> page tables and we can't update them, so there is nothing we can do
>>>>> in response to VM faults.
>>>>>
>>>>> So I'm guessing that you want to move the page table pointers into
>>>>> a different structure that exists half-independently of the VM. It
>>>>> would be created when the VM is created (that's where we currently
>>>>> allocate the PASID) but can be destroyed slightly later.
>>>>>
>>>>> But why do you want to update page tables when there is no more
>>>>> user mode context that cares about them? Is this just to allow
>>>>> pending work to complete after the user mode context is gone? To prevent hangs?
>>>>>
>>>>> Regards,
>>>>>      Felix
>>>>>
>>>>> On 2018-09-10 07:20 AM, Christian König wrote:
>>>>>>> Am I right that the handling of page fault need a valid VM?
>>>>>> No, exactly that view is incorrect.
>>>>>>
>>>>>> The VM is meant to be a memory management structure of page tables
>>>>>> and is completely pointless for fault processing because it represents
>>>>>> the future state of the page tables and not the current one.
>>>>>>
>>>>>> What we need for fault processing is a new structure built around
>>>>>> PASIDs which is fed with addresses when page tables are
>>>>>> moved around.
>>>>>>
>>>>>> Alternatively I hope that we can use the SDMA to walk the page
>>>>>> tables and update the required entries by just using the address.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>>>>>> Hi Christian,
>>>>>>>
>>>>>>>
>>>>>>> What is the value of delaying the destroy of the hash until after the
>>>>>>> vm is destroyed? Since the vm is destroyed, you basically don’t have
>>>>>>> enough information to page in the correct page to gpuvm. Am I right
>>>>>>> that the handling of a page fault needs a valid VM? For example, you
>>>>>>> need the VM to get the corresponding physical address of the faulty page.
>>>>>>>
>>>>>>>
>>>>>>> After the vm is destroyed, the retry interrupt can be directly
>>>>>>> discarded as you can’t find the vm through the pasid. You can think of
>>>>>>> this as also one kind of prescreening.
>>>>>>>
>>>>>>>
>>>>>>> So I stand by the point that there is no need to delay the
>>>>>>> destroy of the hash until after the vm is destroyed. The prescreening
>>>>>>> hash can be destroyed at the time of vm_fini.
>>>>>>>
>>>>>>>
>>>>>>> Thanks,
>>>>>>>
>>>>>>> Oak
>>>>>>>
>>>>>>>
>>>>>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>>>>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng
>>>>>>> <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org
>>>>>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>>>>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>>>>>>> amdgpu vm
>>>>>>>
>>>>>>>
>>>>>>> Hi Oak,
>>>>>>>
>>>>>>> yeah, but this is still a NAK. Making the hash per PASID was not
>>>>>>> a suggestion but rather a must have.
>>>>>>>
>>>>>>> The VM structures must be destroyed while the processing is still
>>>>>>> ongoing, or otherwise we would not have a clean OOM handling.
>>>>>>>
>>>>>>> The IDR for PASID lookup can be put into amdgpu_ids.c, you can
>>>>>>> just replace the IDA already used there with an IDR.
>>>>>>>
>>>>>>> Since the PASID is already freed up delayed we should have the
>>>>>>> grace period for interrupt processing included. If that still
>>>>>>> isn't sufficient we can still add some delayed work for this.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>>>>>
>>>>>>>        Hi Christian,
>>>>>>>
>>>>>>>        In this implementation, fault hash is made per vm, not per
>>>>>>> pasid
>>>>>>>        as suggested, based on below considerations:
>>>>>>>
>>>>>>>          * Delay the destroy of hash introduces more effort like
>>>>>>> how to
>>>>>>>            set the proper grace period after which no retry
>>>>>>> interrupt
>>>>>>>            will be generated. We want to avoid those complication
>>>>>>> if we
>>>>>>>            can.
>>>>>>>          * The  problem of the per vm implementation is that it is
>>>>>>> hard
>>>>>>>            to delay the destroy of fault hash, because when vm is
>>>>>>>            destroyed, prescreen function won't be able to retrieve
>>>>>>> the
>>>>>>>            fault hash. But in this case, the prescreen function
>>>>>>> can
>>>>>>>            either let the interrupt go through (to gmc) or ignore it.
>>>>>>>            From this perspective, it is not very necessary to
>>>>>>> delay the
>>>>>>>            destroy of hash table.
>>>>>>>          * On the other hand, since ASICs after vega10 have the HW
>>>>>>> CAM
>>>>>>>            feature. So the SW IV prescreen is only useful for vega10.
>>>>>>>            For all the HW implemented CAM, we can consider the
>>>>>>> delayed
>>>>>>>            CAM acknowledgment.
>>>>>>>          * Making it per pasid need to introduce another idr to
>>>>>>>            correlate pasid and hash table - where to put the idr?
>>>>>>> You
>>>>>>>            will have to make it a global variable which is not
>>>>>>> very
>>>>>>>            desirable.
>>>>>>>
>>>>>>>        The main purpose of this patch is to solve the
>>>>>>> inter-process lock
>>>>>>>        issue (when hash table is full), while still keep codes simple.
>>>>>>>
>>>>>>>        Also with this patch, the faults kfifo is not necessary any
>>>>>>> more.
>>>>>>>        See patch 2.
>>>>>>>
>>>>>>>        Regards,
>>>>>>>
>>>>>>>        Oak
>>>>>>>
>>>>>>>
>>>>>>>        On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>>>>>
>>>>>>>            In stead of share one fault hash table per device, make
>>>>>>> it
>>>>>>>
>>>>>>>            per vm. This can avoid inter-process lock issue when
>>>>>>> fault
>>>>>>>
>>>>>>>            hash table is full.
>>>>>>>
>>>>>>>
>>>>>>>            Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>>>>>
>>>>>>>            Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>>>>>>> <mailto:Oak.Zeng@amd.com>
>>>>>>>
>>>>>>>            Suggested-by: Christian Konig
>>>>>>> <Christian.Koenig@amd.com> <mailto:Christian.Koenig@amd.com>
>>>>>>>
>>>>>>>            Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>>>> <mailto:Felix.Kuehling@amd.com>
>>>>>>>
>>>>>>>            ---
>>>>>>>
>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75
>>>>>>> ------------------------
>>>>>>>
>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>>>>>
>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102
>>>>>>> ++++++++++++++++++++++++++++++++-
>>>>>>>
>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>>>>>
>>>>>>>             drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38
>>>>>>> +++++-------
>>>>>>>
>>>>>>>             5 files changed, 127 insertions(+), 110 deletions(-)
>>>>>>>
>>>>>>>
>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>
>>>>>>>            index 06373d4..4ed8621 100644
>>>>>>>
>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>
>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>
>>>>>>>            @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct
>>>>>>> amdgpu_device *adev)
>>>>>>>
>>>>>>>               return IRQ_HANDLED;
>>>>>>>
>>>>>>>             }
>>>>>>>
>>>>>>>
>>>>>>>            -/**
>>>>>>>
>>>>>>>            - * amdgpu_ih_add_fault - Add a page fault record
>>>>>>>
>>>>>>>            - *
>>>>>>>
>>>>>>>            - * @adev: amdgpu device pointer
>>>>>>>
>>>>>>>            - * @key: 64-bit encoding of PASID and address
>>>>>>>
>>>>>>>            - *
>>>>>>>
>>>>>>>            - * This should be called when a retry page fault
>>>>>>> interrupt is
>>>>>>>
>>>>>>>            - * received. If this is a new page fault, it will be
>>>>>>> added to a hash
>>>>>>>
>>>>>>>            - * table. The return value indicates whether this is a
>>>>>>> new fault, or
>>>>>>>
>>>>>>>            - * a fault that was already known and is already being
>>>>>>> handled.
>>>>>>>
>>>>>>>            - *
>>>>>>>
>>>>>>>            - * If there are too many pending page faults, this
>>>>>>> will fail. Retry
>>>>>>>
>>>>>>>            - * interrupts should be ignored in this case until
>>>>>>> there is enough
>>>>>>>
>>>>>>>            - * free space.
>>>>>>>
>>>>>>>            - *
>>>>>>>
>>>>>>>            - * Returns 0 if the fault was added, 1 if the fault
>>>>>>> was already known,
>>>>>>>
>>>>>>>            - * -ENOSPC if there are too many pending faults.
>>>>>>>
>>>>>>>            - */
>>>>>>>
>>>>>>>            -int amdgpu_ih_add_fault(struct amdgpu_device *adev,
>>>>>>> u64
>>>>>>> key)
>>>>>>>
>>>>>>>            -{
>>>>>>>
>>>>>>>            -  unsigned long flags;
>>>>>>>
>>>>>>>            -  int r = -ENOSPC;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>>>>>
>>>>>>>            -          /* Should be allocated in <IP>_ih_sw_init on
>>>>>>> GPUs that
>>>>>>>
>>>>>>>            -           * support retry faults and require retry
>>>>>>> filtering.
>>>>>>>
>>>>>>>            -           */
>>>>>>>
>>>>>>>            -          return r;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  /* Only let the hash table fill up to 50% for best
>>>>>>> performance */
>>>>>>>
>>>>>>>            -  if (adev->irq.ih.faults->count >= (1 <<
>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>
>>>>>>>            -          goto unlock_out;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  r = chash_table_copy_in(&adev->irq.ih.faults->hash,
>>>>>>> key, NULL);
>>>>>>>
>>>>>>>            -  if (!r)
>>>>>>>
>>>>>>>            -          adev->irq.ih.faults->count++;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  /* chash_table_copy_in should never fail unless
>>>>>>> we're losing count */
>>>>>>>
>>>>>>>            -  WARN_ON_ONCE(r < 0);
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -unlock_out:
>>>>>>>
>>>>>>>            - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>> flags);
>>>>>>>
>>>>>>>            -  return r;
>>>>>>>
>>>>>>>            -}
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -/**
>>>>>>>
>>>>>>>            - * amdgpu_ih_clear_fault - Remove a page fault record
>>>>>>>
>>>>>>>            - *
>>>>>>>
>>>>>>>            - * @adev: amdgpu device pointer
>>>>>>>
>>>>>>>            - * @key: 64-bit encoding of PASID and address
>>>>>>>
>>>>>>>            - *
>>>>>>>
>>>>>>>            - * This should be called when a page fault has been
>>>>>>> handled. Any
>>>>>>>
>>>>>>>            - * future interrupt with this key will be processed as
>>>>>>> a new
>>>>>>>
>>>>>>>            - * page fault.
>>>>>>>
>>>>>>>            - */
>>>>>>>
>>>>>>>            -void amdgpu_ih_clear_fault(struct amdgpu_device *adev,
>>>>>>> u64 key)
>>>>>>>
>>>>>>>            -{
>>>>>>>
>>>>>>>            -  unsigned long flags;
>>>>>>>
>>>>>>>            -  int r;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  if (!adev->irq.ih.faults)
>>>>>>>
>>>>>>>            -          return;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  r = chash_table_remove(&adev->irq.ih.faults->hash,
>>>>>>> key, NULL);
>>>>>>>
>>>>>>>            -  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>
>>>>>>>            -          adev->irq.ih.faults->count--;
>>>>>>>
>>>>>>>            - WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>>>>>
>>>>>>>            -  }
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>> flags);
>>>>>>>
>>>>>>>            -}
>>>>>>>
>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>
>>>>>>>            index a23e1c0..f411ffb 100644
>>>>>>>
>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>
>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>
>>>>>>>            @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>>>>>
>>>>>>>             #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>>>>>
>>>>>>>             #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>>>>>
>>>>>>>
>>>>>>>            -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>
>>>>>>>            -struct amdgpu_retryfault_hashtable {
>>>>>>>
>>>>>>>            -  DECLARE_CHASH_TABLE(hash,
>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>
>>>>>>>            -  spinlock_t     lock;
>>>>>>>
>>>>>>>            -  int            count;
>>>>>>>
>>>>>>>            -};
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>             /*
>>>>>>>
>>>>>>>              * R6xx+ IH ring
>>>>>>>
>>>>>>>              */
>>>>>>>
>>>>>>>            @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>>>>>
>>>>>>>               bool                   use_doorbell;
>>>>>>>
>>>>>>>               bool                   use_bus_addr;
>>>>>>>
>>>>>>>               dma_addr_t             rb_dma_addr; /* only used
>>>>>>> when use_bus_addr = true */
>>>>>>>
>>>>>>>            -  struct amdgpu_retryfault_hashtable *faults;
>>>>>>>
>>>>>>>             };
>>>>>>>
>>>>>>>
>>>>>>>             #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>>>>>
>>>>>>>            @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct
>>>>>>> amdgpu_device *adev, unsigned ring_size,
>>>>>>>
>>>>>>>                               bool use_bus_addr);
>>>>>>>
>>>>>>>             void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>>>>>>
>>>>>>>             int amdgpu_ih_process(struct amdgpu_device *adev);
>>>>>>>
>>>>>>>            -int amdgpu_ih_add_fault(struct amdgpu_device *adev,
>>>>>>> u64 key);
>>>>>>>
>>>>>>>            -void amdgpu_ih_clear_fault(struct amdgpu_device *adev,
>>>>>>> u64 key);
>>>>>>>
>>>>>>>
>>>>>>>             #endif
>>>>>>>
>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>
>>>>>>>            index 1d7e3c1..8b220e9 100644
>>>>>>>
>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>
>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>
>>>>>>>            @@ -2692,6 +2692,22 @@ void
>>>>>>> amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t
>>>>>>> min_vm_size,
>>>>>>>
>>>>>>>                        adev->vm_manager.fragment_size);
>>>>>>>
>>>>>>>             }
>>>>>>>
>>>>>>>
>>>>>>>            +static struct amdgpu_retryfault_hashtable
>>>>>>> *init_fault_hash(void)
>>>>>>>
>>>>>>>            +{
>>>>>>>
>>>>>>>            +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  fault_hash = kmalloc(sizeof(*fault_hash),
>>>>>>> GFP_KERNEL);
>>>>>>>
>>>>>>>            +  if (!fault_hash)
>>>>>>>
>>>>>>>            +          return fault_hash;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  INIT_CHASH_TABLE(fault_hash->hash,
>>>>>>>
>>>>>>>            +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>
>>>>>>>            +  spin_lock_init(&fault_hash->lock);
>>>>>>>
>>>>>>>            +  fault_hash->count = 0;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  return fault_hash;
>>>>>>>
>>>>>>>            +}
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>             /**
>>>>>>>
>>>>>>>              * amdgpu_vm_init - initialize a vm instance
>>>>>>>
>>>>>>>              *
>>>>>>>
>>>>>>>            @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct
>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>>>>
>>>>>>>                       vm->pasid = pasid;
>>>>>>>
>>>>>>>               }
>>>>>>>
>>>>>>>
>>>>>>>            +  vm->fault_hash = init_fault_hash();
>>>>>>>
>>>>>>>            +  if (!vm->fault_hash) {
>>>>>>>
>>>>>>>            +          r = -ENOMEM;
>>>>>>>
>>>>>>>            +          goto error_free_root;
>>>>>>>
>>>>>>>            +  }
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>               INIT_KFIFO(vm->faults);
>>>>>>>
>>>>>>>               vm->fault_credit = 16;
>>>>>>>
>>>>>>>
>>>>>>>            @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct
>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>
>>>>>>>
>>>>>>>               /* Clear pending page faults from IH when the VM is
>>>>>>> destroyed */
>>>>>>>
>>>>>>>               while (kfifo_get(&vm->faults, &fault))
>>>>>>>
>>>>>>>            -          amdgpu_ih_clear_fault(adev, fault);
>>>>>>>
>>>>>>>            +          amdgpu_vm_clear_fault(vm->fault_hash,
>>>>>>> fault);
>>>>>>>
>>>>>>>
>>>>>>>               if (vm->pasid) {
>>>>>>>
>>>>>>>                       unsigned long flags;
>>>>>>>
>>>>>>>            @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct
>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>
>>>>>>> spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>>>>>
>>>>>>>               }
>>>>>>>
>>>>>>>
>>>>>>>            +  kfree(vm->fault_hash);
>>>>>>>
>>>>>>>            +  vm->fault_hash = NULL;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>               drm_sched_entity_destroy(&vm->entity);
>>>>>>>
>>>>>>>
>>>>>>>               if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>>>>>
>>>>>>>            @@ -3183,3 +3208,78 @@ void
>>>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>>>>>
>>>>>>>                       }
>>>>>>>
>>>>>>>               }
>>>>>>>
>>>>>>>             }
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +/**
>>>>>>>
>>>>>>>            + * amdgpu_vm_add_fault - Add a page fault record to
>>>>>>> fault hash table
>>>>>>>
>>>>>>>            + *
>>>>>>>
>>>>>>>            + * @fault_hash: fault hash table
>>>>>>>
>>>>>>>            + * @key: 64-bit encoding of PASID and address
>>>>>>>
>>>>>>>            + *
>>>>>>>
>>>>>>>            + * This should be called when a retry page fault
>>>>>>> interrupt is
>>>>>>>
>>>>>>>            + * received. If this is a new page fault, it will be
>>>>>>> added to a hash
>>>>>>>
>>>>>>>            + * table. The return value indicates whether this is a
>>>>>>> new fault, or
>>>>>>>
>>>>>>>            + * a fault that was already known and is already being
>>>>>>> handled.
>>>>>>>
>>>>>>>            + *
>>>>>>>
>>>>>>>            + * If there are too many pending page faults, this
>>>>>>> will fail. Retry
>>>>>>>
>>>>>>>            + * interrupts should be ignored in this case until
>>>>>>> there is enough
>>>>>>>
>>>>>>>            + * free space.
>>>>>>>
>>>>>>>            + *
>>>>>>>
>>>>>>>            + * Returns 0 if the fault was added, 1 if the fault
>>>>>>> was already known,
>>>>>>>
>>>>>>>            + * -ENOSPC if there are too many pending faults.
>>>>>>>
>>>>>>>            + */
>>>>>>>
>>>>>>>            +int amdgpu_vm_add_fault(struct
>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>
>>>>>>>            +{
>>>>>>>
>>>>>>>            +  unsigned long flags;
>>>>>>>
>>>>>>>            +  int r = -ENOSPC;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  if (WARN_ON_ONCE(!fault_hash))
>>>>>>>
>>>>>>>            +          /* Should be allocated in amdgpu_vm_init
>>>>>>>
>>>>>>>            +           */
>>>>>>>
>>>>>>>            +          return r;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  /* Only let the hash table fill up to 50% for best
>>>>>>> performance */
>>>>>>>
>>>>>>>            +  if (fault_hash->count >= (1 <<
>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>
>>>>>>>            +          goto unlock_out;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  r = chash_table_copy_in(&fault_hash->hash, key,
>>>>>>> NULL);
>>>>>>>
>>>>>>>            +  if (!r)
>>>>>>>
>>>>>>>            +          fault_hash->count++;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  /* chash_table_copy_in should never fail unless
>>>>>>> we're losing count */
>>>>>>>
>>>>>>>            +  WARN_ON_ONCE(r < 0);
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +unlock_out:
>>>>>>>
>>>>>>>            +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>
>>>>>>>            +  return r;
>>>>>>>
>>>>>>>            +}
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +/**
>>>>>>>
>>>>>>>            + * amdgpu_vm_clear_fault - Remove a page fault record
>>>>>>>
>>>>>>>            + *
>>>>>>>
>>>>>>>            + * @fault_hash: fault hash table
>>>>>>>
>>>>>>>            + * @key: 64-bit encoding of PASID and address
>>>>>>>
>>>>>>>            + *
>>>>>>>
>>>>>>>            + * This should be called when a page fault has been
>>>>>>> handled. Any
>>>>>>>
>>>>>>>            + * future interrupt with this key will be processed as
>>>>>>> a new
>>>>>>>
>>>>>>>            + * page fault.
>>>>>>>
>>>>>>>            + */
>>>>>>>
>>>>>>>            +void amdgpu_vm_clear_fault(struct
>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>
>>>>>>>            +{
>>>>>>>
>>>>>>>            +  unsigned long flags;
>>>>>>>
>>>>>>>            +  int r;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  if (!fault_hash)
>>>>>>>
>>>>>>>            +          return;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  r = chash_table_remove(&fault_hash->hash, key,
>>>>>>> NULL);
>>>>>>>
>>>>>>>            +  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>
>>>>>>>            +          fault_hash->count--;
>>>>>>>
>>>>>>>            +          WARN_ON_ONCE(fault_hash->count < 0);
>>>>>>>
>>>>>>>            +  }
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>
>>>>>>>            +}
>>>>>>>
>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>
>>>>>>>            index e275ee7..6eb1da1 100644
>>>>>>>
>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>
>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>
>>>>>>>            @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>>>>>
>>>>>>>               pid_t   tgid;
>>>>>>>
>>>>>>>             };
>>>>>>>
>>>>>>>
>>>>>>>            +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>
>>>>>>>            +struct amdgpu_retryfault_hashtable {
>>>>>>>
>>>>>>>            +  DECLARE_CHASH_TABLE(hash,
>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>
>>>>>>>            +  spinlock_t     lock;
>>>>>>>
>>>>>>>            +  int            count;
>>>>>>>
>>>>>>>            +};
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>             struct amdgpu_vm {
>>>>>>>
>>>>>>>               /* tree of virtual addresses mapped */
>>>>>>>
>>>>>>>               struct rb_root_cached  va;
>>>>>>>
>>>>>>>            @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>>>>>
>>>>>>>               struct ttm_lru_bulk_move lru_bulk_move;
>>>>>>>
>>>>>>>               /* mark whether can do the bulk move */
>>>>>>>
>>>>>>>               bool                   bulk_moveable;
>>>>>>>
>>>>>>>            +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>
>>>>>>>             };
>>>>>>>
>>>>>>>
>>>>>>>             struct amdgpu_vm_manager {
>>>>>>>
>>>>>>>            @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct
>>>>>>> amdgpu_vm *vm);
>>>>>>>
>>>>>>>             void amdgpu_vm_move_to_lru_tail(struct amdgpu_device
>>>>>>> *adev,
>>>>>>>
>>>>>>>                                      struct amdgpu_vm *vm);
>>>>>>>
>>>>>>>
>>>>>>>            +int amdgpu_vm_add_fault(struct
>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +void amdgpu_vm_clear_fault(struct
>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>             #endif
>>>>>>>
>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>
>>>>>>>            index 5ae5ed2..acbe5a7 100644
>>>>>>>
>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>
>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>
>>>>>>>            @@ -265,35 +265,36 @@ static bool
>>>>>>> vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>>>>>
>>>>>>>                       return true;
>>>>>>>
>>>>>>>               }
>>>>>>>
>>>>>>>
>>>>>>>            -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>
>>>>>>>            -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>
>>>>>>>            -  r = amdgpu_ih_add_fault(adev, key);
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  /* Hash table is full or the fault is already being
>>>>>>> processed,
>>>>>>>
>>>>>>>            -   * ignore further page faults
>>>>>>>
>>>>>>>            -   */
>>>>>>>
>>>>>>>            -  if (r != 0)
>>>>>>>
>>>>>>>            -          goto ignore_iv;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>               /* Track retry faults in per-VM fault FIFO. */
>>>>>>>
>>>>>>>               spin_lock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>               vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>>>>>
>>>>>>>            +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>
>>>>>>>            +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>
>>>>>>>               if (!vm) {
>>>>>>>
>>>>>>>                       /* VM not found, process it normally */
>>>>>>>
>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>            -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>
>>>>>>>                       return true;
>>>>>>>
>>>>>>>            +  } else {
>>>>>>>
>>>>>>>            +          r = amdgpu_vm_add_fault(vm->fault_hash,
>>>>>>> key);
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +          /* Hash table is full or the fault is
>>>>>>> already being processed,
>>>>>>>
>>>>>>>            +           * ignore further page faults
>>>>>>>
>>>>>>>            +           */
>>>>>>>
>>>>>>>            +          if (r != 0) {
>>>>>>>
>>>>>>>            + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>            +                  goto ignore_iv;
>>>>>>>
>>>>>>>            +          }
>>>>>>>
>>>>>>>               }
>>>>>>>
>>>>>>>               /* No locking required with single writer and single
>>>>>>> reader */
>>>>>>>
>>>>>>>               r = kfifo_put(&vm->faults, key);
>>>>>>>
>>>>>>>               if (!r) {
>>>>>>>
>>>>>>>                       /* FIFO is full. Ignore it until there is
>>>>>>> space */
>>>>>>>
>>>>>>>            +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>>>>>>>
>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>            -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>
>>>>>>>                       goto ignore_iv;
>>>>>>>
>>>>>>>               }
>>>>>>>
>>>>>>>            - spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>
>>>>>>>            + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>               /* It's the first fault for this address, process it
>>>>>>> normally */
>>>>>>>
>>>>>>>               return true;
>>>>>>>
>>>>>>>
>>>>>>>            @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void
>>>>>>> *handle)
>>>>>>>
>>>>>>>               adev->irq.ih.use_doorbell = true;
>>>>>>>
>>>>>>>               adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH
>>>>>>> << 1;
>>>>>>>
>>>>>>>
>>>>>>>            -  adev->irq.ih.faults =
>>>>>>> kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>>>>>
>>>>>>>            -  if (!adev->irq.ih.faults)
>>>>>>>
>>>>>>>            -          return -ENOMEM;
>>>>>>>
>>>>>>>            - INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>>>>>
>>>>>>>            -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>
>>>>>>>            - spin_lock_init(&adev->irq.ih.faults->lock);
>>>>>>>
>>>>>>>            -  adev->irq.ih.faults->count = 0;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>               r = amdgpu_irq_init(adev);
>>>>>>>
>>>>>>>
>>>>>>>               return r;
>>>>>>>
>>>>>>>            @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void
>>>>>>> *handle)
>>>>>>>
>>>>>>>               amdgpu_irq_fini(adev);
>>>>>>>
>>>>>>>               amdgpu_ih_ring_fini(adev);
>>>>>>>
>>>>>>>
>>>>>>>            -  kfree(adev->irq.ih.faults);
>>>>>>>
>>>>>>>            -  adev->irq.ih.faults = NULL;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>               return 0;
>>>>>>>
>>>>>>>             }
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>        _______________________________________________
>>>>>>>
>>>>>>>        amd-gfx mailing list
>>>>>>>
>>>>>>>        amd-gfx@lists.freedesktop.org
>>>>>>> <mailto:amd-gfx@lists.freedesktop.org>
>>>>>>>
>>>>>>>        https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>
>>>>>>>
>>>>>> _______________________________________________
>>>>>> amd-gfx mailing list
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                                             ` <375d7ba0-1da4-b5b3-6eb9-0d947290ee74-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2018-09-11 15:02                                               ` Zeng, Oak
       [not found]                                                 ` <DM5PR12MB16608D6DAB8D9BFAAE348C4780040-2J9CzHegvk9mP71xM+BsFwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Zeng, Oak @ 2018-09-11 15:02 UTC (permalink / raw)
  To: Koenig, Christian, Kuehling, Felix, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

>Exactly that is what won't work. See we can't mark the PDE invalid because we still have some engines which can't deal with retry faults.

By engines do you mean UTCL2 client? If yes, I am not aware of that. If any UTCL2 client doesn't support translation retry, driver will need to make sure all the current PDE/PTE entries are valid. In other words, it should be set up before it is used by any job and can't be swapped out during memory pressure. How can it work otherwise?

Thanks,
Oak

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Tuesday, September 11, 2018 10:23 AM
To: Zeng, Oak <Oak.Zeng@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Oak Zeng <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org; Yang, Philip <Philip.Yang@amd.com>
Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm

> [Oak]: Step 5, We also mark the corresponding entry of parent page table of the evicted page table to invalid.
Exactly that is what won't work. See we can't mark the PDE invalid because we still have some engines which can't deal with retry faults.

> [Oak] How do you block it?
Just waiting for all execution to finish.

Regards,
Christian.

Am 11.09.2018 um 16:09 schrieb Zeng, Oak:
> Hi Christian,
>
> I am not sure I understand exactly your thought but see 2 comments 
> inline
>
> Thanks,
> Oak
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Tuesday, September 11, 2018 3:19 AM
> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Koenig, Christian 
> <Christian.Koenig@amd.com>; Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng 
> <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org; Yang, Philip 
> <Philip.Yang@amd.com>
> Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu 
> vm
>
> Hi Felix,
>
> let me try to explain the problem on an example:
>
> 1. We have a running job which needs recoverable page faults for accessing the process address space.
> 2. We run into memory pressure on VRAM and start to evict things.
> 3. A page tables of the running job is picked up for eviction.
> 4. We schedule a copy command to move the content of the page table to system memory.
> 5. The new system memory location of the page table is noted in its BO.
> 6. We get a fault from the job and swap in the page from the process address space.
> 7. Now we need to enter the new page address into the page table -> 
> *BAM*
>
> The problem is now that we don't know the address of the page table because the current location was replaced with the future location in step #5.
> [Oak]: Step 5, We also mark the corresponding entry of parent page table of the evicted page table to invalid. This invalid bit will cause the utcl2 walker to report a page fault. Depending on implementation, for the evicted page table, its' amdgpu_vm_pt.amdgpu_vm_bo_base will point to a new location in system memory (amdgpu_vm_pt itself is in system memory). In the page fault handler, we allocate a *new* physical page in vram (through ttm), swap in the page content from system memory (using sdma) to this newly allocated page, modify parent page table's corresponding entry to point to this *new allocated* page and set valid bit to 1.
>
> If all the swap-in, swap-out work is done by sdma, we don't need to worry about the case that swap-in happens earlier than swap-out - gpu scheduler will take care the sequence.
>
> If ttm can't turn around for the new page allocation, it can optionally trigger things like OOM. This should be something that need to be considered at ttm physical memory management layer.
>
>
>
> We could now argue that we could update the page tables on the fly for the evicted page and never wait for the current job to finish, but this won't work because not all engines can handle that.
>
> I will circumvent this problem for now by blocking the eviction before step #5.
> [Oak] How do you block it? Pin the memory? I agree that swapping out page table should be the last resort during a memory pressure handling.
>
> The issue with that is that this pipelining is responsible for nearly 20% of the fps in some testcases.
>
> I hope that it won't be that bad when I limit disabling the pipelining to only page tables, but it is certainly possible that we will get pushback on this.
>
> If that happens we probably need to add some flags to limit this workaround even more to only the root PD and all the PDs/PTs which are involved in recoverable faults.
>
> Regards,
> Christian.
>
> Am 10.09.2018 um 21:10 schrieb Felix Kuehling:
>> I'm not sure why you need to distinguish current and future state 
>> when dealing with page faults. When you get a page fault, you know 
>> that the GPU is trying to access memory right now, in the present. So 
>> you're always working with the current state. When the CPU page table 
>> changes, you get an MMU notifier that allows you to invalidate the 
>> corresponding GPU PTEs. If you have a valid GPU PTE, it always 
>> represents the current states and is in sync with the CPU page table.
>> If the GPU page table is ever outdated, it should have an invalid entry in that place.
>>
>> If SDMA is used to update the GPU PTEs, there is a delay. The MMU 
>> notifier is synchronous, so it shouldn't be a problem. You just wait 
>> for the SDMA job to complete before returning. When updating PTEs 
>> with new valid addresses, it's asynchronous. But the GPU will 
>> continue retrying on the invalid entry until SDMA finishes. So it's 
>> also implicitly synchronized on the GPU side.
>>
>> Regards,
>>     Felix
>>
>>
>> On 2018-09-10 05:42 AM, Christian König wrote:
>>> Hi Felix & Oak,
>>>
>>> over the weekend I had the idea that we could just use the shadow 
>>> BOs to have the current state in a page fault. They are GTT BOs and 
>>> CPU accessible anyway.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 08.09.2018 um 09:34 schrieb Christian König:
>>>> Hi Felix,
>>>>
>>>>> But why do you want to update page tables when there is no more 
>>>>> user mode context that cares about them? Is this just to allow 
>>>>> pending work to complete after the user mode context is gone? To prevent hangs?
>>>> The problem I'm see is that we still don't have a reliable way of 
>>>> killing a command submission while making sure that no more faults 
>>>> of it are in the flight.
>>>>
>>>> E.g. the HWS has a function to kill a running job on the hardware, 
>>>> but that only works for compute and there is no way of telling when 
>>>> that operation has finished.
>>>>
>>>> But Oak has already convinced me that we should probably work on 
>>>> that instead of trying to keep the VM alive as long as possible.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
>>>>> Hi Christian,
>>>>>
>>>>> I'm not sure I get your point here.
>>>>>
>>>>> As I understand it, the amdgpu_vm structure represents the user 
>>>>> mode VM context. It has the pointers to the root page directory 
>>>>> and the rest of the page table hierarchy. If there is no 
>>>>> amdgpu_vm, there is no user mode context that cares about the page tables.
>>>>>
>>>>> If we leave the page table pointers in the amdgpu_vm, then 
>>>>> handling faults after the VM is destroyed is pointless. We can't 
>>>>> find the page tables and we can't update them, so there is nothing 
>>>>> we can do in response to VM faults.
>>>>>
>>>>> So I'm guessing that you want to move the page table pointers into 
>>>>> a different structure that exists half-independently of the VM. It 
>>>>> would be created when the VM is created (that's where we currently 
>>>>> allocate the PASID) but can be destroyed slightly later.
>>>>>
>>>>> But why do you want to update page tables when there is no more 
>>>>> user mode context that cares about them? Is this just to allow 
>>>>> pending work to complete after the user mode context is gone? To prevent hangs?
>>>>>
>>>>> Regards,
>>>>>      Felix
>>>>>
>>>>> On 2018-09-10 07:20 AM, Christian König wrote:
>>>>>>> Am I right that the handling of page fault need a valid VM?
>>>>>> No, exactly that view is incorrect.
>>>>>>
>>>>>> The VM is meant to be a memory management structure of page 
>>>>>> tables and is completely pointless for fault processing because it 
>>>>>> represents the future state of the page tables and not the current one.
>>>>>>
>>>>>> What we need for fault processing is a new structure built around 
>>>>>> PASIDs which is fed with addresses when page tables are 
>>>>>> moved around.
>>>>>>
>>>>>> Alternatively I hope that we can use the SDMA to walk the page 
>>>>>> tables and update the required entries by just using the address.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>>>>>> Hi Christian,
>>>>>>>
>>>>>>>
>>>>>>> What is the value of delaying the destruction of the hash to after the vm is 
>>>>>>> destroyed? Since the vm is destroyed, you basically don’t have 
>>>>>>> enough information to page in the correct page to gpuvm. Am I 
>>>>>>> right that the handling of a page fault needs a valid VM? For 
>>>>>>> example, you need the VM to get the corresponding physical address of the faulty page.
>>>>>>>
>>>>>>>
>>>>>>> After vm is destroyed, the retry interrupt can be directly 
>>>>>>> discard as you can’t find vm through pasid. You can think this 
>>>>>>> is also one kind of prescreen.
>>>>>>>
>>>>>>>
>>>>>>> So I stand by the point that there is no need to delay the 
>>>>>>> destruction of the hash to after the vm is destroyed. The prescreening hash can 
>>>>>>> be destroyed at the time of vm_fini.
>>>>>>>
>>>>>>>
>>>>>>> Thanks,
>>>>>>>
>>>>>>> Oak
>>>>>>>
>>>>>>>
>>>>>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>>>>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng 
>>>>>>> <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org
>>>>>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>>>>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to 
>>>>>>> amdgpu vm
>>>>>>>
>>>>>>>
>>>>>>> Hi Oak,
>>>>>>>
>>>>>>> yeah, but this is still a NAK. Making the hash per PASID was not 
>>>>>>> a suggestion but rather a must have.
>>>>>>>
>>>>>>> The VM structures must be destroyed while the processing is 
>>>>>>> still ongoing, or otherwise we would not have a clean OOM handling.
>>>>>>>
>>>>>>> The IDR for PASID lookup can be put into amdgpu_ids.c, you can 
>>>>>>> just replace the IDA already used there with an IDR.
>>>>>>>
>>>>>>> Since the PASID is already freed up delayed we should have the 
>>>>>>> grace period for interrupt processing included. If that still 
>>>>>>> isn't sufficient we can still add some delayed work for this.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>>>>>
>>>>>>>        Hi Christian,
>>>>>>>
>>>>>>>        In this implementation, fault hash is made per vm, not 
>>>>>>> per pasid
>>>>>>>        as suggested, based on below considerations:
>>>>>>>
>>>>>>>          * Delay the destroy of hash introduces more effort like 
>>>>>>> how to
>>>>>>>            set the proper grace period after which no retry 
>>>>>>> interrupt
>>>>>>>            will be generated. We want to avoid those 
>>>>>>> complication if we
>>>>>>>            can.
>>>>>>>          * The  problem of the per vm implementation is that it 
>>>>>>> is hard
>>>>>>>            to delay the destroy of fault hash, because when vm is
>>>>>>>            destroyed, prescreen function won't be able to 
>>>>>>> retrieve the
>>>>>>>            fault hash. But in this case, the prescreen function 
>>>>>>> can
>>>>>>>            either let the interrupt go through (to gmc) or ignore it.
>>>>>>>            From this perspective, it is not very necessary to 
>>>>>>> delay the
>>>>>>>            destroy of hash table.
>>>>>>>          * On the other hand, since ASICs after vega10 have the 
>>>>>>> HW CAM
>>>>>>>            feature. So the SW IV prescreen is only useful for vega10.
>>>>>>>            For all the HW implemented CAM, we can consider the 
>>>>>>> delayed
>>>>>>>            CAM acknowledgment.
>>>>>>>          * Making it per pasid need to introduce another idr to
>>>>>>>            correlate pasid and hash table - where to put the idr?
>>>>>>> You
>>>>>>>            will have to make it a global variable which is not 
>>>>>>> very
>>>>>>>            desirable.
>>>>>>>
>>>>>>>        The main purpose of this patch is to solve the 
>>>>>>> inter-process lock
>>>>>>>        issue (when hash table is full), while still keep codes simple.
>>>>>>>
>>>>>>>        Also with this patch, the faults kfifo is not necessary 
>>>>>>> any more.
>>>>>>>        See patch 2.
>>>>>>>
>>>>>>>        Regards,
>>>>>>>
>>>>>>>        Oak
>>>>>>>
>>>>>>>
>>>>>>>        On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>>>>>
>>>>>>>            In stead of share one fault hash table per device, 
>>>>>>> make it
>>>>>>>
>>>>>>>            per vm. This can avoid inter-process lock issue when 
>>>>>>> fault
>>>>>>>
>>>>>>>            hash table is full.
>>>>>>>
>>>>>>>
>>>>>>>            Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>>>>>
>>>>>>>            Signed-off-by: Oak Zeng <Oak.Zeng@amd.com> 
>>>>>>> <mailto:Oak.Zeng@amd.com>
>>>>>>>
>>>>>>>            Suggested-by: Christian Konig 
>>>>>>> <Christian.Koenig@amd.com> <mailto:Christian.Koenig@amd.com>
>>>>>>>
>>>>>>>            Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com> 
>>>>>>> <mailto:Felix.Kuehling@amd.com>
>>>>>>>
>>>>>>>            ---
>>>>>>>
>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75
>>>>>>> ------------------------
>>>>>>>
>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>>>>>
>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102
>>>>>>> ++++++++++++++++++++++++++++++++-
>>>>>>>
>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>>>>>
>>>>>>>             drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38
>>>>>>> +++++-------
>>>>>>>
>>>>>>>             5 files changed, 127 insertions(+), 110 deletions(-)
>>>>>>>
>>>>>>>
>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>
>>>>>>>            index 06373d4..4ed8621 100644
>>>>>>>
>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>
>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>
>>>>>>>            @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>
>>>>>>>               return IRQ_HANDLED;
>>>>>>>
>>>>>>>             }
>>>>>>>
>>>>>>>
>>>>>>>            -/**
>>>>>>>
>>>>>>>            - * amdgpu_ih_add_fault - Add a page fault record
>>>>>>>
>>>>>>>            - *
>>>>>>>
>>>>>>>            - * @adev: amdgpu device pointer
>>>>>>>
>>>>>>>            - * @key: 64-bit encoding of PASID and address
>>>>>>>
>>>>>>>            - *
>>>>>>>
>>>>>>>            - * This should be called when a retry page fault 
>>>>>>> interrupt is
>>>>>>>
>>>>>>>            - * received. If this is a new page fault, it will be 
>>>>>>> added to a hash
>>>>>>>
>>>>>>>            - * table. The return value indicates whether this is 
>>>>>>> a new fault, or
>>>>>>>
>>>>>>>            - * a fault that was already known and is already 
>>>>>>> being handled.
>>>>>>>
>>>>>>>            - *
>>>>>>>
>>>>>>>            - * If there are too many pending page faults, this 
>>>>>>> will fail. Retry
>>>>>>>
>>>>>>>            - * interrupts should be ignored in this case until 
>>>>>>> there is enough
>>>>>>>
>>>>>>>            - * free space.
>>>>>>>
>>>>>>>            - *
>>>>>>>
>>>>>>>            - * Returns 0 if the fault was added, 1 if the fault 
>>>>>>> was already known,
>>>>>>>
>>>>>>>            - * -ENOSPC if there are too many pending faults.
>>>>>>>
>>>>>>>            - */
>>>>>>>
>>>>>>>            -int amdgpu_ih_add_fault(struct amdgpu_device *adev,
>>>>>>> u64
>>>>>>> key)
>>>>>>>
>>>>>>>            -{
>>>>>>>
>>>>>>>            -  unsigned long flags;
>>>>>>>
>>>>>>>            -  int r = -ENOSPC;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>>>>>
>>>>>>>            -          /* Should be allocated in <IP>_ih_sw_init 
>>>>>>> on GPUs that
>>>>>>>
>>>>>>>            -           * support retry faults and require retry 
>>>>>>> filtering.
>>>>>>>
>>>>>>>            -           */
>>>>>>>
>>>>>>>            -          return r;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            - spin_lock_irqsave(&adev->irq.ih.faults->lock, 
>>>>>>> flags);
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  /* Only let the hash table fill up to 50% for best 
>>>>>>> performance */
>>>>>>>
>>>>>>>            -  if (adev->irq.ih.faults->count >= (1 <<
>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>
>>>>>>>            -          goto unlock_out;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  r = 
>>>>>>> chash_table_copy_in(&adev->irq.ih.faults->hash,
>>>>>>> key, NULL);
>>>>>>>
>>>>>>>            -  if (!r)
>>>>>>>
>>>>>>>            -          adev->irq.ih.faults->count++;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  /* chash_table_copy_in should never fail unless 
>>>>>>> we're losing count */
>>>>>>>
>>>>>>>            -  WARN_ON_ONCE(r < 0);
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -unlock_out:
>>>>>>>
>>>>>>>            - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>> flags);
>>>>>>>
>>>>>>>            -  return r;
>>>>>>>
>>>>>>>            -}
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -/**
>>>>>>>
>>>>>>>            - * amdgpu_ih_clear_fault - Remove a page fault 
>>>>>>> record
>>>>>>>
>>>>>>>            - *
>>>>>>>
>>>>>>>            - * @adev: amdgpu device pointer
>>>>>>>
>>>>>>>            - * @key: 64-bit encoding of PASID and address
>>>>>>>
>>>>>>>            - *
>>>>>>>
>>>>>>>            - * This should be called when a page fault has been 
>>>>>>> handled. Any
>>>>>>>
>>>>>>>            - * future interrupt with this key will be processed 
>>>>>>> as a new
>>>>>>>
>>>>>>>            - * page fault.
>>>>>>>
>>>>>>>            - */
>>>>>>>
>>>>>>>            -void amdgpu_ih_clear_fault(struct amdgpu_device 
>>>>>>> *adev,
>>>>>>> u64 key)
>>>>>>>
>>>>>>>            -{
>>>>>>>
>>>>>>>            -  unsigned long flags;
>>>>>>>
>>>>>>>            -  int r;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  if (!adev->irq.ih.faults)
>>>>>>>
>>>>>>>            -          return;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            - spin_lock_irqsave(&adev->irq.ih.faults->lock, 
>>>>>>> flags);
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  r = chash_table_remove(&adev->irq.ih.faults->hash,
>>>>>>> key, NULL);
>>>>>>>
>>>>>>>            -  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>
>>>>>>>            -          adev->irq.ih.faults->count--;
>>>>>>>
>>>>>>>            - WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>>>>>
>>>>>>>            -  }
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>> flags);
>>>>>>>
>>>>>>>            -}
>>>>>>>
>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>
>>>>>>>            index a23e1c0..f411ffb 100644
>>>>>>>
>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>
>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>
>>>>>>>            @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>>>>>
>>>>>>>             #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>>>>>
>>>>>>>             #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>>>>>
>>>>>>>
>>>>>>>            -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>
>>>>>>>            -struct amdgpu_retryfault_hashtable {
>>>>>>>
>>>>>>>            -  DECLARE_CHASH_TABLE(hash, 
>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>
>>>>>>>            -  spinlock_t     lock;
>>>>>>>
>>>>>>>            -  int            count;
>>>>>>>
>>>>>>>            -};
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>             /*
>>>>>>>
>>>>>>>              * R6xx+ IH ring
>>>>>>>
>>>>>>>              */
>>>>>>>
>>>>>>>            @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>>>>>
>>>>>>>               bool                   use_doorbell;
>>>>>>>
>>>>>>>               bool                   use_bus_addr;
>>>>>>>
>>>>>>>               dma_addr_t             rb_dma_addr; /* only used 
>>>>>>> when use_bus_addr = true */
>>>>>>>
>>>>>>>            -  struct amdgpu_retryfault_hashtable *faults;
>>>>>>>
>>>>>>>             };
>>>>>>>
>>>>>>>
>>>>>>>             #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>>>>>
>>>>>>>            @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct 
>>>>>>> amdgpu_device *adev, unsigned ring_size,
>>>>>>>
>>>>>>>                               bool use_bus_addr);
>>>>>>>
>>>>>>>             void amdgpu_ih_ring_fini(struct amdgpu_device 
>>>>>>> *adev);
>>>>>>>
>>>>>>>             int amdgpu_ih_process(struct amdgpu_device *adev);
>>>>>>>
>>>>>>>            -int amdgpu_ih_add_fault(struct amdgpu_device *adev,
>>>>>>> u64 key);
>>>>>>>
>>>>>>>            -void amdgpu_ih_clear_fault(struct amdgpu_device 
>>>>>>> *adev,
>>>>>>> u64 key);
>>>>>>>
>>>>>>>
>>>>>>>             #endif
>>>>>>>
>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>
>>>>>>>            index 1d7e3c1..8b220e9 100644
>>>>>>>
>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>
>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>
>>>>>>>            @@ -2692,6 +2692,22 @@ void 
>>>>>>> amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t 
>>>>>>> min_vm_size,
>>>>>>>
>>>>>>>                        adev->vm_manager.fragment_size);
>>>>>>>
>>>>>>>             }
>>>>>>>
>>>>>>>
>>>>>>>            +static struct amdgpu_retryfault_hashtable
>>>>>>> *init_fault_hash(void)
>>>>>>>
>>>>>>>            +{
>>>>>>>
>>>>>>>            +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  fault_hash = kmalloc(sizeof(*fault_hash), 
>>>>>>> GFP_KERNEL);
>>>>>>>
>>>>>>>            +  if (!fault_hash)
>>>>>>>
>>>>>>>            +          return fault_hash;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  INIT_CHASH_TABLE(fault_hash->hash,
>>>>>>>
>>>>>>>            +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>
>>>>>>>            +  spin_lock_init(&fault_hash->lock);
>>>>>>>
>>>>>>>            +  fault_hash->count = 0;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  return fault_hash;
>>>>>>>
>>>>>>>            +}
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>             /**
>>>>>>>
>>>>>>>              * amdgpu_vm_init - initialize a vm instance
>>>>>>>
>>>>>>>              *
>>>>>>>
>>>>>>>            @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct 
>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>>>>
>>>>>>>                       vm->pasid = pasid;
>>>>>>>
>>>>>>>               }
>>>>>>>
>>>>>>>
>>>>>>>            +  vm->fault_hash = init_fault_hash();
>>>>>>>
>>>>>>>            +  if (!vm->fault_hash) {
>>>>>>>
>>>>>>>            +          r = -ENOMEM;
>>>>>>>
>>>>>>>            +          goto error_free_root;
>>>>>>>
>>>>>>>            +  }
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>               INIT_KFIFO(vm->faults);
>>>>>>>
>>>>>>>               vm->fault_credit = 16;
>>>>>>>
>>>>>>>
>>>>>>>            @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct 
>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>
>>>>>>>
>>>>>>>               /* Clear pending page faults from IH when the VM 
>>>>>>> is destroyed */
>>>>>>>
>>>>>>>               while (kfifo_get(&vm->faults, &fault))
>>>>>>>
>>>>>>>            -          amdgpu_ih_clear_fault(adev, fault);
>>>>>>>
>>>>>>>            +          amdgpu_vm_clear_fault(vm->fault_hash,
>>>>>>> fault);
>>>>>>>
>>>>>>>
>>>>>>>               if (vm->pasid) {
>>>>>>>
>>>>>>>                       unsigned long flags;
>>>>>>>
>>>>>>>            @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct 
>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>
>>>>>>> spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>>>>>
>>>>>>>               }
>>>>>>>
>>>>>>>
>>>>>>>            +  kfree(vm->fault_hash);
>>>>>>>
>>>>>>>            +  vm->fault_hash = NULL;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>               drm_sched_entity_destroy(&vm->entity);
>>>>>>>
>>>>>>>
>>>>>>>               if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>>>>>
>>>>>>>            @@ -3183,3 +3208,78 @@ void 
>>>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>>>>>
>>>>>>>                       }
>>>>>>>
>>>>>>>               }
>>>>>>>
>>>>>>>             }
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +/**
>>>>>>>
>>>>>>>            + * amdgpu_vm_add_fault - Add a page fault record to 
>>>>>>> fault hash table
>>>>>>>
>>>>>>>            + *
>>>>>>>
>>>>>>>            + * @fault_hash: fault hash table
>>>>>>>
>>>>>>>            + * @key: 64-bit encoding of PASID and address
>>>>>>>
>>>>>>>            + *
>>>>>>>
>>>>>>>            + * This should be called when a retry page fault 
>>>>>>> interrupt is
>>>>>>>
>>>>>>>            + * received. If this is a new page fault, it will be 
>>>>>>> added to a hash
>>>>>>>
>>>>>>>            + * table. The return value indicates whether this is 
>>>>>>> a new fault, or
>>>>>>>
>>>>>>>            + * a fault that was already known and is already 
>>>>>>> being handled.
>>>>>>>
>>>>>>>            + *
>>>>>>>
>>>>>>>            + * If there are too many pending page faults, this 
>>>>>>> will fail. Retry
>>>>>>>
>>>>>>>            + * interrupts should be ignored in this case until 
>>>>>>> there is enough
>>>>>>>
>>>>>>>            + * free space.
>>>>>>>
>>>>>>>            + *
>>>>>>>
>>>>>>>            + * Returns 0 if the fault was added, 1 if the fault 
>>>>>>> was already known,
>>>>>>>
>>>>>>>            + * -ENOSPC if there are too many pending faults.
>>>>>>>
>>>>>>>            + */
>>>>>>>
>>>>>>>            +int amdgpu_vm_add_fault(struct 
>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>
>>>>>>>            +{
>>>>>>>
>>>>>>>            +  unsigned long flags;
>>>>>>>
>>>>>>>            +  int r = -ENOSPC;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  if (WARN_ON_ONCE(!fault_hash))
>>>>>>>
>>>>>>>            +          /* Should be allocated in amdgpu_vm_init
>>>>>>>
>>>>>>>            +           */
>>>>>>>
>>>>>>>            +          return r;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  /* Only let the hash table fill up to 50% for best 
>>>>>>> performance */
>>>>>>>
>>>>>>>            +  if (fault_hash->count >= (1 <<
>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>
>>>>>>>            +          goto unlock_out;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  r = chash_table_copy_in(&fault_hash->hash, key, 
>>>>>>> NULL);
>>>>>>>
>>>>>>>            +  if (!r)
>>>>>>>
>>>>>>>            +          fault_hash->count++;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  /* chash_table_copy_in should never fail unless 
>>>>>>> we're losing count */
>>>>>>>
>>>>>>>            +  WARN_ON_ONCE(r < 0);
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +unlock_out:
>>>>>>>
>>>>>>>            +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>
>>>>>>>            +  return r;
>>>>>>>
>>>>>>>            +}
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +/**
>>>>>>>
>>>>>>>            + * amdgpu_vm_clear_fault - Remove a page fault 
>>>>>>> record
>>>>>>>
>>>>>>>            + *
>>>>>>>
>>>>>>>            + * @fault_hash: fault hash table
>>>>>>>
>>>>>>>            + * @key: 64-bit encoding of PASID and address
>>>>>>>
>>>>>>>            + *
>>>>>>>
>>>>>>>            + * This should be called when a page fault has been 
>>>>>>> handled. Any
>>>>>>>
>>>>>>>            + * future interrupt with this key will be processed 
>>>>>>> as a new
>>>>>>>
>>>>>>>            + * page fault.
>>>>>>>
>>>>>>>            + */
>>>>>>>
>>>>>>>            +void amdgpu_vm_clear_fault(struct 
>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>
>>>>>>>            +{
>>>>>>>
>>>>>>>            +  unsigned long flags;
>>>>>>>
>>>>>>>            +  int r;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  if (!fault_hash)
>>>>>>>
>>>>>>>            +          return;
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  r = chash_table_remove(&fault_hash->hash, key, 
>>>>>>> NULL);
>>>>>>>
>>>>>>>            +  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>
>>>>>>>            +          fault_hash->count--;
>>>>>>>
>>>>>>>            +          WARN_ON_ONCE(fault_hash->count < 0);
>>>>>>>
>>>>>>>            +  }
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>
>>>>>>>            +}
>>>>>>>
>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>
>>>>>>>            index e275ee7..6eb1da1 100644
>>>>>>>
>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>
>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>
>>>>>>>            @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>>>>>
>>>>>>>               pid_t   tgid;
>>>>>>>
>>>>>>>             };
>>>>>>>
>>>>>>>
>>>>>>>            +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>
>>>>>>>            +struct amdgpu_retryfault_hashtable {
>>>>>>>
>>>>>>>            +  DECLARE_CHASH_TABLE(hash, 
>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>
>>>>>>>            +  spinlock_t     lock;
>>>>>>>
>>>>>>>            +  int            count;
>>>>>>>
>>>>>>>            +};
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>             struct amdgpu_vm {
>>>>>>>
>>>>>>>               /* tree of virtual addresses mapped */
>>>>>>>
>>>>>>>               struct rb_root_cached  va;
>>>>>>>
>>>>>>>            @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>>>>>
>>>>>>>               struct ttm_lru_bulk_move lru_bulk_move;
>>>>>>>
>>>>>>>               /* mark whether can do the bulk move */
>>>>>>>
>>>>>>>               bool                   bulk_moveable;
>>>>>>>
>>>>>>>            +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>
>>>>>>>             };
>>>>>>>
>>>>>>>
>>>>>>>             struct amdgpu_vm_manager {
>>>>>>>
>>>>>>>            @@ -355,4 +363,8 @@ void 
>>>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>>>>>>>
>>>>>>>             void amdgpu_vm_move_to_lru_tail(struct amdgpu_device 
>>>>>>> *adev,
>>>>>>>
>>>>>>>                                      struct amdgpu_vm *vm);
>>>>>>>
>>>>>>>
>>>>>>>            +int amdgpu_vm_add_fault(struct 
>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +void amdgpu_vm_clear_fault(struct 
>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>             #endif
>>>>>>>
>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>
>>>>>>>            index 5ae5ed2..acbe5a7 100644
>>>>>>>
>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>
>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>
>>>>>>>            @@ -265,35 +265,36 @@ static bool 
>>>>>>> vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>>>>>
>>>>>>>                       return true;
>>>>>>>
>>>>>>>               }
>>>>>>>
>>>>>>>
>>>>>>>            -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 
>>>>>>> 12);
>>>>>>>
>>>>>>>            -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>
>>>>>>>            -  r = amdgpu_ih_add_fault(adev, key);
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>            -  /* Hash table is full or the fault is already 
>>>>>>> being processed,
>>>>>>>
>>>>>>>            -   * ignore further page faults
>>>>>>>
>>>>>>>            -   */
>>>>>>>
>>>>>>>            -  if (r != 0)
>>>>>>>
>>>>>>>            -          goto ignore_iv;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>               /* Track retry faults in per-VM fault FIFO. */
>>>>>>>
>>>>>>>               spin_lock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>               vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>>>>>
>>>>>>>            +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 
>>>>>>> 12);
>>>>>>>
>>>>>>>            +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>
>>>>>>>               if (!vm) {
>>>>>>>
>>>>>>>                       /* VM not found, process it normally */
>>>>>>>
>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>            -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>
>>>>>>>                       return true;
>>>>>>>
>>>>>>>            +  } else {
>>>>>>>
>>>>>>>            +          r = amdgpu_vm_add_fault(vm->fault_hash,
>>>>>>> key);
>>>>>>>
>>>>>>>            +
>>>>>>>
>>>>>>>            +          /* Hash table is full or the fault is 
>>>>>>> already being processed,
>>>>>>>
>>>>>>>            +           * ignore further page faults
>>>>>>>
>>>>>>>            +           */
>>>>>>>
>>>>>>>            +          if (r != 0) {
>>>>>>>
>>>>>>>            + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>            +                  goto ignore_iv;
>>>>>>>
>>>>>>>            +          }
>>>>>>>
>>>>>>>               }
>>>>>>>
>>>>>>>               /* No locking required with single writer and 
>>>>>>> single reader */
>>>>>>>
>>>>>>>               r = kfifo_put(&vm->faults, key);
>>>>>>>
>>>>>>>               if (!r) {
>>>>>>>
>>>>>>>                       /* FIFO is full. Ignore it until there is 
>>>>>>> space */
>>>>>>>
>>>>>>>            +          amdgpu_vm_clear_fault(vm->fault_hash, 
>>>>>>> key);
>>>>>>>
>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>            -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>
>>>>>>>                       goto ignore_iv;
>>>>>>>
>>>>>>>               }
>>>>>>>
>>>>>>>            - spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>
>>>>>>>            + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>               /* It's the first fault for this address, process 
>>>>>>> it normally */
>>>>>>>
>>>>>>>               return true;
>>>>>>>
>>>>>>>
>>>>>>>            @@ -386,14 +387,6 @@ static int 
>>>>>>> vega10_ih_sw_init(void
>>>>>>> *handle)
>>>>>>>
>>>>>>>               adev->irq.ih.use_doorbell = true;
>>>>>>>
>>>>>>>               adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH 
>>>>>>> << 1;
>>>>>>>
>>>>>>>
>>>>>>>            -  adev->irq.ih.faults = 
>>>>>>> kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>>>>>
>>>>>>>            -  if (!adev->irq.ih.faults)
>>>>>>>
>>>>>>>            -          return -ENOMEM;
>>>>>>>
>>>>>>>            - INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>>>>>
>>>>>>>            -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 
>>>>>>> 0);
>>>>>>>
>>>>>>>            - spin_lock_init(&adev->irq.ih.faults->lock);
>>>>>>>
>>>>>>>            -  adev->irq.ih.faults->count = 0;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>               r = amdgpu_irq_init(adev);
>>>>>>>
>>>>>>>
>>>>>>>               return r;
>>>>>>>
>>>>>>>            @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void
>>>>>>> *handle)
>>>>>>>
>>>>>>>               amdgpu_irq_fini(adev);
>>>>>>>
>>>>>>>               amdgpu_ih_ring_fini(adev);
>>>>>>>
>>>>>>>
>>>>>>>            -  kfree(adev->irq.ih.faults);
>>>>>>>
>>>>>>>            -  adev->irq.ih.faults = NULL;
>>>>>>>
>>>>>>>            -
>>>>>>>
>>>>>>>               return 0;
>>>>>>>
>>>>>>>             }
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>        _______________________________________________
>>>>>>>
>>>>>>>        amd-gfx mailing list
>>>>>>>
>>>>>>>        amd-gfx@lists.freedesktop.org 
>>>>>>> <mailto:amd-gfx@lists.freedesktop.org>
>>>>>>>
>>>>>>>        https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>
>>>>>>>
>>>>>> _______________________________________________
>>>>>> amd-gfx mailing list
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                                                 ` <DM5PR12MB16608D6DAB8D9BFAAE348C4780040-2J9CzHegvk9mP71xM+BsFwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2018-09-11 15:15                                                   ` Christian König
  0 siblings, 0 replies; 27+ messages in thread
From: Christian König @ 2018-09-11 15:15 UTC (permalink / raw)
  To: Zeng, Oak, Kuehling, Felix, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

Am 11.09.2018 um 17:02 schrieb Zeng, Oak:
>> Exactly that is what won't work. See we can't mark the PDE invalid because we still have some engines which can't deal with retry faults.
> By engines do you mean UTCL2 client? If yes, I am not aware of that. If any UTCL2 client doesn't support translation retry, driver will need to make sure all the current PDE/PTE entries are valid. In other word, it should be setup before it is used by any job and can't be swapped out during memory pressure. How can it work otherwise?

Yes, exactly. What could maybe work is to use two kinds of page tables.

E.g. we use classic pipelined operation for the high area of the address 
space and page faulting operation for the low area.

I've prototyped that once, but that would still require quite some work 
on the MM side to actually support this kind of operation.

Otherwise as far as I can see the best we can do on GFX9 is to set leaf 
node PTEs as valid/invalid as appropriate and fill them on demand when 
the time comes.

Regards,
Christian.

>
> Thanks,
> Oak
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Tuesday, September 11, 2018 10:23 AM
> To: Zeng, Oak <Oak.Zeng@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Oak Zeng <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org; Yang, Philip <Philip.Yang@amd.com>
> Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
>
>> [Oak]: Step 5, We also mark the corresponding entry of parent page table of the evicted page table to invalid.
> Exactly that is what won't work. See we can't mark the PDE invalid because we still have some engines which can't deal with retry faults.
>
>> [Oak] How do you block it?
> Just waiting for all execution to finish.
>
> Regards,
> Christian.
>
> Am 11.09.2018 um 16:09 schrieb Zeng, Oak:
>> Hi Christian,
>>
>> I am not sure I understand exactly your thought but see 2 comments
>> inline
>>
>> Thanks,
>> Oak
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Tuesday, September 11, 2018 3:19 AM
>> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Koenig, Christian
>> <Christian.Koenig@amd.com>; Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng
>> <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org; Yang, Philip
>> <Philip.Yang@amd.com>
>> Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu
>> vm
>>
>> Hi Felix,
>>
>> let me try to explain the problem on an example:
>>
>> 1. We have a running job which needs recoverable page faults for accessing the process address space.
>> 2. We run into memory pressure on VRAM and start to evict things.
>> 3. A page tables of the running job is picked up for eviction.
>> 4. We schedule a copy command to move the content of the page table to system memory.
>> 5. The new system memory location of the page table is noted in its BO.
>> 6. We get a fault from the job and swap in the page from the process address space.
>> 7. Now we need to enter the new page address into the page table ->
>> *BAM*
>>
>> The problem is now that we don't know the address of the page table because the current location was replaced with the future location in step #5.
>> [Oak]: Step 5, We also mark the corresponding entry of parent page table of the evicted page table to invalid. This invalid bit will cause the utcl2 walker to report a page fault. Depending on implementation, for the evicted page table, its' amdgpu_vm_pt.amdgpu_vm_bo_base will point to a new location in system memory (amdgpu_vm_pt itself is in system memory). In the page fault handler, we allocate a *new* physical page in vram (through ttm), swap in the page content from system memory (using sdma) to this newly allocated page, modify parent page table's corresponding entry to point to this *new allocated* page and set valid bit to 1.
>>
>> If all the swap-in, swap-out work is done by sdma, we don't need to worry about the case that swap-in happens earlier than swap-out - gpu scheduler will take care the sequence.
>>
>> If ttm can't turn around for the new page allocation, it can optionally trigger things like OOM. This should be something that need to be considered at ttm physical memory management layer.
>>
>>
>>
>> We could now argue that we could update the page tables on the fly for the evicted page and never wait for the current job to finish, but this won't work because not all engines can handle that.
>>
>> I will circumvent this problem for now by blocking the eviction before step #5.
>> [Oak] How do you block it? Pin the memory? I agree that swapping out page table should be the last resort during a memory pressure handling.
>>
>> The issue with that is that this pipelining is responsible for nearly 20% of the fps in some testcases.
>>
>> I hope that it won't be that bad when I limit disabling the pipelining to only page tables, but it is certainly possible that we will get pushback on this.
>>
>> If that happens we probably need to add some flags to limit this workaround even more to only the root PD and all the PDs/PTs which are involved in recoverable faults.
>>
>> Regards,
>> Christian.
>>
>> Am 10.09.2018 um 21:10 schrieb Felix Kuehling:
>>> I'm not sure why you need to distinguish current and future state
>>> when dealing with page faults. When you get a page fault, you know
>>> that the GPU is trying to access memory right now, in the present. So
>>> you're always working with the current state. When the CPU page table
>>> changes, you get an MMU notifier that allows you to invalidate the
>>> corresponding GPU PTEs. If you have a valid GPU PTE, it always
>>> represents the current states and is in sync with the CPU page table.
>>> If the GPU page table is ever outdated, it should have an invalid entry in that place.
>>>
>>> If SDMA is used to update the GPU PTEs, there is a delay. The MMU
>>> notifier is synchronous, so it shouldn't be a problem. You just wait
>>> for the SDMA job to complete before returning. When updating PTEs
>>> with new valid addresses, it's asynchronous. But the GPU will
>>> continue retrying on the invalid entry until SDMA finishes. So it's
>>> also implicitly synchronized on the GPU side.
>>>
>>> Regards,
>>>      Felix
>>>
>>>
>>> On 2018-09-10 05:42 AM, Christian König wrote:
>>>> Hi Felix & Oak,
>>>>
>>>> over the weekend I had the idea that we could just use the shadow
>>>> BOs to have the current state in a page fault. They are GTT BOs and
>>>> CPU accessible anyway.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 08.09.2018 um 09:34 schrieb Christian König:
>>>>> Hi Felix,
>>>>>
>>>>>> But why do you want to update page tables when there is no more
>>>>>> user mode context that cares about them? Is this just to allow
>>>>>> pending work to complete after the user mode context is gone? To prevent hangs?
>>>>> The problem I'm see is that we still don't have a reliable way of
>>>>> killing a command submission while making sure that no more faults
>>>>> of it are in the flight.
>>>>>
>>>>> E.g. the HWS has a function to kill a running job on the hardware,
>>>>> but that only works for compute and there is no way of telling when
>>>>> that operation has finished.
>>>>>
>>>>> But Oak has already convinced me that we should probably work on
>>>>> that instead of trying to keep the VM alive as long as possible.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
>>>>>> Hi Christian,
>>>>>>
>>>>>> I'm not sure I get your point here.
>>>>>>
>>>>>> As I understand it, the amdgpu_vm structure represents the user
>>>>>> mode VM context. It has the pointers to the root page directory
>>>>>> and the rest of the page table hierarchy. If there is no
>>>>>> amdgpu_vm, there is no user mode context that cares about the page tables.
>>>>>>
>>>>>> If we leave the page table pointers in the amdgpu_vm, then
>>>>>> handling faults after the VM is destroyed is pointless. We can't
>>>>>> find the page tables and we can't update them, so there is nothing
>>>>>> we can do in response to VM faults.
>>>>>>
>>>>>> So I'm guessing that you want to move the page table pointers into
>>>>>> a different structure that exists half-independently of the VM. It
>>>>>> would be created when the VM is created (that's where we currently
>>>>>> allocate the PASID) but can be destroyed slightly later.
>>>>>>
>>>>>> But why do you want to update page tables when there is no more
>>>>>> user mode context that cares about them? Is this just to allow
>>>>>> pending work to complete after the user mode context is gone? To prevent hangs?
>>>>>>
>>>>>> Regards,
>>>>>>       Felix
>>>>>>
>>>>>> On 2018-09-10 07:20 AM, Christian König wrote:
>>>>>>>> Am I right that the handling of page faults needs a valid VM?
>>>>>>> No, exactly that view is incorrect.
>>>>>>>
>>>>>>> The VM is meant to be a memory management structure of page
>>>>>>> tables and is completely pointless fault processing because it
>>>>>>> represent the future state of the page tables and not the current one.
>>>>>>>
>>>>>>> What we need for fault processing is a new structure built around
>>>>>>> PASIDs which is fed with addresses when page tables are
>>>>>>> moved around.
>>>>>>>
>>>>>>> Alternatively I hope that we can use the SDMA to walk the page
>>>>>>> tables and update the required entries by just using the address.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>>>>>>> Hi Christian,
>>>>>>>>
>>>>>>>>
>>>>>>>> What is the value of delay the destroy of hash to after vm is
>>>>>>>> destroyed? Since vm is destroyed, you basically don’t have
>>>>>>>> enough information to paging in the correct page to gpuvm. Am I
>>>>>>>> right that the handling of page fault need a valid VM? For
>>>>>>>> example, you need the VM to get the corresponding physical address of the faulty page.
>>>>>>>>
>>>>>>>>
>>>>>>>> After vm is destroyed, the retry interrupt can be directly
>>>>>>>> discard as you can’t find vm through pasid. You can think this
>>>>>>>> is also one kind of prescreen.
>>>>>>>>
>>>>>>>>
>>>>>>>> So I am stand for the point that, there is no need to delay the
>>>>>>>> destroy of hash to after vm is destroyed. Prescreening hash can
>>>>>>>> be destroyed at the time of vm_fini.
>>>>>>>>
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>>
>>>>>>>> Oak
>>>>>>>>
>>>>>>>>
>>>>>>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>>>>>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng
>>>>>>>> <zengshanjun@gmail.com>; amd-gfx@lists.freedesktop.org
>>>>>>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>>>>>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>>>>>>>> amdgpu vm
>>>>>>>>
>>>>>>>>
>>>>>>>> Hi Oak,
>>>>>>>>
>>>>>>>> yeah, but this is still a NAK. Making the hash per PASID was not
>>>>>>>> a suggestion but rather a must have.
>>>>>>>>
>>>>>>>> The VM structures must be destroyed while the processing is
>>>>>>>> still ongoing, or otherwise we would not have a clean OOM handling.
>>>>>>>>
>>>>>>>> The IDR for PASID lockup can be put into amdgpu_ids.c, you can
>>>>>>>> just replace the IDA already used there with an IDR.
>>>>>>>>
>>>>>>>> Since the PASID is already freed up delayed we should have the
>>>>>>>> grace period for interrupt processing included. If that still
>>>>>>>> isn't sufficient we can still add some delayed work for this.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>>>>>>
>>>>>>>>         Hi Christian,
>>>>>>>>
>>>>>>>>         In this implementation, fault hash is made per vm, not
>>>>>>>> per pasid
>>>>>>>>         as suggested, based on below considerations:
>>>>>>>>
>>>>>>>>           * Delay the destroy of hash introduces more effort like
>>>>>>>> how to
>>>>>>>>             set the proper grace period after which no retry
>>>>>>>> interrupt
>>>>>>>>             will be generated. We want to avoid those
>>>>>>>> complication if we
>>>>>>>>             can.
>>>>>>>>           * The  problem of the per vm implementation is that it
>>>>>>>> is hard
>>>>>>>>             to delay the destroy of fault hash, because when vm is
>>>>>>>>             destroyed, prescreen function won't be able to
>>>>>>>> retrieve the
>>>>>>>>             fault hash. But in this case, the prescreen function
>>>>>>>> can
>>>>>>>>             either let the interrupt go through (to gmc) or ignore it.
>>>>>>>>             From this perspective, it is not very necessary to
>>>>>>>> delay the
>>>>>>>>             destroy of hash table.
>>>>>>>>           * On the other hand, since ASICs after vega10 have the
>>>>>>>> HW CAM
>>>>>>>>             feature. So the SW IV prescreen is only useful for vega10.
>>>>>>>>             For all the HW implemented CAM, we can consider the
>>>>>>>> delayed
>>>>>>>>             CAM acknowledgment.
>>>>>>>>           * Making it per pasid need to introduce another idr to
>>>>>>>>             correlate pasid and hash table - where to put the idr?
>>>>>>>> You
>>>>>>>>             will have to make it a global variable which is not
>>>>>>>> very
>>>>>>>>             desirable.
>>>>>>>>
>>>>>>>>         The main purpose of this patch is to solve the
>>>>>>>> inter-process lock
>>>>>>>>         issue (when hash table is full), while still keep codes simple.
>>>>>>>>
>>>>>>>>         Also with this patch, the faults kfifo is not necessary
>>>>>>>> any more.
>>>>>>>>         See patch 2.
>>>>>>>>
>>>>>>>>         Regards,
>>>>>>>>
>>>>>>>>         Oak
>>>>>>>>
>>>>>>>>
>>>>>>>>         On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>>>>>>
>>>>>>>>             In stead of share one fault hash table per device,
>>>>>>>> make it
>>>>>>>>
>>>>>>>>             per vm. This can avoid inter-process lock issue when
>>>>>>>> fault
>>>>>>>>
>>>>>>>>             hash table is full.
>>>>>>>>
>>>>>>>>
>>>>>>>>             Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>>>>>>
>>>>>>>>             Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>>>>>>>> <mailto:Oak.Zeng@amd.com>
>>>>>>>>
>>>>>>>>             Suggested-by: Christian Konig
>>>>>>>> <Christian.Koenig@amd.com> <mailto:Christian.Koenig@amd.com>
>>>>>>>>
>>>>>>>>             Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>>>>> <mailto:Felix.Kuehling@amd.com>
>>>>>>>>
>>>>>>>>             ---
>>>>>>>>
>>>>>>>>              drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75
>>>>>>>> ------------------------
>>>>>>>>
>>>>>>>>              drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>>>>>>
>>>>>>>>              drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102
>>>>>>>> ++++++++++++++++++++++++++++++++-
>>>>>>>>
>>>>>>>>              drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>>>>>>
>>>>>>>>              drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38
>>>>>>>> +++++-------
>>>>>>>>
>>>>>>>>              5 files changed, 127 insertions(+), 110 deletions(-)
>>>>>>>>
>>>>>>>>
>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>
>>>>>>>>             index 06373d4..4ed8621 100644
>>>>>>>>
>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>
>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>
>>>>>>>>             @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct
>>>>>>>> amdgpu_device *adev)
>>>>>>>>
>>>>>>>>                return IRQ_HANDLED;
>>>>>>>>
>>>>>>>>              }
>>>>>>>>
>>>>>>>>
>>>>>>>>             -/**
>>>>>>>>
>>>>>>>>             - * amdgpu_ih_add_fault - Add a page fault record
>>>>>>>>
>>>>>>>>             - *
>>>>>>>>
>>>>>>>>             - * @adev: amdgpu device pointer
>>>>>>>>
>>>>>>>>             - * @key: 64-bit encoding of PASID and address
>>>>>>>>
>>>>>>>>             - *
>>>>>>>>
>>>>>>>>             - * This should be called when a retry page fault
>>>>>>>> interrupt is
>>>>>>>>
>>>>>>>>             - * received. If this is a new page fault, it will be
>>>>>>>> added to a hash
>>>>>>>>
>>>>>>>>             - * table. The return value indicates whether this is
>>>>>>>> a new fault, or
>>>>>>>>
>>>>>>>>             - * a fault that was already known and is already
>>>>>>>> being handled.
>>>>>>>>
>>>>>>>>             - *
>>>>>>>>
>>>>>>>>             - * If there are too many pending page faults, this
>>>>>>>> will fail. Retry
>>>>>>>>
>>>>>>>>             - * interrupts should be ignored in this case until
>>>>>>>> there is enough
>>>>>>>>
>>>>>>>>             - * free space.
>>>>>>>>
>>>>>>>>             - *
>>>>>>>>
>>>>>>>>             - * Returns 0 if the fault was added, 1 if the fault
>>>>>>>> was already known,
>>>>>>>>
>>>>>>>>             - * -ENOSPC if there are too many pending faults.
>>>>>>>>
>>>>>>>>             - */
>>>>>>>>
>>>>>>>>             -int amdgpu_ih_add_fault(struct amdgpu_device *adev,
>>>>>>>> u64
>>>>>>>> key)
>>>>>>>>
>>>>>>>>             -{
>>>>>>>>
>>>>>>>>             -  unsigned long flags;
>>>>>>>>
>>>>>>>>             -  int r = -ENOSPC;
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>             -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>>>>>>
>>>>>>>>             -          /* Should be allocated in <IP>_ih_sw_init
>>>>>>>> on GPUs that
>>>>>>>>
>>>>>>>>             -           * support retry faults and require retry
>>>>>>>> filtering.
>>>>>>>>
>>>>>>>>             -           */
>>>>>>>>
>>>>>>>>             -          return r;
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>             - spin_lock_irqsave(&adev->irq.ih.faults->lock,
>>>>>>>> flags);
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>             -  /* Only let the hash table fill up to 50% for best
>>>>>>>> performance */
>>>>>>>>
>>>>>>>>             -  if (adev->irq.ih.faults->count >= (1 <<
>>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>>
>>>>>>>>             -          goto unlock_out;
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>             -  r =
>>>>>>>> chash_table_copy_in(&adev->irq.ih.faults->hash,
>>>>>>>> key, NULL);
>>>>>>>>
>>>>>>>>             -  if (!r)
>>>>>>>>
>>>>>>>>             -          adev->irq.ih.faults->count++;
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>             -  /* chash_table_copy_in should never fail unless
>>>>>>>> we're losing count */
>>>>>>>>
>>>>>>>>             -  WARN_ON_ONCE(r < 0);
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>             -unlock_out:
>>>>>>>>
>>>>>>>>             - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>>> flags);
>>>>>>>>
>>>>>>>>             -  return r;
>>>>>>>>
>>>>>>>>             -}
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>             -/**
>>>>>>>>
>>>>>>>>             - * amdgpu_ih_clear_fault - Remove a page fault
>>>>>>>> record
>>>>>>>>
>>>>>>>>             - *
>>>>>>>>
>>>>>>>>             - * @adev: amdgpu device pointer
>>>>>>>>
>>>>>>>>             - * @key: 64-bit encoding of PASID and address
>>>>>>>>
>>>>>>>>             - *
>>>>>>>>
>>>>>>>>             - * This should be called when a page fault has been
>>>>>>>> handled. Any
>>>>>>>>
>>>>>>>>             - * future interrupt with this key will be processed
>>>>>>>> as a new
>>>>>>>>
>>>>>>>>             - * page fault.
>>>>>>>>
>>>>>>>>             - */
>>>>>>>>
>>>>>>>>             -void amdgpu_ih_clear_fault(struct amdgpu_device
>>>>>>>> *adev,
>>>>>>>> u64 key)
>>>>>>>>
>>>>>>>>             -{
>>>>>>>>
>>>>>>>>             -  unsigned long flags;
>>>>>>>>
>>>>>>>>             -  int r;
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>             -  if (!adev->irq.ih.faults)
>>>>>>>>
>>>>>>>>             -          return;
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>             - spin_lock_irqsave(&adev->irq.ih.faults->lock,
>>>>>>>> flags);
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>             -  r = chash_table_remove(&adev->irq.ih.faults->hash,
>>>>>>>> key, NULL);
>>>>>>>>
>>>>>>>>             -  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>>
>>>>>>>>             -          adev->irq.ih.faults->count--;
>>>>>>>>
>>>>>>>>             - WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>>>>>>
>>>>>>>>             -  }
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>             - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>>> flags);
>>>>>>>>
>>>>>>>>             -}
>>>>>>>>
>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>
>>>>>>>>             index a23e1c0..f411ffb 100644
>>>>>>>>
>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>
>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>
>>>>>>>>             @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>>>>>>
>>>>>>>>              #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>>>>>>
>>>>>>>>              #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>>>>>>
>>>>>>>>
>>>>>>>>             -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>>
>>>>>>>>             -struct amdgpu_retryfault_hashtable {
>>>>>>>>
>>>>>>>>             -  DECLARE_CHASH_TABLE(hash,
>>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>>
>>>>>>>>             -  spinlock_t     lock;
>>>>>>>>
>>>>>>>>             -  int            count;
>>>>>>>>
>>>>>>>>             -};
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>              /*
>>>>>>>>
>>>>>>>>               * R6xx+ IH ring
>>>>>>>>
>>>>>>>>               */
>>>>>>>>
>>>>>>>>             @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>>>>>>
>>>>>>>>                bool                   use_doorbell;
>>>>>>>>
>>>>>>>>                bool                   use_bus_addr;
>>>>>>>>
>>>>>>>>                dma_addr_t             rb_dma_addr; /* only used
>>>>>>>> when use_bus_addr = true */
>>>>>>>>
>>>>>>>>             -  struct amdgpu_retryfault_hashtable *faults;
>>>>>>>>
>>>>>>>>              };
>>>>>>>>
>>>>>>>>
>>>>>>>>              #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>>>>>>
>>>>>>>>             @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct
>>>>>>>> amdgpu_device *adev, unsigned ring_size,
>>>>>>>>
>>>>>>>>                                bool use_bus_addr);
>>>>>>>>
>>>>>>>>              void amdgpu_ih_ring_fini(struct amdgpu_device
>>>>>>>> *adev);
>>>>>>>>
>>>>>>>>              int amdgpu_ih_process(struct amdgpu_device *adev);
>>>>>>>>
>>>>>>>>             -int amdgpu_ih_add_fault(struct amdgpu_device *adev,
>>>>>>>> u64 key);
>>>>>>>>
>>>>>>>>             -void amdgpu_ih_clear_fault(struct amdgpu_device
>>>>>>>> *adev,
>>>>>>>> u64 key);
>>>>>>>>
>>>>>>>>
>>>>>>>>              #endif
>>>>>>>>
>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>
>>>>>>>>             index 1d7e3c1..8b220e9 100644
>>>>>>>>
>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>
>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>
>>>>>>>>             @@ -2692,6 +2692,22 @@ void
>>>>>>>> amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t
>>>>>>>> min_vm_size,
>>>>>>>>
>>>>>>>>                         adev->vm_manager.fragment_size);
>>>>>>>>
>>>>>>>>              }
>>>>>>>>
>>>>>>>>
>>>>>>>>             +static struct amdgpu_retryfault_hashtable
>>>>>>>> *init_fault_hash(void)
>>>>>>>>
>>>>>>>>             +{
>>>>>>>>
>>>>>>>>             +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +  fault_hash = kmalloc(sizeof(*fault_hash),
>>>>>>>> GFP_KERNEL);
>>>>>>>>
>>>>>>>>             +  if (!fault_hash)
>>>>>>>>
>>>>>>>>             +          return fault_hash;
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +  INIT_CHASH_TABLE(fault_hash->hash,
>>>>>>>>
>>>>>>>>             +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>>
>>>>>>>>             +  spin_lock_init(&fault_hash->lock);
>>>>>>>>
>>>>>>>>             +  fault_hash->count = 0;
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +  return fault_hash;
>>>>>>>>
>>>>>>>>             +}
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>              /**
>>>>>>>>
>>>>>>>>               * amdgpu_vm_init - initialize a vm instance
>>>>>>>>
>>>>>>>>               *
>>>>>>>>
>>>>>>>>             @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct
>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>>>>>
>>>>>>>>                        vm->pasid = pasid;
>>>>>>>>
>>>>>>>>                }
>>>>>>>>
>>>>>>>>
>>>>>>>>             +  vm->fault_hash = init_fault_hash();
>>>>>>>>
>>>>>>>>             +  if (!vm->fault_hash) {
>>>>>>>>
>>>>>>>>             +          r = -ENOMEM;
>>>>>>>>
>>>>>>>>             +          goto error_free_root;
>>>>>>>>
>>>>>>>>             +  }
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>                INIT_KFIFO(vm->faults);
>>>>>>>>
>>>>>>>>                vm->fault_credit = 16;
>>>>>>>>
>>>>>>>>
>>>>>>>>             @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct
>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>>
>>>>>>>>
>>>>>>>>                /* Clear pending page faults from IH when the VM
>>>>>>>> is destroyed */
>>>>>>>>
>>>>>>>>                while (kfifo_get(&vm->faults, &fault))
>>>>>>>>
>>>>>>>>             -          amdgpu_ih_clear_fault(adev, fault);
>>>>>>>>
>>>>>>>>             +          amdgpu_vm_clear_fault(vm->fault_hash,
>>>>>>>> fault);
>>>>>>>>
>>>>>>>>
>>>>>>>>                if (vm->pasid) {
>>>>>>>>
>>>>>>>>                        unsigned long flags;
>>>>>>>>
>>>>>>>>             @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct
>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>>
>>>>>>>> spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>>>>>>
>>>>>>>>                }
>>>>>>>>
>>>>>>>>
>>>>>>>>             +  kfree(vm->fault_hash);
>>>>>>>>
>>>>>>>>             +  vm->fault_hash = NULL;
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>                drm_sched_entity_destroy(&vm->entity);
>>>>>>>>
>>>>>>>>
>>>>>>>>                if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>>>>>>
>>>>>>>>             @@ -3183,3 +3208,78 @@ void
>>>>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>>>>>>
>>>>>>>>                        }
>>>>>>>>
>>>>>>>>                }
>>>>>>>>
>>>>>>>>              }
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +/**
>>>>>>>>
>>>>>>>>             + * amdgpu_vm_add_fault - Add a page fault record to
>>>>>>>> fault hash table
>>>>>>>>
>>>>>>>>             + *
>>>>>>>>
>>>>>>>>             + * @fault_hash: fault hash table
>>>>>>>>
>>>>>>>>             + * @key: 64-bit encoding of PASID and address
>>>>>>>>
>>>>>>>>             + *
>>>>>>>>
>>>>>>>>             + * This should be called when a retry page fault
>>>>>>>> interrupt is
>>>>>>>>
>>>>>>>>             + * received. If this is a new page fault, it will be
>>>>>>>> added to a hash
>>>>>>>>
>>>>>>>>             + * table. The return value indicates whether this is
>>>>>>>> a new fault, or
>>>>>>>>
>>>>>>>>             + * a fault that was already known and is already
>>>>>>>> being handled.
>>>>>>>>
>>>>>>>>             + *
>>>>>>>>
>>>>>>>>             + * If there are too many pending page faults, this
>>>>>>>> will fail. Retry
>>>>>>>>
>>>>>>>>             + * interrupts should be ignored in this case until
>>>>>>>> there is enough
>>>>>>>>
>>>>>>>>             + * free space.
>>>>>>>>
>>>>>>>>             + *
>>>>>>>>
>>>>>>>>             + * Returns 0 if the fault was added, 1 if the fault
>>>>>>>> was already known,
>>>>>>>>
>>>>>>>>             + * -ENOSPC if there are too many pending faults.
>>>>>>>>
>>>>>>>>             + */
>>>>>>>>
>>>>>>>>             +int amdgpu_vm_add_fault(struct
>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>>
>>>>>>>>             +{
>>>>>>>>
>>>>>>>>             +  unsigned long flags;
>>>>>>>>
>>>>>>>>             +  int r = -ENOSPC;
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +  if (WARN_ON_ONCE(!fault_hash))
>>>>>>>>
>>>>>>>>             +          /* Should be allocated in amdgpu_vm_init
>>>>>>>>
>>>>>>>>             +           */
>>>>>>>>
>>>>>>>>             +          return r;
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +  /* Only let the hash table fill up to 50% for best
>>>>>>>> performance */
>>>>>>>>
>>>>>>>>             +  if (fault_hash->count >= (1 <<
>>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>>
>>>>>>>>             +          goto unlock_out;
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +  r = chash_table_copy_in(&fault_hash->hash, key,
>>>>>>>> NULL);
>>>>>>>>
>>>>>>>>             +  if (!r)
>>>>>>>>
>>>>>>>>             +          fault_hash->count++;
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +  /* chash_table_copy_in should never fail unless
>>>>>>>> we're losing count */
>>>>>>>>
>>>>>>>>             +  WARN_ON_ONCE(r < 0);
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +unlock_out:
>>>>>>>>
>>>>>>>>             +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>>
>>>>>>>>             +  return r;
>>>>>>>>
>>>>>>>>             +}
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +/**
>>>>>>>>
>>>>>>>>             + * amdgpu_vm_clear_fault - Remove a page fault
>>>>>>>> record
>>>>>>>>
>>>>>>>>             + *
>>>>>>>>
>>>>>>>>             + * @fault_hash: fault hash table
>>>>>>>>
>>>>>>>>             + * @key: 64-bit encoding of PASID and address
>>>>>>>>
>>>>>>>>             + *
>>>>>>>>
>>>>>>>>             + * This should be called when a page fault has been
>>>>>>>> handled. Any
>>>>>>>>
>>>>>>>>             + * future interrupt with this key will be processed
>>>>>>>> as a new
>>>>>>>>
>>>>>>>>             + * page fault.
>>>>>>>>
>>>>>>>>             + */
>>>>>>>>
>>>>>>>>             +void amdgpu_vm_clear_fault(struct
>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>>
>>>>>>>>             +{
>>>>>>>>
>>>>>>>>             +  unsigned long flags;
>>>>>>>>
>>>>>>>>             +  int r;
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +  if (!fault_hash)
>>>>>>>>
>>>>>>>>             +          return;
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +  r = chash_table_remove(&fault_hash->hash, key,
>>>>>>>> NULL);
>>>>>>>>
>>>>>>>>             +  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>>
>>>>>>>>             +          fault_hash->count--;
>>>>>>>>
>>>>>>>>             +          WARN_ON_ONCE(fault_hash->count < 0);
>>>>>>>>
>>>>>>>>             +  }
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>>
>>>>>>>>             +}
>>>>>>>>
>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>
>>>>>>>>             index e275ee7..6eb1da1 100644
>>>>>>>>
>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>
>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>
>>>>>>>>             @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>>>>>>
>>>>>>>>                pid_t   tgid;
>>>>>>>>
>>>>>>>>              };
>>>>>>>>
>>>>>>>>
>>>>>>>>             +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>>
>>>>>>>>             +struct amdgpu_retryfault_hashtable {
>>>>>>>>
>>>>>>>>             +  DECLARE_CHASH_TABLE(hash,
>>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>>
>>>>>>>>             +  spinlock_t     lock;
>>>>>>>>
>>>>>>>>             +  int            count;
>>>>>>>>
>>>>>>>>             +};
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>              struct amdgpu_vm {
>>>>>>>>
>>>>>>>>                /* tree of virtual addresses mapped */
>>>>>>>>
>>>>>>>>                struct rb_root_cached  va;
>>>>>>>>
>>>>>>>>             @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>>>>>>
>>>>>>>>                struct ttm_lru_bulk_move lru_bulk_move;
>>>>>>>>
>>>>>>>>                /* mark whether can do the bulk move */
>>>>>>>>
>>>>>>>>                bool                   bulk_moveable;
>>>>>>>>
>>>>>>>>             +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>>
>>>>>>>>              };
>>>>>>>>
>>>>>>>>
>>>>>>>>              struct amdgpu_vm_manager {
>>>>>>>>
>>>>>>>>             @@ -355,4 +363,8 @@ void
>>>>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>>>>>>>>
>>>>>>>>              void amdgpu_vm_move_to_lru_tail(struct amdgpu_device
>>>>>>>> *adev,
>>>>>>>>
>>>>>>>>                                       struct amdgpu_vm *vm);
>>>>>>>>
>>>>>>>>
>>>>>>>>             +int amdgpu_vm_add_fault(struct
>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +void amdgpu_vm_clear_fault(struct
>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>              #endif
>>>>>>>>
>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>
>>>>>>>>             index 5ae5ed2..acbe5a7 100644
>>>>>>>>
>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>
>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>
>>>>>>>>             @@ -265,35 +265,36 @@ static bool
>>>>>>>> vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>>>>>>
>>>>>>>>                        return true;
>>>>>>>>
>>>>>>>>                }
>>>>>>>>
>>>>>>>>
>>>>>>>>             -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 <<
>>>>>>>> 12);
>>>>>>>>
>>>>>>>>             -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>>
>>>>>>>>             -  r = amdgpu_ih_add_fault(adev, key);
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>             -  /* Hash table is full or the fault is already
>>>>>>>> being processed,
>>>>>>>>
>>>>>>>>             -   * ignore further page faults
>>>>>>>>
>>>>>>>>             -   */
>>>>>>>>
>>>>>>>>             -  if (r != 0)
>>>>>>>>
>>>>>>>>             -          goto ignore_iv;
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>                /* Track retry faults in per-VM fault FIFO. */
>>>>>>>>
>>>>>>>>                spin_lock(&adev->vm_manager.pasid_lock);
>>>>>>>>
>>>>>>>>                vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>>>>>>
>>>>>>>>             +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 <<
>>>>>>>> 12);
>>>>>>>>
>>>>>>>>             +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>>
>>>>>>>>                if (!vm) {
>>>>>>>>
>>>>>>>>                        /* VM not found, process it normally */
>>>>>>>>
>>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>
>>>>>>>>             -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>>
>>>>>>>>                        return true;
>>>>>>>>
>>>>>>>>             +  } else {
>>>>>>>>
>>>>>>>>             +          r = amdgpu_vm_add_fault(vm->fault_hash,
>>>>>>>> key);
>>>>>>>>
>>>>>>>>             +
>>>>>>>>
>>>>>>>>             +          /* Hash table is full or the fault is
>>>>>>>> already being processed,
>>>>>>>>
>>>>>>>>             +           * ignore further page faults
>>>>>>>>
>>>>>>>>             +           */
>>>>>>>>
>>>>>>>>             +          if (r != 0) {
>>>>>>>>
>>>>>>>>             + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>
>>>>>>>>             +                  goto ignore_iv;
>>>>>>>>
>>>>>>>>             +          }
>>>>>>>>
>>>>>>>>                }
>>>>>>>>
>>>>>>>>                /* No locking required with single writer and
>>>>>>>> single reader */
>>>>>>>>
>>>>>>>>                r = kfifo_put(&vm->faults, key);
>>>>>>>>
>>>>>>>>                if (!r) {
>>>>>>>>
>>>>>>>>                        /* FIFO is full. Ignore it until there is
>>>>>>>> space */
>>>>>>>>
>>>>>>>>             +          amdgpu_vm_clear_fault(vm->fault_hash,
>>>>>>>> key);
>>>>>>>>
>>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>
>>>>>>>>             -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>>
>>>>>>>>                        goto ignore_iv;
>>>>>>>>
>>>>>>>>                }
>>>>>>>>
>>>>>>>>             - spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>
>>>>>>>>
>>>>>>>>             + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>
>>>>>>>>                /* It's the first fault for this address, process
>>>>>>>> it normally */
>>>>>>>>
>>>>>>>>                return true;
>>>>>>>>
>>>>>>>>
>>>>>>>>             @@ -386,14 +387,6 @@ static int
>>>>>>>> vega10_ih_sw_init(void
>>>>>>>> *handle)
>>>>>>>>
>>>>>>>>                adev->irq.ih.use_doorbell = true;
>>>>>>>>
>>>>>>>>                adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH
>>>>>>>> << 1;
>>>>>>>>
>>>>>>>>
>>>>>>>>             -  adev->irq.ih.faults =
>>>>>>>> kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>>>>>>
>>>>>>>>             -  if (!adev->irq.ih.faults)
>>>>>>>>
>>>>>>>>             -          return -ENOMEM;
>>>>>>>>
>>>>>>>>             - INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>>>>>>
>>>>>>>>             -                   AMDGPU_PAGEFAULT_HASH_BITS, 8,
>>>>>>>> 0);
>>>>>>>>
>>>>>>>>             - spin_lock_init(&adev->irq.ih.faults->lock);
>>>>>>>>
>>>>>>>>             -  adev->irq.ih.faults->count = 0;
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>                r = amdgpu_irq_init(adev);
>>>>>>>>
>>>>>>>>
>>>>>>>>                return r;
>>>>>>>>
>>>>>>>>             @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void
>>>>>>>> *handle)
>>>>>>>>
>>>>>>>>                amdgpu_irq_fini(adev);
>>>>>>>>
>>>>>>>>                amdgpu_ih_ring_fini(adev);
>>>>>>>>
>>>>>>>>
>>>>>>>>             -  kfree(adev->irq.ih.faults);
>>>>>>>>
>>>>>>>>             -  adev->irq.ih.faults = NULL;
>>>>>>>>
>>>>>>>>             -
>>>>>>>>
>>>>>>>>                return 0;
>>>>>>>>
>>>>>>>>              }
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>         _______________________________________________
>>>>>>>>
>>>>>>>>         amd-gfx mailing list
>>>>>>>>
>>>>>>>>         amd-gfx@lists.freedesktop.org
>>>>>>>> <mailto:amd-gfx@lists.freedesktop.org>
>>>>>>>>
>>>>>>>>         https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>
>>>>>>>>
>>>>>>> _______________________________________________
>>>>>>> amd-gfx mailing list
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                                     ` <82fb54c4-8cf2-9033-9fb4-f76e747d7b87-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  2018-09-11 14:09                                       ` Zeng, Oak
@ 2018-09-11 22:00                                       ` Felix Kuehling
       [not found]                                         ` <5b9dc941-3d29-21ca-9f92-ba3512900660-5C7GfCeVMHo@public.gmane.org>
  1 sibling, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2018-09-11 22:00 UTC (permalink / raw)
  To: christian.koenig-5C7GfCeVMHo, Zeng, Oak, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

On 2018-09-11 03:19 AM, Christian König wrote:
> Hi Felix,
>
> let me try to explain the problem on an example:
>
> 1. We have a running job which needs recoverable page faults for
> accessing the process address space.
> 2. We run into memory pressure on VRAM and start to evict things.
> 3. A page tables of the running job is picked up for eviction.
> 4. We schedule a copy command to move the content of the page table to
> system memory.

I think we should change this. If you evict a page table, you don't need
to preserve its contents. You should be able to restore the page table
contents from scratch when you handle the page fault that restores it.

> 5. The new system memory location of the page table is noted in its BO.

You mean in the parent page table? You can just invalidate the entry in
the parent page table and let it fault.

> 6. We get a fault from the job and swap in the page from the process
> address space.

No. Then you need to keep updating page tables that are swapped out.
Instead just discard them and rebuild on fault.

> 7. Now we need to enter the new page address into the page table -> *BAM*
>
> The problem is now that we don't know the address of the page table
> because the current location was replaced with the future location in
> step #5.

You solve that problem by discarding page tables that are swapped out.
Then there is no future location.

>
> We could now argue that we could update the page tables on the fly for
> the evicted page and never wait for the current job to finish, but
> this won't work because not all engines can handle that.

Engines that can't handle page faults will depend on fences to keep
their page tables resident, like we do today. So they won't have this
problem.

Regards,
  Felix

>
> I will circumvent this problem for now by blocking the eviction before
> step #5. The issue with that is that this pipelining is responsible
> for nearly 20% of the fps in some testcases.
>
> I hope that it won't be that bad when I limit disabling the pipelining
> to only page tables, but it is certainly possible that we will get
> pushback on this.
>
> If that happens we probably need to add some flags to limit this
> workaround even more to only the root PD and all the PDs/PTs which are
> involved in recoverable faults.
>
> Regards,
> Christian.
>
> Am 10.09.2018 um 21:10 schrieb Felix Kuehling:
>> I'm not sure why you need to distinguish current and future state when
>> dealing with page faults. When you get a page fault, you know that the
>> GPU is trying to access memory right now, in the present. So you're
>> always working with the current state. When the CPU page table changes,
>> you get an MMU notifier that allows you to invalidate the corresponding
>> GPU PTEs. If you have a valid GPU PTE, it always represents the current
>> states and is in sync with the CPU page table. If the GPU page table is
>> ever outdated, it should have an invalid entry in that place.
>>
>> If SDMA is used to update the GPU PTEs, there is a delay. The MMU
>> notifier is synchronous, so it shouldn't be a problem. You just wait for
>> the SDMA job to complete before returning. When updating PTEs with new
>> valid addresses, it's asynchronous. But the GPU will continue retrying
>> on the invalid entry until SDMA finishes. So it's also implicitly
>> synchronized on the GPU side.
>>
>> Regards,
>>    Felix
>>
>>
>> On 2018-09-10 05:42 AM, Christian König wrote:
>>> Hi Felix & Oak,
>>>
>>> over the weekend I had the idea that we could just use the shadow BOs
>>> to have the current state in a page fault. They are GTT BOs and CPU
>>> accessible anyway.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 08.09.2018 um 09:34 schrieb Christian König:
>>>> Hi Felix,
>>>>
>>>>> But why do you want to update page tables when there is no more user
>>>>> mode context that cares about them? Is this just to allow pending
>>>>> work
>>>>> to complete after the user mode context is gone? To prevent hangs?
>>>> The problem I'm see is that we still don't have a reliable way of
>>>> killing a command submission while making sure that no more faults of
>>>> it are in the flight.
>>>>
>>>> E.g. the HWS has a function to kill a running job on the hardware,
>>>> but that only works for compute and there is no way of telling when
>>>> that operation has finished.
>>>>
>>>> But Oak has already convinced me that we should probably work on that
>>>> instead of trying to keep the VM alive as long as possible.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
>>>>> Hi Christian,
>>>>>
>>>>> I'm not sure I get your point here.
>>>>>
>>>>> As I understand it, the amdgpu_vm structure represents the user
>>>>> mode VM
>>>>> context. It has the pointers to the root page directory and the
>>>>> rest of
>>>>> the page table hierarchy. If there is no amdgpu_vm, there is no user
>>>>> mode context that cares about the page tables.
>>>>>
>>>>> If we leave the page table pointers in the amdgpu_vm, then handling
>>>>> faults after the VM is destroyed is pointless. We can't find the page
>>>>> tables and we can't update them, so there is nothing we can do in
>>>>> response to VM faults.
>>>>>
>>>>> So I'm guessing that you want to move the page table pointers into a
>>>>> different structure that exists half-independently of the VM. It
>>>>> would
>>>>> be created when the VM is created (that's where we currently allocate
>>>>> the PASID) but can be destroyed slightly later.
>>>>>
>>>>> But why do you want to update page tables when there is no more user
>>>>> mode context that cares about them? Is this just to allow pending
>>>>> work
>>>>> to complete after the user mode context is gone? To prevent hangs?
>>>>>
>>>>> Regards,
>>>>>     Felix
>>>>>
>>>>> On 2018-09-10 07:20 AM, Christian König wrote:
>>>>>>> Am I right that the handling of page fault need a valid VM?
>>>>>> No, exactly that view is incorrect.
>>>>>>
>>>>>> The VM is meant to be a memory management structure of page
>>>>>> tables and
>>>>>> is completely pointless for fault processing because it represents the
>>>>>> future state of the page tables and not the current one.
>>>>>>
>>>>>> What we need for fault processing is a new structure built around
>>>>>> PASIDs which is fed with addresses when page tables are
>>>>>> moved
>>>>>> around.
>>>>>>
>>>>>> Alternatively I hope that we can use the SDMA to walk the page
>>>>>> tables
>>>>>> and update the required entries by just using the address.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>>>>>> Hi Christian,
>>>>>>>
>>>>>>>
>>>>>>> What is the value of delaying the destroy of the hash to after the vm is
>>>>>>> destroyed? Since the vm is destroyed, you basically don’t have enough
>>>>>>> information to page in the correct page to gpuvm. Am I right that
>>>>>>> the handling of page fault need a valid VM? For example, you
>>>>>>> need the
>>>>>>> VM to get the corresponding physical address of the faulty page.
>>>>>>>
>>>>>>>
>>>>>>> After vm is destroyed, the retry interrupt can be directly
>>>>>>> discard as
>>>>>>> you can’t find vm through pasid. You can think this is also one
>>>>>>> kind
>>>>>>> of prescreen.
>>>>>>>
>>>>>>>
>>>>>>> So I am stand for the point that, there is no need to delay the
>>>>>>> destroy of hash to after vm is destroyed. Prescreening hash can be
>>>>>>> destroyed at the time of vm_fini.
>>>>>>>
>>>>>>>
>>>>>>> Thanks,
>>>>>>>
>>>>>>> Oak
>>>>>>>
>>>>>>>
>>>>>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>>>>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng
>>>>>>> <zengshanjun@gmail.com>;
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>>>>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>>>>>>> amdgpu vm
>>>>>>>
>>>>>>>
>>>>>>> Hi Oak,
>>>>>>>
>>>>>>> yeah, but this is still a NAK. Making the hash per PASID was not a
>>>>>>> suggestion but rather a must have.
>>>>>>>
>>>>>>> The VM structures must be destroyed while the processing is still
>>>>>>> ongoing, or otherwise we would not have a clean OOM handling.
>>>>>>>
>>>>>>> The IDR for PASID lockup can be put into amdgpu_ids.c, you can just
>>>>>>> replace the IDA already used there with an IDR.
>>>>>>>
>>>>>>> Since the PASID is already freed up delayed we should have the
>>>>>>> grace
>>>>>>> period for interrupt processing included. If that still isn't
>>>>>>> sufficient we can still add some delayed work for this.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>>>>>
>>>>>>>       Hi Christian,
>>>>>>>
>>>>>>>       In this implementation, fault hash is made per vm, not per
>>>>>>> pasid
>>>>>>>       as suggested, based on below considerations:
>>>>>>>
>>>>>>>         * Delay the destroy of hash introduces more effort like
>>>>>>> how to
>>>>>>>           set the proper grace period after which no retry
>>>>>>> interrupt
>>>>>>>           will be generated. We want to avoid those complication
>>>>>>> if we
>>>>>>>           can.
>>>>>>>         * The  problem of the per vm implementation is that it
>>>>>>> is hard
>>>>>>>           to delay the destroy of fault hash, because when vm is
>>>>>>>           destroyed, prescreen function won't be able to
>>>>>>> retrieve the
>>>>>>>           fault hash. But in this case, the prescreen function can
>>>>>>>           either let the interrupt go through (to gmc) or ignore
>>>>>>> it.
>>>>>>>           From this perspective, it is not very necessary to
>>>>>>> delay the
>>>>>>>           destroy of hash table.
>>>>>>>         * On the other hand, since ASICs after vega10 have the
>>>>>>> HW CAM
>>>>>>>           feature. So the SW IV prescreen is only useful for
>>>>>>> vega10.
>>>>>>>           For all the HW implemented CAM, we can consider the
>>>>>>> delayed
>>>>>>>           CAM acknowledgment.
>>>>>>>         * Making it per pasid need to introduce another idr to
>>>>>>>           correlate pasid and hash table - where to put the idr?
>>>>>>> You
>>>>>>>           will have to make it a global variable which is not very
>>>>>>>           desirable.
>>>>>>>
>>>>>>>       The main purpose of this patch is to solve the inter-process
>>>>>>> lock
>>>>>>>       issue (when hash table is full), while still keep codes
>>>>>>> simple.
>>>>>>>
>>>>>>>       Also with this patch, the faults kfifo is not necessary any
>>>>>>> more.
>>>>>>>       See patch 2.
>>>>>>>
>>>>>>>       Regards,
>>>>>>>
>>>>>>>       Oak
>>>>>>>
>>>>>>>
>>>>>>>       On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>>>>>
>>>>>>>           In stead of share one fault hash table per device,
>>>>>>> make it
>>>>>>>
>>>>>>>           per vm. This can avoid inter-process lock issue when
>>>>>>> fault
>>>>>>>
>>>>>>>           hash table is full.
>>>>>>>
>>>>>>>
>>>>>>>           Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>>>>>
>>>>>>>           Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>>>>>>> <mailto:Oak.Zeng@amd.com>
>>>>>>>
>>>>>>>           Suggested-by: Christian Konig <Christian.Koenig@amd.com>
>>>>>>> <mailto:Christian.Koenig@amd.com>
>>>>>>>
>>>>>>>           Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>>>> <mailto:Felix.Kuehling@amd.com>
>>>>>>>
>>>>>>>           ---
>>>>>>>
>>>>>>>            drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75
>>>>>>> ------------------------
>>>>>>>
>>>>>>>            drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>>>>>
>>>>>>>            drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102
>>>>>>> ++++++++++++++++++++++++++++++++-
>>>>>>>
>>>>>>>            drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>>>>>
>>>>>>>            drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38
>>>>>>> +++++-------
>>>>>>>
>>>>>>>            5 files changed, 127 insertions(+), 110 deletions(-)
>>>>>>>
>>>>>>>
>>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>
>>>>>>>           index 06373d4..4ed8621 100644
>>>>>>>
>>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>
>>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>
>>>>>>>           @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct
>>>>>>> amdgpu_device *adev)
>>>>>>>
>>>>>>>              return IRQ_HANDLED;
>>>>>>>
>>>>>>>            }
>>>>>>>
>>>>>>>
>>>>>>>           -/**
>>>>>>>
>>>>>>>           - * amdgpu_ih_add_fault - Add a page fault record
>>>>>>>
>>>>>>>           - *
>>>>>>>
>>>>>>>           - * @adev: amdgpu device pointer
>>>>>>>
>>>>>>>           - * @key: 64-bit encoding of PASID and address
>>>>>>>
>>>>>>>           - *
>>>>>>>
>>>>>>>           - * This should be called when a retry page fault
>>>>>>> interrupt is
>>>>>>>
>>>>>>>           - * received. If this is a new page fault, it will be
>>>>>>> added to a hash
>>>>>>>
>>>>>>>           - * table. The return value indicates whether this is a
>>>>>>> new fault, or
>>>>>>>
>>>>>>>           - * a fault that was already known and is already being
>>>>>>> handled.
>>>>>>>
>>>>>>>           - *
>>>>>>>
>>>>>>>           - * If there are too many pending page faults, this will
>>>>>>> fail. Retry
>>>>>>>
>>>>>>>           - * interrupts should be ignored in this case until there
>>>>>>> is enough
>>>>>>>
>>>>>>>           - * free space.
>>>>>>>
>>>>>>>           - *
>>>>>>>
>>>>>>>           - * Returns 0 if the fault was added, 1 if the fault was
>>>>>>> already known,
>>>>>>>
>>>>>>>           - * -ENOSPC if there are too many pending faults.
>>>>>>>
>>>>>>>           - */
>>>>>>>
>>>>>>>           -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64
>>>>>>> key)
>>>>>>>
>>>>>>>           -{
>>>>>>>
>>>>>>>           -  unsigned long flags;
>>>>>>>
>>>>>>>           -  int r = -ENOSPC;
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>           -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>>>>>
>>>>>>>           -          /* Should be allocated in <IP>_ih_sw_init on
>>>>>>> GPUs that
>>>>>>>
>>>>>>>           -           * support retry faults and require retry
>>>>>>> filtering.
>>>>>>>
>>>>>>>           -           */
>>>>>>>
>>>>>>>           -          return r;
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>           - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>           -  /* Only let the hash table fill up to 50% for best
>>>>>>> performance */
>>>>>>>
>>>>>>>           -  if (adev->irq.ih.faults->count >= (1 <<
>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>
>>>>>>>           -          goto unlock_out;
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>           -  r = chash_table_copy_in(&adev->irq.ih.faults->hash,
>>>>>>> key, NULL);
>>>>>>>
>>>>>>>           -  if (!r)
>>>>>>>
>>>>>>>           -          adev->irq.ih.faults->count++;
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>           -  /* chash_table_copy_in should never fail unless we're
>>>>>>> losing count */
>>>>>>>
>>>>>>>           -  WARN_ON_ONCE(r < 0);
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>           -unlock_out:
>>>>>>>
>>>>>>>           - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>> flags);
>>>>>>>
>>>>>>>           -  return r;
>>>>>>>
>>>>>>>           -}
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>           -/**
>>>>>>>
>>>>>>>           - * amdgpu_ih_clear_fault - Remove a page fault record
>>>>>>>
>>>>>>>           - *
>>>>>>>
>>>>>>>           - * @adev: amdgpu device pointer
>>>>>>>
>>>>>>>           - * @key: 64-bit encoding of PASID and address
>>>>>>>
>>>>>>>           - *
>>>>>>>
>>>>>>>           - * This should be called when a page fault has been
>>>>>>> handled. Any
>>>>>>>
>>>>>>>           - * future interrupt with this key will be processed as a
>>>>>>> new
>>>>>>>
>>>>>>>           - * page fault.
>>>>>>>
>>>>>>>           - */
>>>>>>>
>>>>>>>           -void amdgpu_ih_clear_fault(struct amdgpu_device *adev,
>>>>>>> u64 key)
>>>>>>>
>>>>>>>           -{
>>>>>>>
>>>>>>>           -  unsigned long flags;
>>>>>>>
>>>>>>>           -  int r;
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>           -  if (!adev->irq.ih.faults)
>>>>>>>
>>>>>>>           -          return;
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>           - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>           -  r = chash_table_remove(&adev->irq.ih.faults->hash,
>>>>>>> key, NULL);
>>>>>>>
>>>>>>>           -  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>
>>>>>>>           -          adev->irq.ih.faults->count--;
>>>>>>>
>>>>>>>           - WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>>>>>
>>>>>>>           -  }
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>           - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>> flags);
>>>>>>>
>>>>>>>           -}
>>>>>>>
>>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>
>>>>>>>           index a23e1c0..f411ffb 100644
>>>>>>>
>>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>
>>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>
>>>>>>>           @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>>>>>
>>>>>>>            #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>>>>>
>>>>>>>            #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>>>>>
>>>>>>>
>>>>>>>           -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>
>>>>>>>           -struct amdgpu_retryfault_hashtable {
>>>>>>>
>>>>>>>           -  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>>> 8, 0);
>>>>>>>
>>>>>>>           -  spinlock_t     lock;
>>>>>>>
>>>>>>>           -  int            count;
>>>>>>>
>>>>>>>           -};
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>            /*
>>>>>>>
>>>>>>>             * R6xx+ IH ring
>>>>>>>
>>>>>>>             */
>>>>>>>
>>>>>>>           @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>>>>>
>>>>>>>              bool                   use_doorbell;
>>>>>>>
>>>>>>>              bool                   use_bus_addr;
>>>>>>>
>>>>>>>              dma_addr_t             rb_dma_addr; /* only used when
>>>>>>> use_bus_addr = true */
>>>>>>>
>>>>>>>           -  struct amdgpu_retryfault_hashtable *faults;
>>>>>>>
>>>>>>>            };
>>>>>>>
>>>>>>>
>>>>>>>            #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>>>>>
>>>>>>>           @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct
>>>>>>> amdgpu_device *adev, unsigned ring_size,
>>>>>>>
>>>>>>>                              bool use_bus_addr);
>>>>>>>
>>>>>>>            void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>>>>>>
>>>>>>>            int amdgpu_ih_process(struct amdgpu_device *adev);
>>>>>>>
>>>>>>>           -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64
>>>>>>> key);
>>>>>>>
>>>>>>>           -void amdgpu_ih_clear_fault(struct amdgpu_device *adev,
>>>>>>> u64 key);
>>>>>>>
>>>>>>>
>>>>>>>            #endif
>>>>>>>
>>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>
>>>>>>>           index 1d7e3c1..8b220e9 100644
>>>>>>>
>>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>
>>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>
>>>>>>>           @@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct
>>>>>>> amdgpu_device *adev, uint32_t min_vm_size,
>>>>>>>
>>>>>>>                       adev->vm_manager.fragment_size);
>>>>>>>
>>>>>>>            }
>>>>>>>
>>>>>>>
>>>>>>>           +static struct amdgpu_retryfault_hashtable
>>>>>>> *init_fault_hash(void)
>>>>>>>
>>>>>>>           +{
>>>>>>>
>>>>>>>           +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
>>>>>>>
>>>>>>>           +  if (!fault_hash)
>>>>>>>
>>>>>>>           +          return fault_hash;
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +  INIT_CHASH_TABLE(fault_hash->hash,
>>>>>>>
>>>>>>>           +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>
>>>>>>>           +  spin_lock_init(&fault_hash->lock);
>>>>>>>
>>>>>>>           +  fault_hash->count = 0;
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +  return fault_hash;
>>>>>>>
>>>>>>>           +}
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>            /**
>>>>>>>
>>>>>>>             * amdgpu_vm_init - initialize a vm instance
>>>>>>>
>>>>>>>             *
>>>>>>>
>>>>>>>           @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct
>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>>>>
>>>>>>>                      vm->pasid = pasid;
>>>>>>>
>>>>>>>              }
>>>>>>>
>>>>>>>
>>>>>>>           +  vm->fault_hash = init_fault_hash();
>>>>>>>
>>>>>>>           +  if (!vm->fault_hash) {
>>>>>>>
>>>>>>>           +          r = -ENOMEM;
>>>>>>>
>>>>>>>           +          goto error_free_root;
>>>>>>>
>>>>>>>           +  }
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>              INIT_KFIFO(vm->faults);
>>>>>>>
>>>>>>>              vm->fault_credit = 16;
>>>>>>>
>>>>>>>
>>>>>>>           @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct
>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>
>>>>>>>
>>>>>>>              /* Clear pending page faults from IH when the VM is
>>>>>>> destroyed */
>>>>>>>
>>>>>>>              while (kfifo_get(&vm->faults, &fault))
>>>>>>>
>>>>>>>           -          amdgpu_ih_clear_fault(adev, fault);
>>>>>>>
>>>>>>>           +          amdgpu_vm_clear_fault(vm->fault_hash, fault);
>>>>>>>
>>>>>>>
>>>>>>>              if (vm->pasid) {
>>>>>>>
>>>>>>>                      unsigned long flags;
>>>>>>>
>>>>>>>           @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct
>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>
>>>>>>> spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>>>>>
>>>>>>>              }
>>>>>>>
>>>>>>>
>>>>>>>           +  kfree(vm->fault_hash);
>>>>>>>
>>>>>>>           +  vm->fault_hash = NULL;
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>              drm_sched_entity_destroy(&vm->entity);
>>>>>>>
>>>>>>>
>>>>>>>              if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>>>>>
>>>>>>>           @@ -3183,3 +3208,78 @@ void
>>>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>>>>>
>>>>>>>                      }
>>>>>>>
>>>>>>>              }
>>>>>>>
>>>>>>>            }
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +/**
>>>>>>>
>>>>>>>           + * amdgpu_vm_add_fault - Add a page fault record to
>>>>>>> fault hash table
>>>>>>>
>>>>>>>           + *
>>>>>>>
>>>>>>>           + * @fault_hash: fault hash table
>>>>>>>
>>>>>>>           + * @key: 64-bit encoding of PASID and address
>>>>>>>
>>>>>>>           + *
>>>>>>>
>>>>>>>           + * This should be called when a retry page fault
>>>>>>> interrupt is
>>>>>>>
>>>>>>>           + * received. If this is a new page fault, it will be
>>>>>>> added to a hash
>>>>>>>
>>>>>>>           + * table. The return value indicates whether this is a
>>>>>>> new fault, or
>>>>>>>
>>>>>>>           + * a fault that was already known and is already being
>>>>>>> handled.
>>>>>>>
>>>>>>>           + *
>>>>>>>
>>>>>>>           + * If there are too many pending page faults, this will
>>>>>>> fail. Retry
>>>>>>>
>>>>>>>           + * interrupts should be ignored in this case until there
>>>>>>> is enough
>>>>>>>
>>>>>>>           + * free space.
>>>>>>>
>>>>>>>           + *
>>>>>>>
>>>>>>>           + * Returns 0 if the fault was added, 1 if the fault was
>>>>>>> already known,
>>>>>>>
>>>>>>>           + * -ENOSPC if there are too many pending faults.
>>>>>>>
>>>>>>>           + */
>>>>>>>
>>>>>>>           +int amdgpu_vm_add_fault(struct
>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>
>>>>>>>           +{
>>>>>>>
>>>>>>>           +  unsigned long flags;
>>>>>>>
>>>>>>>           +  int r = -ENOSPC;
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +  if (WARN_ON_ONCE(!fault_hash))
>>>>>>>
>>>>>>>           +          /* Should be allocated in amdgpu_vm_init
>>>>>>>
>>>>>>>           +           */
>>>>>>>
>>>>>>>           +          return r;
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +  /* Only let the hash table fill up to 50% for best
>>>>>>> performance */
>>>>>>>
>>>>>>>           +  if (fault_hash->count >= (1 <<
>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>
>>>>>>>           +          goto unlock_out;
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +  r = chash_table_copy_in(&fault_hash->hash, key, NULL);
>>>>>>>
>>>>>>>           +  if (!r)
>>>>>>>
>>>>>>>           +          fault_hash->count++;
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +  /* chash_table_copy_in should never fail unless we're
>>>>>>> losing count */
>>>>>>>
>>>>>>>           +  WARN_ON_ONCE(r < 0);
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +unlock_out:
>>>>>>>
>>>>>>>           +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>
>>>>>>>           +  return r;
>>>>>>>
>>>>>>>           +}
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +/**
>>>>>>>
>>>>>>>           + * amdgpu_vm_clear_fault - Remove a page fault record
>>>>>>>
>>>>>>>           + *
>>>>>>>
>>>>>>>           + * @fault_hash: fault hash table
>>>>>>>
>>>>>>>           + * @key: 64-bit encoding of PASID and address
>>>>>>>
>>>>>>>           + *
>>>>>>>
>>>>>>>           + * This should be called when a page fault has been
>>>>>>> handled. Any
>>>>>>>
>>>>>>>           + * future interrupt with this key will be processed as a
>>>>>>> new
>>>>>>>
>>>>>>>           + * page fault.
>>>>>>>
>>>>>>>           + */
>>>>>>>
>>>>>>>           +void amdgpu_vm_clear_fault(struct
>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>
>>>>>>>           +{
>>>>>>>
>>>>>>>           +  unsigned long flags;
>>>>>>>
>>>>>>>           +  int r;
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +  if (!fault_hash)
>>>>>>>
>>>>>>>           +          return;
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +  r = chash_table_remove(&fault_hash->hash, key, NULL);
>>>>>>>
>>>>>>>           +  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>
>>>>>>>           +          fault_hash->count--;
>>>>>>>
>>>>>>>           +          WARN_ON_ONCE(fault_hash->count < 0);
>>>>>>>
>>>>>>>           +  }
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>
>>>>>>>           +}
>>>>>>>
>>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>
>>>>>>>           index e275ee7..6eb1da1 100644
>>>>>>>
>>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>
>>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>
>>>>>>>           @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>>>>>
>>>>>>>              pid_t   tgid;
>>>>>>>
>>>>>>>            };
>>>>>>>
>>>>>>>
>>>>>>>           +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>
>>>>>>>           +struct amdgpu_retryfault_hashtable {
>>>>>>>
>>>>>>>           +  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>>> 8, 0);
>>>>>>>
>>>>>>>           +  spinlock_t     lock;
>>>>>>>
>>>>>>>           +  int            count;
>>>>>>>
>>>>>>>           +};
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>            struct amdgpu_vm {
>>>>>>>
>>>>>>>              /* tree of virtual addresses mapped */
>>>>>>>
>>>>>>>              struct rb_root_cached  va;
>>>>>>>
>>>>>>>           @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>>>>>
>>>>>>>              struct ttm_lru_bulk_move lru_bulk_move;
>>>>>>>
>>>>>>>              /* mark whether can do the bulk move */
>>>>>>>
>>>>>>>              bool                   bulk_moveable;
>>>>>>>
>>>>>>>           +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>
>>>>>>>            };
>>>>>>>
>>>>>>>
>>>>>>>            struct amdgpu_vm_manager {
>>>>>>>
>>>>>>>           @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct
>>>>>>> amdgpu_vm *vm);
>>>>>>>
>>>>>>>            void amdgpu_vm_move_to_lru_tail(struct amdgpu_device
>>>>>>> *adev,
>>>>>>>
>>>>>>>                                     struct amdgpu_vm *vm);
>>>>>>>
>>>>>>>
>>>>>>>           +int amdgpu_vm_add_fault(struct
>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +void amdgpu_vm_clear_fault(struct
>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>            #endif
>>>>>>>
>>>>>>>           diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>
>>>>>>>           index 5ae5ed2..acbe5a7 100644
>>>>>>>
>>>>>>>           --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>
>>>>>>>           +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>
>>>>>>>           @@ -265,35 +265,36 @@ static bool
>>>>>>> vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>>>>>
>>>>>>>                      return true;
>>>>>>>
>>>>>>>              }
>>>>>>>
>>>>>>>
>>>>>>>           -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>
>>>>>>>           -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>
>>>>>>>           -  r = amdgpu_ih_add_fault(adev, key);
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>           -  /* Hash table is full or the fault is already being
>>>>>>> processed,
>>>>>>>
>>>>>>>           -   * ignore further page faults
>>>>>>>
>>>>>>>           -   */
>>>>>>>
>>>>>>>           -  if (r != 0)
>>>>>>>
>>>>>>>           -          goto ignore_iv;
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>              /* Track retry faults in per-VM fault FIFO. */
>>>>>>>
>>>>>>>              spin_lock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>              vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>>>>>
>>>>>>>           +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>
>>>>>>>           +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>
>>>>>>>              if (!vm) {
>>>>>>>
>>>>>>>                      /* VM not found, process it normally */
>>>>>>>
>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>           -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>
>>>>>>>                      return true;
>>>>>>>
>>>>>>>           +  } else {
>>>>>>>
>>>>>>>           +          r = amdgpu_vm_add_fault(vm->fault_hash, key);
>>>>>>>
>>>>>>>           +
>>>>>>>
>>>>>>>           +          /* Hash table is full or the fault is already
>>>>>>> being processed,
>>>>>>>
>>>>>>>           +           * ignore further page faults
>>>>>>>
>>>>>>>           +           */
>>>>>>>
>>>>>>>           +          if (r != 0) {
>>>>>>>
>>>>>>>           + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>           +                  goto ignore_iv;
>>>>>>>
>>>>>>>           +          }
>>>>>>>
>>>>>>>              }
>>>>>>>
>>>>>>>              /* No locking required with single writer and single
>>>>>>> reader */
>>>>>>>
>>>>>>>              r = kfifo_put(&vm->faults, key);
>>>>>>>
>>>>>>>              if (!r) {
>>>>>>>
>>>>>>>                      /* FIFO is full. Ignore it until there is
>>>>>>> space */
>>>>>>>
>>>>>>>           +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>>>>>>>
>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>           -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>
>>>>>>>                      goto ignore_iv;
>>>>>>>
>>>>>>>              }
>>>>>>>
>>>>>>>           - spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>
>>>>>>>           + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>
>>>>>>>              /* It's the first fault for this address, process it
>>>>>>> normally */
>>>>>>>
>>>>>>>              return true;
>>>>>>>
>>>>>>>
>>>>>>>           @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void
>>>>>>> *handle)
>>>>>>>
>>>>>>>              adev->irq.ih.use_doorbell = true;
>>>>>>>
>>>>>>>              adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH
>>>>>>> << 1;
>>>>>>>
>>>>>>>
>>>>>>>           -  adev->irq.ih.faults =
>>>>>>> kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>>>>>
>>>>>>>           -  if (!adev->irq.ih.faults)
>>>>>>>
>>>>>>>           -          return -ENOMEM;
>>>>>>>
>>>>>>>           - INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>>>>>
>>>>>>>           -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>
>>>>>>>           - spin_lock_init(&adev->irq.ih.faults->lock);
>>>>>>>
>>>>>>>           -  adev->irq.ih.faults->count = 0;
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>              r = amdgpu_irq_init(adev);
>>>>>>>
>>>>>>>
>>>>>>>              return r;
>>>>>>>
>>>>>>>           @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void
>>>>>>> *handle)
>>>>>>>
>>>>>>>              amdgpu_irq_fini(adev);
>>>>>>>
>>>>>>>              amdgpu_ih_ring_fini(adev);
>>>>>>>
>>>>>>>
>>>>>>>           -  kfree(adev->irq.ih.faults);
>>>>>>>
>>>>>>>           -  adev->irq.ih.faults = NULL;
>>>>>>>
>>>>>>>           -
>>>>>>>
>>>>>>>              return 0;
>>>>>>>
>>>>>>>            }
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>       _______________________________________________
>>>>>>>
>>>>>>>       amd-gfx mailing list
>>>>>>>
>>>>>>>       amd-gfx@lists.freedesktop.org
>>>>>>> <mailto:amd-gfx@lists.freedesktop.org>
>>>>>>>
>>>>>>>       https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>
>>>>>>>
>>>>>>
>>>>>> _______________________________________________
>>>>>> amd-gfx mailing list
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                                         ` <5b9dc941-3d29-21ca-9f92-ba3512900660-5C7GfCeVMHo@public.gmane.org>
@ 2018-09-12  6:56                                           ` Christian König
       [not found]                                             ` <ea317e02-546b-b296-2456-542e86d62b6e-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Christian König @ 2018-09-12  6:56 UTC (permalink / raw)
  To: Felix Kuehling, Zeng, Oak, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

Am 12.09.2018 um 00:00 schrieb Felix Kuehling:
> On 2018-09-11 03:19 AM, Christian König wrote:
>> Hi Felix,
>>
>> let me try to explain the problem on an example:
>>
>> 1. We have a running job which needs recoverable page faults for
>> accessing the process address space.
>> 2. We run into memory pressure on VRAM and start to evict things.
>> 3. A page tables of the running job is picked up for eviction.
>> 4. We schedule a copy command to move the content of the page table to
>> system memory.
> I think we should change this. If you evict a page table, you don't need
> to preserve its contents. You should be able to restore the page table
> contents from scratch when you handle the page fault that restores it.

Yeah, already implemented. You actually don't need the page fault for that.

>> 5. The new system memory location of the page table is noted in its BO.
> You mean in the parent page table? You can just invalidate the entry in
> the parent page table and let it fault.

I'm repeating myself, but exactly that is what won't work.

See, we still have engines which can't handle page faults but which use the
same VM at the same time. This means that we can't just fault in page
tables.

What we could do is to separate the address space into a low and a high 
range where we have different handling for both.

I've already prototyped this and prepared most of what is necessary
for that, but it is still not 100% complete.

Regards,
Christian.

>> 6. We get a fault from the job and swap in the page from the process
>> address space.
> No. Then you need to keep updating page tables that are swapped out.
> Instead just discard them and rebuild on fault.
>
>> 7. Now we need to enter the new page address into the page table -> *BAM*
>>
>> The problem is now that we don't know the address of the page table
>> because the current location was replaced with the future location in
>> step #5.
> You solve that problem by discarding page tables that are swapped out.
> Then there is no future location.
>
>> We could now argue that we could update the page tables on the fly for
>> the evicted page and never wait for the current job to finish, but
>> this won't work because not all engines can handle that.
> Engines that can't handle page faults will depend on fences to keep
> their page tables resident, like we do today. So they won't have this
> problem.
>
> Regards,
>    Felix
>
>> I will circumvent this problem for now by blocking the eviction before
>> step #5. The issue with that is that this pipelining is responsible
>> for nearly 20% of the fps in some testcases.
>>
>> I hope that it won't be that bad when I limit disabling the pipelining
>> to only page tables, but it is certainly possible that we will get
>> pushback on this.
>>
>> If that happens we probably need to add some flags to limit this
>> workaround even more to only the root PD and all the PDs/PTs which are
>> involved in recoverable faults.
>>
>> Regards,
>> Christian.
>>
>> Am 10.09.2018 um 21:10 schrieb Felix Kuehling:
>>> I'm not sure why you need to distinguish current and future state when
>>> dealing with page faults. When you get a page fault, you know that the
>>> GPU is trying to access memory right now, in the present. So you're
>>> always working with the current state. When the CPU page table changes,
>>> you get an MMU notifier that allows you to invalidate the corresponding
>>> GPU PTEs. If you have a valid GPU PTE, it always represents the current
>>> states and is in sync with the CPU page table. If the GPU page table is
>>> ever outdated, it should have an invalid entry in that place.
>>>
>>> If SDMA is used to update the GPU PTEs, there is a delay. The MMU
>>> notifier is synchronous, so it shouldn't be a problem. You just wait for
>>> the SDMA job to complete before returning. When updating PTEs with new
>>> valid addresses, it's asynchronous. But the GPU will continue retrying
>>> on the invalid entry until SDMA finishes. So it's also implicitly
>>> synchronized on the GPU side.
>>>
>>> Regards,
>>>     Felix
>>>
>>>
>>> On 2018-09-10 05:42 AM, Christian König wrote:
>>>> Hi Felix & Oak,
>>>>
>>>> over the weekend I had the idea that we could just use the shadow BOs
>>>> to have the current state in a page fault. They are GTT BOs and CPU
>>>> accessible anyway.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 08.09.2018 um 09:34 schrieb Christian König:
>>>>> Hi Felix,
>>>>>
>>>>>> But why do you want to update page tables when there is no more user
>>>>>> mode context that cares about them? Is this just to allow pending
>>>>>> work
>>>>>> to complete after the user mode context is gone? To prevent hangs?
>>>>> The problem I'm see is that we still don't have a reliable way of
>>>>> killing a command submission while making sure that no more faults of
>>>>> it are in the flight.
>>>>>
>>>>> E.g. the HWS has a function to kill a running job on the hardware,
>>>>> but that only works for compute and there is no way of telling when
>>>>> that operation has finished.
>>>>>
>>>>> But Oak has already convinced me that we should probably work on that
>>>>> instead of trying to keep the VM alive as long as possible.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
>>>>>> Hi Christian,
>>>>>>
>>>>>> I'm not sure I get your point here.
>>>>>>
>>>>>> As I understand it, the amdgpu_vm structure represents the user
>>>>>> mode VM
>>>>>> context. It has the pointers to the root page directory and the
>>>>>> rest of
>>>>>> the page table hierarchy. If there is no amdgpu_vm, there is no user
>>>>>> mode context that cares about the page tables.
>>>>>>
>>>>>> If we leave the page table pointers in the amdgpu_vm, then handling
>>>>>> faults after the VM is destroyed is pointless. We can't find the page
>>>>>> tables and we can't update them, so there is nothing we can do in
>>>>>> response to VM faults.
>>>>>>
>>>>>> So I'm guessing that you want to move the page table pointers into a
>>>>>> different structure that exists half-independently of the VM. It
>>>>>> would
>>>>>> be created when the VM is created (that's where we currently allocate
>>>>>> the PASID) but can be destroyed slightly later.
>>>>>>
>>>>>> But why do you want to update page tables when there is no more user
>>>>>> mode context that cares about them? Is this just to allow pending
>>>>>> work
>>>>>> to complete after the user mode context is gone? To prevent hangs?
>>>>>>
>>>>>> Regards,
>>>>>>      Felix
>>>>>>
>>>>>> On 2018-09-10 07:20 AM, Christian König wrote:
>>>>>>>> Am I right that the handling of page fault need a valid VM?
>>>>>>> No, exactly that view is incorrect.
>>>>>>>
>>>>>>> The VM is meant to be a memory management structure of page
>>>>>>> tables and
>>>>>>> is completely pointless fault processing because it represent the
>>>>>>> future state of the page tables and not the current one.
>>>>>>>
>>>>>>> What we need for fault processing is a new structure build around
>>>>>>> PASIDs which is feed by the with addresses when page tables are
>>>>>>> moved
>>>>>>> around.
>>>>>>>
>>>>>>> Alternatively I hope that we can use the SDMA to walk the page
>>>>>>> tables
>>>>>>> and update the required entries by just using the address.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>>>>>>> Hi Christian,
>>>>>>>>
>>>>>>>>
>>>>>>>> What is the value of delay the destroy of hash to after vm is
>>>>>>>> destroyed? Since vm is destroyed, you basically don’t have enough
>>>>>>>> information to paging in the correct page to gpuvm. Am I right that
>>>>>>>> the handling of page fault need a valid VM? For example, you
>>>>>>>> need the
>>>>>>>> VM to get the corresponding physical address of the faulty page.
>>>>>>>>
>>>>>>>>
>>>>>>>> After vm is destroyed, the retry interrupt can be directly
>>>>>>>> discard as
>>>>>>>> you can’t find vm through pasid. You can think this is also one
>>>>>>>> kind
>>>>>>>> of prescreen.
>>>>>>>>
>>>>>>>>
>>>>>>>> So I am stand for the point that, there is no need to delay the
>>>>>>>> destroy of hash to after vm is destroyed. Prescreening hash can be
>>>>>>>> destroyed at the time of vm_fini.
>>>>>>>>
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>>
>>>>>>>> Oak
>>>>>>>>
>>>>>>>>
>>>>>>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>>>>>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng
>>>>>>>> <zengshanjun@gmail.com>;
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>>>>>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>>>>>>>> amdgpu vm
>>>>>>>>
>>>>>>>>
>>>>>>>> Hi Oak,
>>>>>>>>
>>>>>>>> yeah, but this is still a NAK. Making the hash per PASID was not a
>>>>>>>> suggestion but rather a must have.
>>>>>>>>
>>>>>>>> The VM structures must be destroyed while the processing is still
>>>>>>>> ongoing, or otherwise we would not have a clean OOM handling.
>>>>>>>>
>>>>>>>> The IDR for PASID lockup can be put into amdgpu_ids.c, you can just
>>>>>>>> replace the IDA already used there with an IDR.
>>>>>>>>
>>>>>>>> Since the PASID is already freed up delayed we should have the
>>>>>>>> grace
>>>>>>>> period for interrupt processing included. If that still isn't
>>>>>>>> sufficient we can still add some delayed work for this.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>>>>>>
>>>>>>>>        Hi Christian,
>>>>>>>>
>>>>>>>>        In this implementation, fault hash is made per vm, not per
>>>>>>>> pasid
>>>>>>>>        as suggested, based on below considerations:
>>>>>>>>
>>>>>>>>          * Delaying the destruction of the hash introduces more
>>>>>>>>            effort, such as how to set the proper grace period
>>>>>>>>            after which no retry interrupt
>>>>>>>>            will be generated. We want to avoid that complication
>>>>>>>> if we
>>>>>>>>            can.
>>>>>>>>          * The  problem of the per vm implementation is that it
>>>>>>>> is hard
>>>>>>>>            to delay the destroy of fault hash, because when vm is
>>>>>>>>            destroyed, prescreen function won't be able to
>>>>>>>> retrieve the
>>>>>>>>            fault hash. But in this case, the prescreen function can
>>>>>>>>            either let the interrupt go through (to gmc) or ignore
>>>>>>>> it.
>>>>>>>>            From this perspective, it is not very necessary to
>>>>>>>> delay the
>>>>>>>>            destroy of hash table.
>>>>>>>>          * On the other hand, ASICs after vega10 have the
>>>>>>>> HW CAM
>>>>>>>>            feature, so the SW IV prescreen is only useful for
>>>>>>>> vega10.
>>>>>>>>            For all the HW-implemented CAMs, we can consider the
>>>>>>>> delayed
>>>>>>>>            CAM acknowledgment.
>>>>>>>>          * Making it per pasid need to introduce another idr to
>>>>>>>>            correlate pasid and hash table - where to put the idr?
>>>>>>>> You
>>>>>>>>            will have to make it a global variable which is not very
>>>>>>>>            desirable.
>>>>>>>>
>>>>>>>>        The main purpose of this patch is to solve the inter-process
>>>>>>>> lock
>>>>>>>>        issue (when hash table is full), while still keep codes
>>>>>>>> simple.
>>>>>>>>
>>>>>>>>        Also with this patch, the faults kfifo is not necessary any
>>>>>>>> more.
>>>>>>>>        See patch 2.
>>>>>>>>
>>>>>>>>        Regards,
>>>>>>>>
>>>>>>>>        Oak
>>>>>>>>
>>>>>>>>
>>>>>>>>        On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>>>>>>
>>>>>>>>            In stead of share one fault hash table per device,
>>>>>>>> make it
>>>>>>>>
>>>>>>>>            per vm. This can avoid inter-process lock issue when
>>>>>>>> fault
>>>>>>>>
>>>>>>>>            hash table is full.
>>>>>>>>
>>>>>>>>
>>>>>>>>            Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>>>>>>
>>>>>>>>            Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>>>>>>>> <mailto:Oak.Zeng@amd.com>
>>>>>>>>
>>>>>>>>            Suggested-by: Christian Konig <Christian.Koenig@amd.com>
>>>>>>>> <mailto:Christian.Koenig@amd.com>
>>>>>>>>
>>>>>>>>            Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>>>>> <mailto:Felix.Kuehling@amd.com>
>>>>>>>>
>>>>>>>>            ---
>>>>>>>>
>>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75
>>>>>>>> ------------------------
>>>>>>>>
>>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>>>>>>
>>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102
>>>>>>>> ++++++++++++++++++++++++++++++++-
>>>>>>>>
>>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>>>>>>
>>>>>>>>             drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38
>>>>>>>> +++++-------
>>>>>>>>
>>>>>>>>             5 files changed, 127 insertions(+), 110 deletions(-)
>>>>>>>>
>>>>>>>>
>>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>
>>>>>>>>            index 06373d4..4ed8621 100644
>>>>>>>>
>>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>
>>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>
>>>>>>>>            @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct
>>>>>>>> amdgpu_device *adev)
>>>>>>>>
>>>>>>>>               return IRQ_HANDLED;
>>>>>>>>
>>>>>>>>             }
>>>>>>>>
>>>>>>>>
>>>>>>>>            -/**
>>>>>>>>
>>>>>>>>            - * amdgpu_ih_add_fault - Add a page fault record
>>>>>>>>
>>>>>>>>            - *
>>>>>>>>
>>>>>>>>            - * @adev: amdgpu device pointer
>>>>>>>>
>>>>>>>>            - * @key: 64-bit encoding of PASID and address
>>>>>>>>
>>>>>>>>            - *
>>>>>>>>
>>>>>>>>            - * This should be called when a retry page fault
>>>>>>>> interrupt is
>>>>>>>>
>>>>>>>>            - * received. If this is a new page fault, it will be
>>>>>>>> added to a hash
>>>>>>>>
>>>>>>>>            - * table. The return value indicates whether this is a
>>>>>>>> new fault, or
>>>>>>>>
>>>>>>>>            - * a fault that was already known and is already being
>>>>>>>> handled.
>>>>>>>>
>>>>>>>>            - *
>>>>>>>>
>>>>>>>>            - * If there are too many pending page faults, this will
>>>>>>>> fail. Retry
>>>>>>>>
>>>>>>>>            - * interrupts should be ignored in this case until there
>>>>>>>> is enough
>>>>>>>>
>>>>>>>>            - * free space.
>>>>>>>>
>>>>>>>>            - *
>>>>>>>>
>>>>>>>>            - * Returns 0 if the fault was added, 1 if the fault was
>>>>>>>> already known,
>>>>>>>>
>>>>>>>>            - * -ENOSPC if there are too many pending faults.
>>>>>>>>
>>>>>>>>            - */
>>>>>>>>
>>>>>>>>            -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64
>>>>>>>> key)
>>>>>>>>
>>>>>>>>            -{
>>>>>>>>
>>>>>>>>            -  unsigned long flags;
>>>>>>>>
>>>>>>>>            -  int r = -ENOSPC;
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>            -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>>>>>>
>>>>>>>>            -          /* Should be allocated in <IP>_ih_sw_init on
>>>>>>>> GPUs that
>>>>>>>>
>>>>>>>>            -           * support retry faults and require retry
>>>>>>>> filtering.
>>>>>>>>
>>>>>>>>            -           */
>>>>>>>>
>>>>>>>>            -          return r;
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>            - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>            -  /* Only let the hash table fill up to 50% for best
>>>>>>>> performance */
>>>>>>>>
>>>>>>>>            -  if (adev->irq.ih.faults->count >= (1 <<
>>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>>
>>>>>>>>            -          goto unlock_out;
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>            -  r = chash_table_copy_in(&adev->irq.ih.faults->hash,
>>>>>>>> key, NULL);
>>>>>>>>
>>>>>>>>            -  if (!r)
>>>>>>>>
>>>>>>>>            -          adev->irq.ih.faults->count++;
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>            -  /* chash_table_copy_in should never fail unless we're
>>>>>>>> losing count */
>>>>>>>>
>>>>>>>>            -  WARN_ON_ONCE(r < 0);
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>            -unlock_out:
>>>>>>>>
>>>>>>>>            - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>>> flags);
>>>>>>>>
>>>>>>>>            -  return r;
>>>>>>>>
>>>>>>>>            -}
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>            -/**
>>>>>>>>
>>>>>>>>            - * amdgpu_ih_clear_fault - Remove a page fault record
>>>>>>>>
>>>>>>>>            - *
>>>>>>>>
>>>>>>>>            - * @adev: amdgpu device pointer
>>>>>>>>
>>>>>>>>            - * @key: 64-bit encoding of PASID and address
>>>>>>>>
>>>>>>>>            - *
>>>>>>>>
>>>>>>>>            - * This should be called when a page fault has been
>>>>>>>> handled. Any
>>>>>>>>
>>>>>>>>            - * future interrupt with this key will be processed as a
>>>>>>>> new
>>>>>>>>
>>>>>>>>            - * page fault.
>>>>>>>>
>>>>>>>>            - */
>>>>>>>>
>>>>>>>>            -void amdgpu_ih_clear_fault(struct amdgpu_device *adev,
>>>>>>>> u64 key)
>>>>>>>>
>>>>>>>>            -{
>>>>>>>>
>>>>>>>>            -  unsigned long flags;
>>>>>>>>
>>>>>>>>            -  int r;
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>            -  if (!adev->irq.ih.faults)
>>>>>>>>
>>>>>>>>            -          return;
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>            - spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>            -  r = chash_table_remove(&adev->irq.ih.faults->hash,
>>>>>>>> key, NULL);
>>>>>>>>
>>>>>>>>            -  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>>
>>>>>>>>            -          adev->irq.ih.faults->count--;
>>>>>>>>
>>>>>>>>            - WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>>>>>>
>>>>>>>>            -  }
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>            - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>>> flags);
>>>>>>>>
>>>>>>>>            -}
>>>>>>>>
>>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>
>>>>>>>>            index a23e1c0..f411ffb 100644
>>>>>>>>
>>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>
>>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>
>>>>>>>>            @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>>>>>>
>>>>>>>>             #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>>>>>>
>>>>>>>>             #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>>>>>>
>>>>>>>>
>>>>>>>>            -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>>
>>>>>>>>            -struct amdgpu_retryfault_hashtable {
>>>>>>>>
>>>>>>>>            -  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>>>> 8, 0);
>>>>>>>>
>>>>>>>>            -  spinlock_t     lock;
>>>>>>>>
>>>>>>>>            -  int            count;
>>>>>>>>
>>>>>>>>            -};
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>             /*
>>>>>>>>
>>>>>>>>              * R6xx+ IH ring
>>>>>>>>
>>>>>>>>              */
>>>>>>>>
>>>>>>>>            @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>>>>>>
>>>>>>>>               bool                   use_doorbell;
>>>>>>>>
>>>>>>>>               bool                   use_bus_addr;
>>>>>>>>
>>>>>>>>               dma_addr_t             rb_dma_addr; /* only used when
>>>>>>>> use_bus_addr = true */
>>>>>>>>
>>>>>>>>            -  struct amdgpu_retryfault_hashtable *faults;
>>>>>>>>
>>>>>>>>             };
>>>>>>>>
>>>>>>>>
>>>>>>>>             #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>>>>>>
>>>>>>>>            @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct
>>>>>>>> amdgpu_device *adev, unsigned ring_size,
>>>>>>>>
>>>>>>>>                               bool use_bus_addr);
>>>>>>>>
>>>>>>>>             void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>>>>>>>
>>>>>>>>             int amdgpu_ih_process(struct amdgpu_device *adev);
>>>>>>>>
>>>>>>>>            -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64
>>>>>>>> key);
>>>>>>>>
>>>>>>>>            -void amdgpu_ih_clear_fault(struct amdgpu_device *adev,
>>>>>>>> u64 key);
>>>>>>>>
>>>>>>>>
>>>>>>>>             #endif
>>>>>>>>
>>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>
>>>>>>>>            index 1d7e3c1..8b220e9 100644
>>>>>>>>
>>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>
>>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>
>>>>>>>>            @@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct
>>>>>>>> amdgpu_device *adev, uint32_t min_vm_size,
>>>>>>>>
>>>>>>>>                        adev->vm_manager.fragment_size);
>>>>>>>>
>>>>>>>>             }
>>>>>>>>
>>>>>>>>
>>>>>>>>            +static struct amdgpu_retryfault_hashtable
>>>>>>>> *init_fault_hash(void)
>>>>>>>>
>>>>>>>>            +{
>>>>>>>>
>>>>>>>>            +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +  fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
>>>>>>>>
>>>>>>>>            +  if (!fault_hash)
>>>>>>>>
>>>>>>>>            +          return fault_hash;
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +  INIT_CHASH_TABLE(fault_hash->hash,
>>>>>>>>
>>>>>>>>            +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>>
>>>>>>>>            +  spin_lock_init(&fault_hash->lock);
>>>>>>>>
>>>>>>>>            +  fault_hash->count = 0;
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +  return fault_hash;
>>>>>>>>
>>>>>>>>            +}
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>             /**
>>>>>>>>
>>>>>>>>              * amdgpu_vm_init - initialize a vm instance
>>>>>>>>
>>>>>>>>              *
>>>>>>>>
>>>>>>>>            @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct
>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>>>>>
>>>>>>>>                       vm->pasid = pasid;
>>>>>>>>
>>>>>>>>               }
>>>>>>>>
>>>>>>>>
>>>>>>>>            +  vm->fault_hash = init_fault_hash();
>>>>>>>>
>>>>>>>>            +  if (!vm->fault_hash) {
>>>>>>>>
>>>>>>>>            +          r = -ENOMEM;
>>>>>>>>
>>>>>>>>            +          goto error_free_root;
>>>>>>>>
>>>>>>>>            +  }
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>               INIT_KFIFO(vm->faults);
>>>>>>>>
>>>>>>>>               vm->fault_credit = 16;
>>>>>>>>
>>>>>>>>
>>>>>>>>            @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct
>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>>
>>>>>>>>
>>>>>>>>               /* Clear pending page faults from IH when the VM is
>>>>>>>> destroyed */
>>>>>>>>
>>>>>>>>               while (kfifo_get(&vm->faults, &fault))
>>>>>>>>
>>>>>>>>            -          amdgpu_ih_clear_fault(adev, fault);
>>>>>>>>
>>>>>>>>            +          amdgpu_vm_clear_fault(vm->fault_hash, fault);
>>>>>>>>
>>>>>>>>
>>>>>>>>               if (vm->pasid) {
>>>>>>>>
>>>>>>>>                       unsigned long flags;
>>>>>>>>
>>>>>>>>            @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct
>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>>
>>>>>>>> spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>>>>>>
>>>>>>>>               }
>>>>>>>>
>>>>>>>>
>>>>>>>>            +  kfree(vm->fault_hash);
>>>>>>>>
>>>>>>>>            +  vm->fault_hash = NULL;
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>               drm_sched_entity_destroy(&vm->entity);
>>>>>>>>
>>>>>>>>
>>>>>>>>               if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>>>>>>
>>>>>>>>            @@ -3183,3 +3208,78 @@ void
>>>>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>>>>>>
>>>>>>>>                       }
>>>>>>>>
>>>>>>>>               }
>>>>>>>>
>>>>>>>>             }
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +/**
>>>>>>>>
>>>>>>>>            + * amdgpu_vm_add_fault - Add a page fault record to
>>>>>>>> fault hash table
>>>>>>>>
>>>>>>>>            + *
>>>>>>>>
>>>>>>>>            + * @fault_hash: fault hash table
>>>>>>>>
>>>>>>>>            + * @key: 64-bit encoding of PASID and address
>>>>>>>>
>>>>>>>>            + *
>>>>>>>>
>>>>>>>>            + * This should be called when a retry page fault
>>>>>>>> interrupt is
>>>>>>>>
>>>>>>>>            + * received. If this is a new page fault, it will be
>>>>>>>> added to a hash
>>>>>>>>
>>>>>>>>            + * table. The return value indicates whether this is a
>>>>>>>> new fault, or
>>>>>>>>
>>>>>>>>            + * a fault that was already known and is already being
>>>>>>>> handled.
>>>>>>>>
>>>>>>>>            + *
>>>>>>>>
>>>>>>>>            + * If there are too many pending page faults, this will
>>>>>>>> fail. Retry
>>>>>>>>
>>>>>>>>            + * interrupts should be ignored in this case until there
>>>>>>>> is enough
>>>>>>>>
>>>>>>>>            + * free space.
>>>>>>>>
>>>>>>>>            + *
>>>>>>>>
>>>>>>>>            + * Returns 0 if the fault was added, 1 if the fault was
>>>>>>>> already known,
>>>>>>>>
>>>>>>>>            + * -ENOSPC if there are too many pending faults.
>>>>>>>>
>>>>>>>>            + */
>>>>>>>>
>>>>>>>>            +int amdgpu_vm_add_fault(struct
>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>>
>>>>>>>>            +{
>>>>>>>>
>>>>>>>>            +  unsigned long flags;
>>>>>>>>
>>>>>>>>            +  int r = -ENOSPC;
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +  if (WARN_ON_ONCE(!fault_hash))
>>>>>>>>
>>>>>>>>            +          /* Should be allocated in amdgpu_vm_init
>>>>>>>>
>>>>>>>>            +           */
>>>>>>>>
>>>>>>>>            +          return r;
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +  /* Only let the hash table fill up to 50% for best
>>>>>>>> performance */
>>>>>>>>
>>>>>>>>            +  if (fault_hash->count >= (1 <<
>>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>>
>>>>>>>>            +          goto unlock_out;
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +  r = chash_table_copy_in(&fault_hash->hash, key, NULL);
>>>>>>>>
>>>>>>>>            +  if (!r)
>>>>>>>>
>>>>>>>>            +          fault_hash->count++;
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +  /* chash_table_copy_in should never fail unless we're
>>>>>>>> losing count */
>>>>>>>>
>>>>>>>>            +  WARN_ON_ONCE(r < 0);
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +unlock_out:
>>>>>>>>
>>>>>>>>            +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>>
>>>>>>>>            +  return r;
>>>>>>>>
>>>>>>>>            +}
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +/**
>>>>>>>>
>>>>>>>>            + * amdgpu_vm_clear_fault - Remove a page fault record
>>>>>>>>
>>>>>>>>            + *
>>>>>>>>
>>>>>>>>            + * @fault_hash: fault hash table
>>>>>>>>
>>>>>>>>            + * @key: 64-bit encoding of PASID and address
>>>>>>>>
>>>>>>>>            + *
>>>>>>>>
>>>>>>>>            + * This should be called when a page fault has been
>>>>>>>> handled. Any
>>>>>>>>
>>>>>>>>            + * future interrupt with this key will be processed as a
>>>>>>>> new
>>>>>>>>
>>>>>>>>            + * page fault.
>>>>>>>>
>>>>>>>>            + */
>>>>>>>>
>>>>>>>>            +void amdgpu_vm_clear_fault(struct
>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>>
>>>>>>>>            +{
>>>>>>>>
>>>>>>>>            +  unsigned long flags;
>>>>>>>>
>>>>>>>>            +  int r;
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +  if (!fault_hash)
>>>>>>>>
>>>>>>>>            +          return;
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +  r = chash_table_remove(&fault_hash->hash, key, NULL);
>>>>>>>>
>>>>>>>>            +  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>>
>>>>>>>>            +          fault_hash->count--;
>>>>>>>>
>>>>>>>>            +          WARN_ON_ONCE(fault_hash->count < 0);
>>>>>>>>
>>>>>>>>            +  }
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>>
>>>>>>>>            +}
>>>>>>>>
>>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>
>>>>>>>>            index e275ee7..6eb1da1 100644
>>>>>>>>
>>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>
>>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>
>>>>>>>>            @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>>>>>>
>>>>>>>>               pid_t   tgid;
>>>>>>>>
>>>>>>>>             };
>>>>>>>>
>>>>>>>>
>>>>>>>>            +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>>
>>>>>>>>            +struct amdgpu_retryfault_hashtable {
>>>>>>>>
>>>>>>>>            +  DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>>>> 8, 0);
>>>>>>>>
>>>>>>>>            +  spinlock_t     lock;
>>>>>>>>
>>>>>>>>            +  int            count;
>>>>>>>>
>>>>>>>>            +};
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>             struct amdgpu_vm {
>>>>>>>>
>>>>>>>>               /* tree of virtual addresses mapped */
>>>>>>>>
>>>>>>>>               struct rb_root_cached  va;
>>>>>>>>
>>>>>>>>            @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>>>>>>
>>>>>>>>               struct ttm_lru_bulk_move lru_bulk_move;
>>>>>>>>
>>>>>>>>               /* mark whether can do the bulk move */
>>>>>>>>
>>>>>>>>               bool                   bulk_moveable;
>>>>>>>>
>>>>>>>>            +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>>
>>>>>>>>             };
>>>>>>>>
>>>>>>>>
>>>>>>>>             struct amdgpu_vm_manager {
>>>>>>>>
>>>>>>>>            @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct
>>>>>>>> amdgpu_vm *vm);
>>>>>>>>
>>>>>>>>             void amdgpu_vm_move_to_lru_tail(struct amdgpu_device
>>>>>>>> *adev,
>>>>>>>>
>>>>>>>>                                      struct amdgpu_vm *vm);
>>>>>>>>
>>>>>>>>
>>>>>>>>            +int amdgpu_vm_add_fault(struct
>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +void amdgpu_vm_clear_fault(struct
>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>             #endif
>>>>>>>>
>>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>
>>>>>>>>            index 5ae5ed2..acbe5a7 100644
>>>>>>>>
>>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>
>>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>
>>>>>>>>            @@ -265,35 +265,36 @@ static bool
>>>>>>>> vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>>>>>>
>>>>>>>>                       return true;
>>>>>>>>
>>>>>>>>               }
>>>>>>>>
>>>>>>>>
>>>>>>>>            -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>>
>>>>>>>>            -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>>
>>>>>>>>            -  r = amdgpu_ih_add_fault(adev, key);
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>            -  /* Hash table is full or the fault is already being
>>>>>>>> processed,
>>>>>>>>
>>>>>>>>            -   * ignore further page faults
>>>>>>>>
>>>>>>>>            -   */
>>>>>>>>
>>>>>>>>            -  if (r != 0)
>>>>>>>>
>>>>>>>>            -          goto ignore_iv;
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>               /* Track retry faults in per-VM fault FIFO. */
>>>>>>>>
>>>>>>>>               spin_lock(&adev->vm_manager.pasid_lock);
>>>>>>>>
>>>>>>>>               vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>>>>>>
>>>>>>>>            +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>>
>>>>>>>>            +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>>
>>>>>>>>               if (!vm) {
>>>>>>>>
>>>>>>>>                       /* VM not found, process it normally */
>>>>>>>>
>>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>
>>>>>>>>            -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>>
>>>>>>>>                       return true;
>>>>>>>>
>>>>>>>>            +  } else {
>>>>>>>>
>>>>>>>>            +          r = amdgpu_vm_add_fault(vm->fault_hash, key);
>>>>>>>>
>>>>>>>>            +
>>>>>>>>
>>>>>>>>            +          /* Hash table is full or the fault is already
>>>>>>>> being processed,
>>>>>>>>
>>>>>>>>            +           * ignore further page faults
>>>>>>>>
>>>>>>>>            +           */
>>>>>>>>
>>>>>>>>            +          if (r != 0) {
>>>>>>>>
>>>>>>>>            + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>
>>>>>>>>            +                  goto ignore_iv;
>>>>>>>>
>>>>>>>>            +          }
>>>>>>>>
>>>>>>>>               }
>>>>>>>>
>>>>>>>>               /* No locking required with single writer and single
>>>>>>>> reader */
>>>>>>>>
>>>>>>>>               r = kfifo_put(&vm->faults, key);
>>>>>>>>
>>>>>>>>               if (!r) {
>>>>>>>>
>>>>>>>>                       /* FIFO is full. Ignore it until there is
>>>>>>>> space */
>>>>>>>>
>>>>>>>>            +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>>>>>>>>
>>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>
>>>>>>>>            -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>>
>>>>>>>>                       goto ignore_iv;
>>>>>>>>
>>>>>>>>               }
>>>>>>>>
>>>>>>>>            - spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>
>>>>>>>>
>>>>>>>>            + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>
>>>>>>>>               /* It's the first fault for this address, process it
>>>>>>>> normally */
>>>>>>>>
>>>>>>>>               return true;
>>>>>>>>
>>>>>>>>
>>>>>>>>            @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void
>>>>>>>> *handle)
>>>>>>>>
>>>>>>>>               adev->irq.ih.use_doorbell = true;
>>>>>>>>
>>>>>>>>               adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH
>>>>>>>> << 1;
>>>>>>>>
>>>>>>>>
>>>>>>>>            -  adev->irq.ih.faults =
>>>>>>>> kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>>>>>>
>>>>>>>>            -  if (!adev->irq.ih.faults)
>>>>>>>>
>>>>>>>>            -          return -ENOMEM;
>>>>>>>>
>>>>>>>>            - INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>>>>>>
>>>>>>>>            -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>>
>>>>>>>>            - spin_lock_init(&adev->irq.ih.faults->lock);
>>>>>>>>
>>>>>>>>            -  adev->irq.ih.faults->count = 0;
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>               r = amdgpu_irq_init(adev);
>>>>>>>>
>>>>>>>>
>>>>>>>>               return r;
>>>>>>>>
>>>>>>>>            @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void
>>>>>>>> *handle)
>>>>>>>>
>>>>>>>>               amdgpu_irq_fini(adev);
>>>>>>>>
>>>>>>>>               amdgpu_ih_ring_fini(adev);
>>>>>>>>
>>>>>>>>
>>>>>>>>            -  kfree(adev->irq.ih.faults);
>>>>>>>>
>>>>>>>>            -  adev->irq.ih.faults = NULL;
>>>>>>>>
>>>>>>>>            -
>>>>>>>>
>>>>>>>>               return 0;
>>>>>>>>
>>>>>>>>             }
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>        _______________________________________________
>>>>>>>>
>>>>>>>>        amd-gfx mailing list
>>>>>>>>
>>>>>>>>        amd-gfx@lists.freedesktop.org
>>>>>>>> <mailto:amd-gfx@lists.freedesktop.org>
>>>>>>>>
>>>>>>>>        https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>
>>>>>>>>
>>>>>>> _______________________________________________
>>>>>>> amd-gfx mailing list
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found] ` <1536290910-6476-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
  2018-09-07  3:28   ` [PATCH 2/2] drm/amdgpu: Deleted faults kfifo Oak Zeng
  2018-09-07  4:16   ` [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm ozeng
@ 2018-09-12  9:03   ` Christian König
  2 siblings, 0 replies; 27+ messages in thread
From: Christian König @ 2018-09-12  9:03 UTC (permalink / raw)
  To: Oak Zeng, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Oak Zeng

Am 07.09.2018 um 05:28 schrieb Oak Zeng:
> Instead of sharing one fault hash table per device, make it
> per VM. This can avoid an inter-process lock issue when the
> fault hash table is full.
>
> Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
> Suggested-by: Christian Konig <Christian.Koenig@amd.com>
> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75 ------------------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102 ++++++++++++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38 +++++-------
>   5 files changed, 127 insertions(+), 110 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> index 06373d4..4ed8621 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct amdgpu_device *adev)
>   	return IRQ_HANDLED;
>   }
>   
> -/**
> - * amdgpu_ih_add_fault - Add a page fault record
> - *
> - * @adev: amdgpu device pointer
> - * @key: 64-bit encoding of PASID and address
> - *
> - * This should be called when a retry page fault interrupt is
> - * received. If this is a new page fault, it will be added to a hash
> - * table. The return value indicates whether this is a new fault, or
> - * a fault that was already known and is already being handled.
> - *
> - * If there are too many pending page faults, this will fail. Retry
> - * interrupts should be ignored in this case until there is enough
> - * free space.
> - *
> - * Returns 0 if the fault was added, 1 if the fault was already known,
> - * -ENOSPC if there are too many pending faults.
> - */
> -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)
> -{
> -	unsigned long flags;
> -	int r = -ENOSPC;
> -
> -	if (WARN_ON_ONCE(!adev->irq.ih.faults))
> -		/* Should be allocated in <IP>_ih_sw_init on GPUs that
> -		 * support retry faults and require retry filtering.
> -		 */
> -		return r;
> -
> -	spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
> -
> -	/* Only let the hash table fill up to 50% for best performance */
> -	if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
> -		goto unlock_out;
> -
> -	r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);
> -	if (!r)
> -		adev->irq.ih.faults->count++;
> -
> -	/* chash_table_copy_in should never fail unless we're losing count */
> -	WARN_ON_ONCE(r < 0);
> -
> -unlock_out:
> -	spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
> -	return r;
> -}
> -
> -/**
> - * amdgpu_ih_clear_fault - Remove a page fault record
> - *
> - * @adev: amdgpu device pointer
> - * @key: 64-bit encoding of PASID and address
> - *
> - * This should be called when a page fault has been handled. Any
> - * future interrupt with this key will be processed as a new
> - * page fault.
> - */
> -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)
> -{
> -	unsigned long flags;
> -	int r;
> -
> -	if (!adev->irq.ih.faults)
> -		return;
> -
> -	spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
> -
> -	r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);
> -	if (!WARN_ON_ONCE(r < 0)) {
> -		adev->irq.ih.faults->count--;
> -		WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
> -	}
> -
> -	spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
> -}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> index a23e1c0..f411ffb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> @@ -32,13 +32,6 @@ struct amdgpu_device;
>   #define AMDGPU_IH_CLIENTID_LEGACY 0
>   #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>   
> -#define AMDGPU_PAGEFAULT_HASH_BITS 8
> -struct amdgpu_retryfault_hashtable {
> -	DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
> -	spinlock_t	lock;
> -	int		count;
> -};
> -
>   /*
>    * R6xx+ IH ring
>    */
> @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>   	bool			use_doorbell;
>   	bool			use_bus_addr;
>   	dma_addr_t		rb_dma_addr; /* only used when use_bus_addr = true */
> -	struct amdgpu_retryfault_hashtable *faults;
>   };
>   
>   #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
> @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, unsigned ring_size,
>   			bool use_bus_addr);
>   void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>   int amdgpu_ih_process(struct amdgpu_device *adev);
> -int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key);
> -void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key);
>   
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 1d7e3c1..8b220e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2692,6 +2692,22 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,
>   		 adev->vm_manager.fragment_size);
>   }
>   
> +static struct amdgpu_retryfault_hashtable *init_fault_hash(void)
> +{
> +	struct amdgpu_retryfault_hashtable *fault_hash;
> +
> +	fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL);
> +	if (!fault_hash)
> +		return fault_hash;
> +
> +	INIT_CHASH_TABLE(fault_hash->hash,
> +			AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
> +	spin_lock_init(&fault_hash->lock);
> +	fault_hash->count = 0;
> +
> +	return fault_hash;
> +}
> +
>   /**
>    * amdgpu_vm_init - initialize a vm instance
>    *
> @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   		vm->pasid = pasid;
>   	}
>   
> +	vm->fault_hash = init_fault_hash();
> +	if (!vm->fault_hash) {
> +		r = -ENOMEM;
> +		goto error_free_root;
> +	}
> +
>   	INIT_KFIFO(vm->faults);
>   	vm->fault_credit = 16;
>   
> @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   
>   	/* Clear pending page faults from IH when the VM is destroyed */
>   	while (kfifo_get(&vm->faults, &fault))
> -		amdgpu_ih_clear_fault(adev, fault);
> +		amdgpu_vm_clear_fault(vm->fault_hash, fault);
>   
>   	if (vm->pasid) {
>   		unsigned long flags;
> @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>   	}
>   
> +	kfree(vm->fault_hash);
> +	vm->fault_hash = NULL;
> +
>   	drm_sched_entity_destroy(&vm->entity);
>   
>   	if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
> @@ -3183,3 +3208,78 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>   		}
>   	}
>   }
> +
> +/**
> + * amdgpu_vm_add_fault - Add a page fault record to fault hash table
> + *
> + * @fault_hash: fault hash table
> + * @key: 64-bit encoding of PASID and address
> + *
> + * This should be called when a retry page fault interrupt is
> + * received. If this is a new page fault, it will be added to a hash
> + * table. The return value indicates whether this is a new fault, or
> + * a fault that was already known and is already being handled.
> + *
> + * If there are too many pending page faults, this will fail. Retry
> + * interrupts should be ignored in this case until there is enough
> + * free space.
> + *
> + * Returns 0 if the fault was added, 1 if the fault was already known,
> + * -ENOSPC if there are too many pending faults.
> + */
> +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
> +{
> +	unsigned long flags;
> +	int r = -ENOSPC;
> +
> +	if (WARN_ON_ONCE(!fault_hash))
> +		/* Should be allocated in amdgpu_vm_init
> +		 */
> +		return r;
> +
> +	spin_lock_irqsave(&fault_hash->lock, flags);
> +
> +	/* Only let the hash table fill up to 50% for best performance */
> +	if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
> +		goto unlock_out;
> +
> +	r = chash_table_copy_in(&fault_hash->hash, key, NULL);
> +	if (!r)
> +		fault_hash->count++;
> +
> +	/* chash_table_copy_in should never fail unless we're losing count */
> +	WARN_ON_ONCE(r < 0);
> +
> +unlock_out:
> +	spin_unlock_irqrestore(&fault_hash->lock, flags);
> +	return r;
> +}
> +
> +/**
> + * amdgpu_vm_clear_fault - Remove a page fault record
> + *
> + * @fault_hash: fault hash table
> + * @key: 64-bit encoding of PASID and address
> + *
> + * This should be called when a page fault has been handled. Any
> + * future interrupt with this key will be processed as a new
> + * page fault.
> + */
> +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key)
> +{
> +	unsigned long flags;
> +	int r;
> +
> +	if (!fault_hash)
> +		return;
> +
> +	spin_lock_irqsave(&fault_hash->lock, flags);
> +
> +	r = chash_table_remove(&fault_hash->hash, key, NULL);
> +	if (!WARN_ON_ONCE(r < 0)) {
> +		fault_hash->count--;
> +		WARN_ON_ONCE(fault_hash->count < 0);
> +	}
> +
> +	spin_unlock_irqrestore(&fault_hash->lock, flags);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index e275ee7..6eb1da1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>   	pid_t	tgid;
>   };
>   
> +#define AMDGPU_PAGEFAULT_HASH_BITS 8
> +struct amdgpu_retryfault_hashtable {
> +	DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
> +	spinlock_t	lock;
> +	int		count;
> +};
> +
>   struct amdgpu_vm {
>   	/* tree of virtual addresses mapped */
>   	struct rb_root_cached	va;
> @@ -240,6 +247,7 @@ struct amdgpu_vm {
>   	struct ttm_lru_bulk_move lru_bulk_move;
>   	/* mark whether can do the bulk move */
>   	bool			bulk_moveable;
> +	struct amdgpu_retryfault_hashtable *fault_hash;
>   };
>   
>   struct amdgpu_vm_manager {
> @@ -355,4 +363,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>   void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>   				struct amdgpu_vm *vm);
>   
> +int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
> +
> +void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key);
> +
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> index 5ae5ed2..acbe5a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> @@ -265,35 +265,36 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>   		return true;
>   	}
>   
> -	addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
> -	key = AMDGPU_VM_FAULT(pasid, addr);
> -	r = amdgpu_ih_add_fault(adev, key);
> -
> -	/* Hash table is full or the fault is already being processed,
> -	 * ignore further page faults
> -	 */
> -	if (r != 0)
> -		goto ignore_iv;
> -
>   	/* Track retry faults in per-VM fault FIFO. */
>   	spin_lock(&adev->vm_manager.pasid_lock);
>   	vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
> +	addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
> +	key = AMDGPU_VM_FAULT(pasid, addr);
>   	if (!vm) {
>   		/* VM not found, process it normally */
>   		spin_unlock(&adev->vm_manager.pasid_lock);
> -		amdgpu_ih_clear_fault(adev, key);
>   		return true;
> +	} else {
> +		r = amdgpu_vm_add_fault(vm->fault_hash, key);
> +
> +		/* Hash table is full or the fault is already being processed,
> +		 * ignore further page faults
> +		 */
> +		if (r != 0) {
> +			spin_unlock(&adev->vm_manager.pasid_lock);
> +			goto ignore_iv;
> +		}
>   	}
>   	/* No locking required with single writer and single reader */
>   	r = kfifo_put(&vm->faults, key);
>   	if (!r) {
>   		/* FIFO is full. Ignore it until there is space */
> +		amdgpu_vm_clear_fault(vm->fault_hash, key);
>   		spin_unlock(&adev->vm_manager.pasid_lock);
> -		amdgpu_ih_clear_fault(adev, key);
>   		goto ignore_iv;
>   	}
> -	spin_unlock(&adev->vm_manager.pasid_lock);
>   
> +	spin_unlock(&adev->vm_manager.pasid_lock);
>   	/* It's the first fault for this address, process it normally */
>   	return true;
>   
> @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void *handle)
>   	adev->irq.ih.use_doorbell = true;
>   	adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH << 1;
>   
> -	adev->irq.ih.faults = kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
> -	if (!adev->irq.ih.faults)
> -		return -ENOMEM;
> -	INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
> -			 AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
> -	spin_lock_init(&adev->irq.ih.faults->lock);
> -	adev->irq.ih.faults->count = 0;
> -
>   	r = amdgpu_irq_init(adev);
>   
>   	return r;
> @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void *handle)
>   	amdgpu_irq_fini(adev);
>   	amdgpu_ih_ring_fini(adev);
>   
> -	kfree(adev->irq.ih.faults);
> -	adev->irq.ih.faults = NULL;
> -
>   	return 0;
>   }
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                                             ` <ea317e02-546b-b296-2456-542e86d62b6e-5C7GfCeVMHo@public.gmane.org>
@ 2018-09-12 15:29                                               ` Felix Kuehling
       [not found]                                                 ` <2e37f4d8-9f64-5412-8fa3-9ef1a12c4001-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2018-09-12 15:29 UTC (permalink / raw)
  To: Christian König, Zeng, Oak, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

On 2018-09-12 02:56 AM, Christian König wrote:
> Am 12.09.2018 um 00:00 schrieb Felix Kuehling:
>> On 2018-09-11 03:19 AM, Christian König wrote:
>>> Hi Felix,
>>>
>>> let me try to explain the problem on an example:
>>>
>>> 1. We have a running job which needs recoverable page faults for
>>> accessing the process address space.
>>> 2. We run into memory pressure on VRAM and start to evict things.
>>> 3. A page tables of the running job is picked up for eviction.
>>> 4. We schedule a copy command to move the content of the page table to
>>> system memory.
>> I think we should change this. If you evict a page table, you don't need
>> to preserve its contents. You should be able to restore the page table
>> contents from scratch when you handle the page fault that restores it.
>
> Yeah, already implemented. You actually don't need the page fault for
> that.
>
>>> 5. The new system memory location of the page table is noted in its BO.
>> You mean in the parent page table? You can just invalidate the entry in
>> the parent page table and let it fault.
>
> I'm repeating myself, but exactly that is what won't work.
>
> See, we still have engines which can't handle page faults but which use
> the same VM at the same time. This means that we can't just fault in
> page tables.

And I don't understand why that is a problem. Those clients rely on
fences to keep their BOs resident, including the page tables. Are you
planning to change that?

Regards,
  Felix

>
> What we could do is to separate the address space into a low and a
> high range where we have different handling for both.
>
> I've already prototyped this and prepared mostly everything necessary
> for that, but that is still not 100% completed.
>
> Regards,
> Christian.
>
>>> 6. We get a fault from the job and swap in the page from the process
>>> address space.
>> No. Then you need to keep updating page tables that are swapped out.
>> Instead just discard them and rebuild on fault.
>>
>>> 7. Now we need to enter the new page address into the page table ->
>>> *BAM*
>>>
>>> The problem is now that we don't know the address of the page table
>>> because the current location was replaced with the future location in
>>> step #5.
>> You solve that problem by discarding page tables that are swapped out.
>> Then there is no future location.
>>
>>> We could now argue that we could update the page tables on the fly for
>>> the evicted page and never wait for the current job to finish, but
>>> this won't work because not all engines can handle that.
>> Engines that can't handle page faults will depend on fences to keep
>> their page tables resident, like we do today. So they won't have this
>> problem.
>>
>> Regards,
>>    Felix
>>
>>> I will circumvent this problem for now by blocking the eviction before
>>> step #5. The issue with that is that this pipelining is responsible
>>> for nearly 20% of the fps in some testcases.
>>>
>>> I hope that it won't be that bad when I limit disabling the pipelining
>>> to only page tables, but it is certainly possible that we will get
>>> pushback on this.
>>>
>>> If that happens we probably need to add some flags to limit this
>>> workaround even more to only the root PD and all the PDs/PTs which are
>>> involved in recoverable faults.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 10.09.2018 um 21:10 schrieb Felix Kuehling:
>>>> I'm not sure why you need to distinguish current and future state when
>>>> dealing with page faults. When you get a page fault, you know that the
>>>> GPU is trying to access memory right now, in the present. So you're
>>>> always working with the current state. When the CPU page table
>>>> changes,
>>>> you get an MMU notifier that allows you to invalidate the
>>>> corresponding
>>>> GPU PTEs. If you have a valid GPU PTE, it always represents the
>>>> current
>>>> states and is in sync with the CPU page table. If the GPU page
>>>> table is
>>>> ever outdated, it should have an invalid entry in that place.
>>>>
>>>> If SDMA is used to update the GPU PTEs, there is a delay. The MMU
>>>> notifier is synchronous, so it shouldn't be a problem. You just
>>>> wait for
>>>> the SDMA job to complete before returning. When updating PTEs with new
>>>> valid addresses, it's asynchronous. But the GPU will continue retrying
>>>> on the invalid entry until SDMA finishes. So it's also implicitly
>>>> synchronized on the GPU side.
>>>>
>>>> Regards,
>>>>     Felix
>>>>
>>>>
>>>> On 2018-09-10 05:42 AM, Christian König wrote:
>>>>> Hi Felix & Oak,
>>>>>
>>>>> over the weekend I had the idea that we could just use the shadow BOs
>>>>> to have the current state in a page fault. They are GTT BOs and CPU
>>>>> accessible anyway.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 08.09.2018 um 09:34 schrieb Christian König:
>>>>>> Hi Felix,
>>>>>>
>>>>>>> But why do you want to update page tables when there is no more
>>>>>>> user
>>>>>>> mode context that cares about them? Is this just to allow pending
>>>>>>> work
>>>>>>> to complete after the user mode context is gone? To prevent hangs?
>>>>>> The problem I'm see is that we still don't have a reliable way of
>>>>>> killing a command submission while making sure that no more
>>>>>> faults of
>>>>>> it are in the flight.
>>>>>>
>>>>>> E.g. the HWS has a function to kill a running job on the hardware,
>>>>>> but that only works for compute and there is no way of telling when
>>>>>> that operation has finished.
>>>>>>
>>>>>> But Oak has already convinced me that we should probably work on
>>>>>> that
>>>>>> instead of trying to keep the VM alive as long as possible.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
>>>>>>> Hi Christian,
>>>>>>>
>>>>>>> I'm not sure I get your point here.
>>>>>>>
>>>>>>> As I understand it, the amdgpu_vm structure represents the user
>>>>>>> mode VM
>>>>>>> context. It has the pointers to the root page directory and the
>>>>>>> rest of
>>>>>>> the page table hierarchy. If there is no amdgpu_vm, there is no
>>>>>>> user
>>>>>>> mode context that cares about the page tables.
>>>>>>>
>>>>>>> If we leave the page table pointers in the amdgpu_vm, then handling
>>>>>>> faults after the VM is destroyed is pointless. We can't find the
>>>>>>> page
>>>>>>> tables and we can't update them, so there is nothing we can do in
>>>>>>> response to VM faults.
>>>>>>>
>>>>>>> So I'm guessing that you want to move the page table pointers
>>>>>>> into a
>>>>>>> different structure that exists half-independently of the VM. It
>>>>>>> would
>>>>>>> be created when the VM is created (that's where we currently
>>>>>>> allocate
>>>>>>> the PASID) but can be destroyed slightly later.
>>>>>>>
>>>>>>> But why do you want to update page tables when there is no more
>>>>>>> user
>>>>>>> mode context that cares about them? Is this just to allow pending
>>>>>>> work
>>>>>>> to complete after the user mode context is gone? To prevent hangs?
>>>>>>>
>>>>>>> Regards,
>>>>>>>      Felix
>>>>>>>
>>>>>>> On 2018-09-10 07:20 AM, Christian König wrote:
>>>>>>>>> Am I right that the handling of page fault need a valid VM?
>>>>>>>> No, exactly that view is incorrect.
>>>>>>>>
>>>>>>>> The VM is meant to be a memory management structure of page
>>>>>>>> tables and
>>>>>>>> is completely pointless for fault processing, because it represents the
>>>>>>>> future state of the page tables and not the current one.
>>>>>>>>
>>>>>>>> What we need for fault processing is a new structure built around
>>>>>>>> PASIDs which is fed with addresses when page tables are
>>>>>>>> moved
>>>>>>>> around.
>>>>>>>>
>>>>>>>> Alternatively I hope that we can use the SDMA to walk the page
>>>>>>>> tables
>>>>>>>> and update the required entries by just using the address.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>>>>>>>> Hi Christian,
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> What is the value of delaying the destroy of the hash until after the VM is
>>>>>>>>> destroyed? Since the VM is destroyed, you basically don’t have enough
>>>>>>>>> information to page in the correct page to gpuvm. Am I right
>>>>>>>>> that
>>>>>>>>> the handling of page fault need a valid VM? For example, you
>>>>>>>>> need the
>>>>>>>>> VM to get the corresponding physical address of the faulty page.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> After vm is destroyed, the retry interrupt can be directly
>>>>>>>>> discard as
>>>>>>>>> you can’t find vm through pasid. You can think this is also one
>>>>>>>>> kind
>>>>>>>>> of prescreen.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> So I stand by the point that there is no need to delay the
>>>>>>>>> destroy of the hash until after the VM is destroyed. The prescreening hash
>>>>>>>>> can be
>>>>>>>>> destroyed at the time of vm_fini.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>>
>>>>>>>>> Oak
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>>>>>>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng
>>>>>>>>> <zengshanjun@gmail.com>;
>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>>>>>>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>>>>>>>>> amdgpu vm
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Hi Oak,
>>>>>>>>>
>>>>>>>>> yeah, but this is still a NAK. Making the hash per PASID was
>>>>>>>>> not a
>>>>>>>>> suggestion but rather a must have.
>>>>>>>>>
>>>>>>>>> The VM structures must be destroyed while the processing is still
>>>>>>>>> ongoing, or otherwise we would not have a clean OOM handling.
>>>>>>>>>
>>>>>>>>> The IDR for PASID lookup can be put into amdgpu_ids.c, you can
>>>>>>>>> just
>>>>>>>>> replace the IDA already used there with an IDR.
>>>>>>>>>
>>>>>>>>> Since the PASID is already freed up delayed we should have the
>>>>>>>>> grace
>>>>>>>>> period for interrupt processing included. If that still isn't
>>>>>>>>> sufficient we can still add some delayed work for this.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>>>>>>>
>>>>>>>>>        Hi Christian,
>>>>>>>>>
>>>>>>>>>        In this implementation, fault hash is made per vm, not per
>>>>>>>>> pasid
>>>>>>>>>        as suggested, based on below considerations:
>>>>>>>>>
>>>>>>>>>          * Delay the destroy of hash introduces more effort like
>>>>>>>>> how to
>>>>>>>>>            set the proper grace period after which no retry
>>>>>>>>> interrupt
>>>>>>>>>            will be generated. We want to avoid those complication
>>>>>>>>> if we
>>>>>>>>>            can.
>>>>>>>>>          * The  problem of the per vm implementation is that it
>>>>>>>>> is hard
>>>>>>>>>            to delay the destroy of fault hash, because when vm is
>>>>>>>>>            destroyed, prescreen function won't be able to
>>>>>>>>> retrieve the
>>>>>>>>>            fault hash. But in this case, the prescreen
>>>>>>>>> function can
>>>>>>>>>            either let the interrupt go through (to gmc) or ignore
>>>>>>>>> it.
>>>>>>>>>            From this perspective, it is not very necessary to
>>>>>>>>> delay the
>>>>>>>>>            destroy of hash table.
>>>>>>>>>          * On the other hand, since ASICs after vega10 have the
>>>>>>>>> HW CAM
>>>>>>>>>            feature. So the SW IV prescreen is only useful for
>>>>>>>>> vega10.
>>>>>>>>>            For all the HW implemented CAM, we can consider the
>>>>>>>>> delayed
>>>>>>>>>            CAM acknowledgment.
>>>>>>>>>          * Making it per pasid need to introduce another idr to
>>>>>>>>>            correlate pasid and hash table - where to put the idr?
>>>>>>>>> You
>>>>>>>>>            will have to make it a global variable which is not
>>>>>>>>> very
>>>>>>>>>            desirable.
>>>>>>>>>
>>>>>>>>>        The main purpose of this patch is to solve the
>>>>>>>>> inter-process
>>>>>>>>> lock
>>>>>>>>>        issue (when hash table is full), while still keep codes
>>>>>>>>> simple.
>>>>>>>>>
>>>>>>>>>        Also with this patch, the faults kfifo is not necessary
>>>>>>>>> any
>>>>>>>>> more.
>>>>>>>>>        See patch 2.
>>>>>>>>>
>>>>>>>>>        Regards,
>>>>>>>>>
>>>>>>>>>        Oak
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>        On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>>>>>>>
>>>>>>>>>            In stead of share one fault hash table per device,
>>>>>>>>> make it
>>>>>>>>>
>>>>>>>>>            per vm. This can avoid inter-process lock issue when
>>>>>>>>> fault
>>>>>>>>>
>>>>>>>>>            hash table is full.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>>>>>>>
>>>>>>>>>            Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>>>>>>>>> <mailto:Oak.Zeng@amd.com>
>>>>>>>>>
>>>>>>>>>            Suggested-by: Christian Konig
>>>>>>>>> <Christian.Koenig@amd.com>
>>>>>>>>> <mailto:Christian.Koenig@amd.com>
>>>>>>>>>
>>>>>>>>>            Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>>>>>> <mailto:Felix.Kuehling@amd.com>
>>>>>>>>>
>>>>>>>>>            ---
>>>>>>>>>
>>>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75
>>>>>>>>> ------------------------
>>>>>>>>>
>>>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>>>>>>>
>>>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102
>>>>>>>>> ++++++++++++++++++++++++++++++++-
>>>>>>>>>
>>>>>>>>>             drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>>>>>>>
>>>>>>>>>             drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38
>>>>>>>>> +++++-------
>>>>>>>>>
>>>>>>>>>             5 files changed, 127 insertions(+), 110 deletions(-)
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>
>>>>>>>>>            index 06373d4..4ed8621 100644
>>>>>>>>>
>>>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>
>>>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>
>>>>>>>>>            @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>
>>>>>>>>>               return IRQ_HANDLED;
>>>>>>>>>
>>>>>>>>>             }
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            -/**
>>>>>>>>>
>>>>>>>>>            - * amdgpu_ih_add_fault - Add a page fault record
>>>>>>>>>
>>>>>>>>>            - *
>>>>>>>>>
>>>>>>>>>            - * @adev: amdgpu device pointer
>>>>>>>>>
>>>>>>>>>            - * @key: 64-bit encoding of PASID and address
>>>>>>>>>
>>>>>>>>>            - *
>>>>>>>>>
>>>>>>>>>            - * This should be called when a retry page fault
>>>>>>>>> interrupt is
>>>>>>>>>
>>>>>>>>>            - * received. If this is a new page fault, it will be
>>>>>>>>> added to a hash
>>>>>>>>>
>>>>>>>>>            - * table. The return value indicates whether this
>>>>>>>>> is a
>>>>>>>>> new fault, or
>>>>>>>>>
>>>>>>>>>            - * a fault that was already known and is already
>>>>>>>>> being
>>>>>>>>> handled.
>>>>>>>>>
>>>>>>>>>            - *
>>>>>>>>>
>>>>>>>>>            - * If there are too many pending page faults, this
>>>>>>>>> will
>>>>>>>>> fail. Retry
>>>>>>>>>
>>>>>>>>>            - * interrupts should be ignored in this case until
>>>>>>>>> there
>>>>>>>>> is enough
>>>>>>>>>
>>>>>>>>>            - * free space.
>>>>>>>>>
>>>>>>>>>            - *
>>>>>>>>>
>>>>>>>>>            - * Returns 0 if the fault was added, 1 if the
>>>>>>>>> fault was
>>>>>>>>> already known,
>>>>>>>>>
>>>>>>>>>            - * -ENOSPC if there are too many pending faults.
>>>>>>>>>
>>>>>>>>>            - */
>>>>>>>>>
>>>>>>>>>            -int amdgpu_ih_add_fault(struct amdgpu_device
>>>>>>>>> *adev, u64
>>>>>>>>> key)
>>>>>>>>>
>>>>>>>>>            -{
>>>>>>>>>
>>>>>>>>>            -  unsigned long flags;
>>>>>>>>>
>>>>>>>>>            -  int r = -ENOSPC;
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>            -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>>>>>>>
>>>>>>>>>            -          /* Should be allocated in
>>>>>>>>> <IP>_ih_sw_init on
>>>>>>>>> GPUs that
>>>>>>>>>
>>>>>>>>>            -           * support retry faults and require retry
>>>>>>>>> filtering.
>>>>>>>>>
>>>>>>>>>            -           */
>>>>>>>>>
>>>>>>>>>            -          return r;
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>            - spin_lock_irqsave(&adev->irq.ih.faults->lock,
>>>>>>>>> flags);
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>            -  /* Only let the hash table fill up to 50% for best
>>>>>>>>> performance */
>>>>>>>>>
>>>>>>>>>            -  if (adev->irq.ih.faults->count >= (1 <<
>>>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>>>
>>>>>>>>>            -          goto unlock_out;
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>            -  r = chash_table_copy_in(&adev->irq.ih.faults->hash,
>>>>>>>>> key, NULL);
>>>>>>>>>
>>>>>>>>>            -  if (!r)
>>>>>>>>>
>>>>>>>>>            -          adev->irq.ih.faults->count++;
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>            -  /* chash_table_copy_in should never fail unless
>>>>>>>>> we're
>>>>>>>>> losing count */
>>>>>>>>>
>>>>>>>>>            -  WARN_ON_ONCE(r < 0);
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>            -unlock_out:
>>>>>>>>>
>>>>>>>>>            - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>>>> flags);
>>>>>>>>>
>>>>>>>>>            -  return r;
>>>>>>>>>
>>>>>>>>>            -}
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>            -/**
>>>>>>>>>
>>>>>>>>>            - * amdgpu_ih_clear_fault - Remove a page fault record
>>>>>>>>>
>>>>>>>>>            - *
>>>>>>>>>
>>>>>>>>>            - * @adev: amdgpu device pointer
>>>>>>>>>
>>>>>>>>>            - * @key: 64-bit encoding of PASID and address
>>>>>>>>>
>>>>>>>>>            - *
>>>>>>>>>
>>>>>>>>>            - * This should be called when a page fault has been
>>>>>>>>> handled. Any
>>>>>>>>>
>>>>>>>>>            - * future interrupt with this key will be
>>>>>>>>> processed as a
>>>>>>>>> new
>>>>>>>>>
>>>>>>>>>            - * page fault.
>>>>>>>>>
>>>>>>>>>            - */
>>>>>>>>>
>>>>>>>>>            -void amdgpu_ih_clear_fault(struct amdgpu_device
>>>>>>>>> *adev,
>>>>>>>>> u64 key)
>>>>>>>>>
>>>>>>>>>            -{
>>>>>>>>>
>>>>>>>>>            -  unsigned long flags;
>>>>>>>>>
>>>>>>>>>            -  int r;
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>            -  if (!adev->irq.ih.faults)
>>>>>>>>>
>>>>>>>>>            -          return;
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>            - spin_lock_irqsave(&adev->irq.ih.faults->lock,
>>>>>>>>> flags);
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>            -  r = chash_table_remove(&adev->irq.ih.faults->hash,
>>>>>>>>> key, NULL);
>>>>>>>>>
>>>>>>>>>            -  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>>>
>>>>>>>>>            -          adev->irq.ih.faults->count--;
>>>>>>>>>
>>>>>>>>>            - WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>>>>>>>
>>>>>>>>>            -  }
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>            - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>>>> flags);
>>>>>>>>>
>>>>>>>>>            -}
>>>>>>>>>
>>>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>
>>>>>>>>>            index a23e1c0..f411ffb 100644
>>>>>>>>>
>>>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>
>>>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>
>>>>>>>>>            @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>>>>>>>
>>>>>>>>>             #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>>>>>>>
>>>>>>>>>             #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>>>
>>>>>>>>>            -struct amdgpu_retryfault_hashtable {
>>>>>>>>>
>>>>>>>>>            -  DECLARE_CHASH_TABLE(hash,
>>>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>>>>> 8, 0);
>>>>>>>>>
>>>>>>>>>            -  spinlock_t     lock;
>>>>>>>>>
>>>>>>>>>            -  int            count;
>>>>>>>>>
>>>>>>>>>            -};
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>             /*
>>>>>>>>>
>>>>>>>>>              * R6xx+ IH ring
>>>>>>>>>
>>>>>>>>>              */
>>>>>>>>>
>>>>>>>>>            @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>>>>>>>
>>>>>>>>>               bool                   use_doorbell;
>>>>>>>>>
>>>>>>>>>               bool                   use_bus_addr;
>>>>>>>>>
>>>>>>>>>               dma_addr_t             rb_dma_addr; /* only used
>>>>>>>>> when
>>>>>>>>> use_bus_addr = true */
>>>>>>>>>
>>>>>>>>>            -  struct amdgpu_retryfault_hashtable *faults;
>>>>>>>>>
>>>>>>>>>             };
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>             #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>>>>>>>
>>>>>>>>>            @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct
>>>>>>>>> amdgpu_device *adev, unsigned ring_size,
>>>>>>>>>
>>>>>>>>>                               bool use_bus_addr);
>>>>>>>>>
>>>>>>>>>             void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>>>>>>>>
>>>>>>>>>             int amdgpu_ih_process(struct amdgpu_device *adev);
>>>>>>>>>
>>>>>>>>>            -int amdgpu_ih_add_fault(struct amdgpu_device
>>>>>>>>> *adev, u64
>>>>>>>>> key);
>>>>>>>>>
>>>>>>>>>            -void amdgpu_ih_clear_fault(struct amdgpu_device
>>>>>>>>> *adev,
>>>>>>>>> u64 key);
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>             #endif
>>>>>>>>>
>>>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>
>>>>>>>>>            index 1d7e3c1..8b220e9 100644
>>>>>>>>>
>>>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>
>>>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>
>>>>>>>>>            @@ -2692,6 +2692,22 @@ void
>>>>>>>>> amdgpu_vm_adjust_size(struct
>>>>>>>>> amdgpu_device *adev, uint32_t min_vm_size,
>>>>>>>>>
>>>>>>>>>                        adev->vm_manager.fragment_size);
>>>>>>>>>
>>>>>>>>>             }
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            +static struct amdgpu_retryfault_hashtable
>>>>>>>>> *init_fault_hash(void)
>>>>>>>>>
>>>>>>>>>            +{
>>>>>>>>>
>>>>>>>>>            +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +  fault_hash = kmalloc(sizeof(*fault_hash),
>>>>>>>>> GFP_KERNEL);
>>>>>>>>>
>>>>>>>>>            +  if (!fault_hash)
>>>>>>>>>
>>>>>>>>>            +          return fault_hash;
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +  INIT_CHASH_TABLE(fault_hash->hash,
>>>>>>>>>
>>>>>>>>>            +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>>>
>>>>>>>>>            +  spin_lock_init(&fault_hash->lock);
>>>>>>>>>
>>>>>>>>>            +  fault_hash->count = 0;
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +  return fault_hash;
>>>>>>>>>
>>>>>>>>>            +}
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>             /**
>>>>>>>>>
>>>>>>>>>              * amdgpu_vm_init - initialize a vm instance
>>>>>>>>>
>>>>>>>>>              *
>>>>>>>>>
>>>>>>>>>            @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct
>>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>>>>>>
>>>>>>>>>                       vm->pasid = pasid;
>>>>>>>>>
>>>>>>>>>               }
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            +  vm->fault_hash = init_fault_hash();
>>>>>>>>>
>>>>>>>>>            +  if (!vm->fault_hash) {
>>>>>>>>>
>>>>>>>>>            +          r = -ENOMEM;
>>>>>>>>>
>>>>>>>>>            +          goto error_free_root;
>>>>>>>>>
>>>>>>>>>            +  }
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>               INIT_KFIFO(vm->faults);
>>>>>>>>>
>>>>>>>>>               vm->fault_credit = 16;
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct
>>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>               /* Clear pending page faults from IH when the VM is
>>>>>>>>> destroyed */
>>>>>>>>>
>>>>>>>>>               while (kfifo_get(&vm->faults, &fault))
>>>>>>>>>
>>>>>>>>>            -          amdgpu_ih_clear_fault(adev, fault);
>>>>>>>>>
>>>>>>>>>            +          amdgpu_vm_clear_fault(vm->fault_hash,
>>>>>>>>> fault);
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>               if (vm->pasid) {
>>>>>>>>>
>>>>>>>>>                       unsigned long flags;
>>>>>>>>>
>>>>>>>>>            @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct
>>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>>>
>>>>>>>>> spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>>>>>>>
>>>>>>>>>               }
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            +  kfree(vm->fault_hash);
>>>>>>>>>
>>>>>>>>>            +  vm->fault_hash = NULL;
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>               drm_sched_entity_destroy(&vm->entity);
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>               if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>>>>>>>
>>>>>>>>>            @@ -3183,3 +3208,78 @@ void
>>>>>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>>>>>>>
>>>>>>>>>                       }
>>>>>>>>>
>>>>>>>>>               }
>>>>>>>>>
>>>>>>>>>             }
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +/**
>>>>>>>>>
>>>>>>>>>            + * amdgpu_vm_add_fault - Add a page fault record to
>>>>>>>>> fault hash table
>>>>>>>>>
>>>>>>>>>            + *
>>>>>>>>>
>>>>>>>>>            + * @fault_hash: fault hash table
>>>>>>>>>
>>>>>>>>>            + * @key: 64-bit encoding of PASID and address
>>>>>>>>>
>>>>>>>>>            + *
>>>>>>>>>
>>>>>>>>>            + * This should be called when a retry page fault
>>>>>>>>> interrupt is
>>>>>>>>>
>>>>>>>>>            + * received. If this is a new page fault, it will be
>>>>>>>>> added to a hash
>>>>>>>>>
>>>>>>>>>            + * table. The return value indicates whether this
>>>>>>>>> is a
>>>>>>>>> new fault, or
>>>>>>>>>
>>>>>>>>>            + * a fault that was already known and is already
>>>>>>>>> being
>>>>>>>>> handled.
>>>>>>>>>
>>>>>>>>>            + *
>>>>>>>>>
>>>>>>>>>            + * If there are too many pending page faults, this
>>>>>>>>> will
>>>>>>>>> fail. Retry
>>>>>>>>>
>>>>>>>>>            + * interrupts should be ignored in this case until
>>>>>>>>> there
>>>>>>>>> is enough
>>>>>>>>>
>>>>>>>>>            + * free space.
>>>>>>>>>
>>>>>>>>>            + *
>>>>>>>>>
>>>>>>>>>            + * Returns 0 if the fault was added, 1 if the
>>>>>>>>> fault was
>>>>>>>>> already known,
>>>>>>>>>
>>>>>>>>>            + * -ENOSPC if there are too many pending faults.
>>>>>>>>>
>>>>>>>>>            + */
>>>>>>>>>
>>>>>>>>>            +int amdgpu_vm_add_fault(struct
>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>>>
>>>>>>>>>            +{
>>>>>>>>>
>>>>>>>>>            +  unsigned long flags;
>>>>>>>>>
>>>>>>>>>            +  int r = -ENOSPC;
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +  if (WARN_ON_ONCE(!fault_hash))
>>>>>>>>>
>>>>>>>>>            +          /* Should be allocated in amdgpu_vm_init
>>>>>>>>>
>>>>>>>>>            +           */
>>>>>>>>>
>>>>>>>>>            +          return r;
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +  /* Only let the hash table fill up to 50% for best
>>>>>>>>> performance */
>>>>>>>>>
>>>>>>>>>            +  if (fault_hash->count >= (1 <<
>>>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>>>
>>>>>>>>>            +          goto unlock_out;
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +  r = chash_table_copy_in(&fault_hash->hash, key,
>>>>>>>>> NULL);
>>>>>>>>>
>>>>>>>>>            +  if (!r)
>>>>>>>>>
>>>>>>>>>            +          fault_hash->count++;
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +  /* chash_table_copy_in should never fail unless
>>>>>>>>> we're
>>>>>>>>> losing count */
>>>>>>>>>
>>>>>>>>>            +  WARN_ON_ONCE(r < 0);
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +unlock_out:
>>>>>>>>>
>>>>>>>>>            +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>>>
>>>>>>>>>            +  return r;
>>>>>>>>>
>>>>>>>>>            +}
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +/**
>>>>>>>>>
>>>>>>>>>            + * amdgpu_vm_clear_fault - Remove a page fault record
>>>>>>>>>
>>>>>>>>>            + *
>>>>>>>>>
>>>>>>>>>            + * @fault_hash: fault hash table
>>>>>>>>>
>>>>>>>>>            + * @key: 64-bit encoding of PASID and address
>>>>>>>>>
>>>>>>>>>            + *
>>>>>>>>>
>>>>>>>>>            + * This should be called when a page fault has been
>>>>>>>>> handled. Any
>>>>>>>>>
>>>>>>>>>            + * future interrupt with this key will be
>>>>>>>>> processed as a
>>>>>>>>> new
>>>>>>>>>
>>>>>>>>>            + * page fault.
>>>>>>>>>
>>>>>>>>>            + */
>>>>>>>>>
>>>>>>>>>            +void amdgpu_vm_clear_fault(struct
>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>>>
>>>>>>>>>            +{
>>>>>>>>>
>>>>>>>>>            +  unsigned long flags;
>>>>>>>>>
>>>>>>>>>            +  int r;
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +  if (!fault_hash)
>>>>>>>>>
>>>>>>>>>            +          return;
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +  r = chash_table_remove(&fault_hash->hash, key,
>>>>>>>>> NULL);
>>>>>>>>>
>>>>>>>>>            +  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>>>
>>>>>>>>>            +          fault_hash->count--;
>>>>>>>>>
>>>>>>>>>            +          WARN_ON_ONCE(fault_hash->count < 0);
>>>>>>>>>
>>>>>>>>>            +  }
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>>>
>>>>>>>>>            +}
>>>>>>>>>
>>>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>
>>>>>>>>>            index e275ee7..6eb1da1 100644
>>>>>>>>>
>>>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>
>>>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>
>>>>>>>>>            @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>>>>>>>
>>>>>>>>>               pid_t   tgid;
>>>>>>>>>
>>>>>>>>>             };
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>>>
>>>>>>>>>            +struct amdgpu_retryfault_hashtable {
>>>>>>>>>
>>>>>>>>>            +  DECLARE_CHASH_TABLE(hash,
>>>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>>>>> 8, 0);
>>>>>>>>>
>>>>>>>>>            +  spinlock_t     lock;
>>>>>>>>>
>>>>>>>>>            +  int            count;
>>>>>>>>>
>>>>>>>>>            +};
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>             struct amdgpu_vm {
>>>>>>>>>
>>>>>>>>>               /* tree of virtual addresses mapped */
>>>>>>>>>
>>>>>>>>>               struct rb_root_cached  va;
>>>>>>>>>
>>>>>>>>>            @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>>>>>>>
>>>>>>>>>               struct ttm_lru_bulk_move lru_bulk_move;
>>>>>>>>>
>>>>>>>>>               /* mark whether can do the bulk move */
>>>>>>>>>
>>>>>>>>>               bool                   bulk_moveable;
>>>>>>>>>
>>>>>>>>>            +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>>>
>>>>>>>>>             };
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>             struct amdgpu_vm_manager {
>>>>>>>>>
>>>>>>>>>            @@ -355,4 +363,8 @@ void
>>>>>>>>> amdgpu_vm_set_task_info(struct
>>>>>>>>> amdgpu_vm *vm);
>>>>>>>>>
>>>>>>>>>             void amdgpu_vm_move_to_lru_tail(struct amdgpu_device
>>>>>>>>> *adev,
>>>>>>>>>
>>>>>>>>>                                      struct amdgpu_vm *vm);
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            +int amdgpu_vm_add_fault(struct
>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +void amdgpu_vm_clear_fault(struct
>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>             #endif
>>>>>>>>>
>>>>>>>>>            diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>
>>>>>>>>>            index 5ae5ed2..acbe5a7 100644
>>>>>>>>>
>>>>>>>>>            --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>
>>>>>>>>>            +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>
>>>>>>>>>            @@ -265,35 +265,36 @@ static bool
>>>>>>>>> vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>>>>>>>
>>>>>>>>>                       return true;
>>>>>>>>>
>>>>>>>>>               }
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>>>
>>>>>>>>>            -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>>>
>>>>>>>>>            -  r = amdgpu_ih_add_fault(adev, key);
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>            -  /* Hash table is full or the fault is already being
>>>>>>>>> processed,
>>>>>>>>>
>>>>>>>>>            -   * ignore further page faults
>>>>>>>>>
>>>>>>>>>            -   */
>>>>>>>>>
>>>>>>>>>            -  if (r != 0)
>>>>>>>>>
>>>>>>>>>            -          goto ignore_iv;
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>               /* Track retry faults in per-VM fault FIFO. */
>>>>>>>>>
>>>>>>>>>               spin_lock(&adev->vm_manager.pasid_lock);
>>>>>>>>>
>>>>>>>>>               vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>>>>>>>
>>>>>>>>>            +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>>>
>>>>>>>>>            +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>>>
>>>>>>>>>               if (!vm) {
>>>>>>>>>
>>>>>>>>>                       /* VM not found, process it normally */
>>>>>>>>>
>>>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>
>>>>>>>>>            -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>>>
>>>>>>>>>                       return true;
>>>>>>>>>
>>>>>>>>>            +  } else {
>>>>>>>>>
>>>>>>>>>            +          r = amdgpu_vm_add_fault(vm->fault_hash,
>>>>>>>>> key);
>>>>>>>>>
>>>>>>>>>            +
>>>>>>>>>
>>>>>>>>>            +          /* Hash table is full or the fault is
>>>>>>>>> already
>>>>>>>>> being processed,
>>>>>>>>>
>>>>>>>>>            +           * ignore further page faults
>>>>>>>>>
>>>>>>>>>            +           */
>>>>>>>>>
>>>>>>>>>            +          if (r != 0) {
>>>>>>>>>
>>>>>>>>>            + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>
>>>>>>>>>            +                  goto ignore_iv;
>>>>>>>>>
>>>>>>>>>            +          }
>>>>>>>>>
>>>>>>>>>               }
>>>>>>>>>
>>>>>>>>>               /* No locking required with single writer and
>>>>>>>>> single
>>>>>>>>> reader */
>>>>>>>>>
>>>>>>>>>               r = kfifo_put(&vm->faults, key);
>>>>>>>>>
>>>>>>>>>               if (!r) {
>>>>>>>>>
>>>>>>>>>                       /* FIFO is full. Ignore it until there is
>>>>>>>>> space */
>>>>>>>>>
>>>>>>>>>            +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>>>>>>>>>
>>>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>
>>>>>>>>>            -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>>>
>>>>>>>>>                       goto ignore_iv;
>>>>>>>>>
>>>>>>>>>               }
>>>>>>>>>
>>>>>>>>>            - spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>
>>>>>>>>>               /* It's the first fault for this address,
>>>>>>>>> process it
>>>>>>>>> normally */
>>>>>>>>>
>>>>>>>>>               return true;
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void
>>>>>>>>> *handle)
>>>>>>>>>
>>>>>>>>>               adev->irq.ih.use_doorbell = true;
>>>>>>>>>
>>>>>>>>>               adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH
>>>>>>>>> << 1;
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            -  adev->irq.ih.faults =
>>>>>>>>> kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>>>>>>>
>>>>>>>>>            -  if (!adev->irq.ih.faults)
>>>>>>>>>
>>>>>>>>>            -          return -ENOMEM;
>>>>>>>>>
>>>>>>>>>            - INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>>>>>>>
>>>>>>>>>            -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>>>
>>>>>>>>>            - spin_lock_init(&adev->irq.ih.faults->lock);
>>>>>>>>>
>>>>>>>>>            -  adev->irq.ih.faults->count = 0;
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>               r = amdgpu_irq_init(adev);
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>               return r;
>>>>>>>>>
>>>>>>>>>            @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void
>>>>>>>>> *handle)
>>>>>>>>>
>>>>>>>>>               amdgpu_irq_fini(adev);
>>>>>>>>>
>>>>>>>>>               amdgpu_ih_ring_fini(adev);
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>            -  kfree(adev->irq.ih.faults);
>>>>>>>>>
>>>>>>>>>            -  adev->irq.ih.faults = NULL;
>>>>>>>>>
>>>>>>>>>            -
>>>>>>>>>
>>>>>>>>>               return 0;
>>>>>>>>>
>>>>>>>>>             }
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>        _______________________________________________
>>>>>>>>>
>>>>>>>>>        amd-gfx mailing list
>>>>>>>>>
>>>>>>>>>        amd-gfx@lists.freedesktop.org
>>>>>>>>> <mailto:amd-gfx@lists.freedesktop.org>
>>>>>>>>>
>>>>>>>>>        https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>>
>>>>>>>>>
>>>>>>>> _______________________________________________
>>>>>>>> amd-gfx mailing list
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>> _______________________________________________
>>>>>> amd-gfx mailing list
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                                                 ` <2e37f4d8-9f64-5412-8fa3-9ef1a12c4001-5C7GfCeVMHo@public.gmane.org>
@ 2018-09-12 15:59                                                   ` Christian König
       [not found]                                                     ` <0fa9086e-49c7-580e-2611-4878c92899f5-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Christian König @ 2018-09-12 15:59 UTC (permalink / raw)
  To: Felix Kuehling, Zeng, Oak, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

Am 12.09.2018 um 17:29 schrieb Felix Kuehling:
> On 2018-09-12 02:56 AM, Christian König wrote:
>> Am 12.09.2018 um 00:00 schrieb Felix Kuehling:
>>> On 2018-09-11 03:19 AM, Christian König wrote:
>>>> Hi Felix,
>>>>
>>>> let me try to explain the problem on an example:
>>>>
>>>> 1. We have a running job which needs recoverable page faults for
>>>> accessing the process address space.
>>>> 2. We run into memory pressure on VRAM and start to evict things.
>>>> 3. A page tables of the running job is picked up for eviction.
>>>> 4. We schedule a copy command to move the content of the page table to
>>>> system memory.
>>> I think we should change this. If you evict a page table, you don't need
>>> to preserve its contents. You should be able to restore the page table
>>> contents from scratch when you handle the page fault that restores it.
>> Yeah, already implemented. You actually don't need the page fault for
>> that.
>>
>>>> 5. The new system memory location of the page table is noted in its BO.
>>> You mean in the parent page table? You can just invalidate the entry in
>>> the parent page table and let it fault.
>> I'm repeating myself, but exactly that is what won't work.
>>
>> See we still have engines which can't handle page faults which uses
>> the same VM at the same time. This means that we can't just fault in
>> page tables.
> And I don't understand why that is a problem. Those clients rely on
> fences to keep their BOs resident, including the page tables. Are you
> planning to change that?

No, but how do you want to swap out page tables when there is a fence added?

Or do you want to stop adding fences to page tables for engines with 
recoverable faults?

I think that is completely unrealistic considering the performance 
penalty of faults.

At least for currently available hardware we should limit page faults to 
be used in as few cases as possible, e.g. SVM and userptr.

Regards,
Christian.

>
> Regards,
>    Felix
>
>> What we could do is to separate the address space into a low and a
>> high range where we have different handling for both.
>>
>> I've already prototyped this and prepared mostly everything necessary
>> for that, but that is still not 100% completed.
>>
>> Regards,
>> Christian.
>>
>>>> 6. We get a fault from the job and swap in the page from the process
>>>> address space.
>>> No. Then you need to keep updating page tables that are swapped out.
>>> Instead just discard them and rebuild on fault.
>>>
>>>> 7. Now we need to enter the new page address into the page table ->
>>>> *BAM*
>>>>
>>>> The problem is now that we don't know the address of the page table
>>>> because the current location was replaced with the future location in
>>>> step #5.
>>> You solve that problem by discarding page tables that are swapped out.
>>> Then there is no future location.
>>>
>>>> We could now argue that we could update the page tables on the fly for
>>>> the evicted page and never wait for the current job to finish, but
>>>> this won't work because not all engines can handle that.
>>> Engines that can't handle page faults will depend on fences to keep
>>> their page tables resident, like we do today. So they won't have this
>>> problem.
>>>
>>> Regards,
>>>     Felix
>>>
>>>> I will circumvent this problem for now by blocking the eviction before
>>>> step #5. The issue with that is that this pipelining is responsible
>>>> for nearly 20% of the fps in some testcases.
>>>>
>>>> I hope that it won't be that bad when I limit disabling the pipelining
>>>> to only page tables, but it is certainly possible that we will get
>>>> pushback on this.
>>>>
>>>> If that happens we probably need to add some flags to limit this
>>>> workaround even more to only the root PD and all the PDs/PTs which are
>>>> involved in recoverable faults.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 10.09.2018 um 21:10 schrieb Felix Kuehling:
>>>>> I'm not sure why you need to distinguish current and future state when
>>>>> dealing with page faults. When you get a page fault, you know that the
>>>>> GPU is trying to access memory right now, in the present. So you're
>>>>> always working with the current state. When the CPU page table
>>>>> changes,
>>>>> you get an MMU notifier that allows you to invalidate the
>>>>> corresponding
>>>>> GPU PTEs. If you have a valid GPU PTE, it always represents the
>>>>> current
>>>>> states and is in sync with the CPU page table. If the GPU page
>>>>> table is
>>>>> ever outdated, it should have an invalid entry in that place.
>>>>>
>>>>> If SDMA is used to update the GPU PTEs, there is a delay. The MMU
>>>>> notifier is synchronous, so it shouldn't be a problem. You just
>>>>> wait for
>>>>> the SDMA job to complete before returning. When updating PTEs with new
>>>>> valid addresses, it's asynchronous. But the GPU will continue retrying
>>>>> on the invalid entry until SDMA finishes. So it's also implicitly
>>>>> synchronized on the GPU side.
>>>>>
>>>>> Regards,
>>>>>      Felix
>>>>>
>>>>>
>>>>> On 2018-09-10 05:42 AM, Christian König wrote:
>>>>>> Hi Felix & Oak,
>>>>>>
>>>>>> over the weekend I had the idea that we could just use the shadow BOs
>>>>>> to have the current state in a page fault. They are GTT BOs and CPU
>>>>>> accessible anyway.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 08.09.2018 um 09:34 schrieb Christian König:
>>>>>>> Hi Felix,
>>>>>>>
>>>>>>>> But why do you want to update page tables when there is no more
>>>>>>>> user
>>>>>>>> mode context that cares about them? Is this just to allow pending
>>>>>>>> work
>>>>>>>> to complete after the user mode context is gone? To prevent hangs?
>>>>>>> The problem I'm see is that we still don't have a reliable way of
>>>>>>> killing a command submission while making sure that no more
>>>>>>> faults of
>>>>>>> it are in the flight.
>>>>>>>
>>>>>>> E.g. the HWS has a function to kill a running job on the hardware,
>>>>>>> but that only works for compute and there is no way of telling when
>>>>>>> that operation has finished.
>>>>>>>
>>>>>>> But Oak has already convinced me that we should probably work on
>>>>>>> that
>>>>>>> instead of trying to keep the VM alive as long as possible.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
>>>>>>>> Hi Christian,
>>>>>>>>
>>>>>>>> I'm not sure I get your point here.
>>>>>>>>
>>>>>>>> As I understand it, the amdgpu_vm structure represents the user
>>>>>>>> mode VM
>>>>>>>> context. It has the pointers to the root page directory and the
>>>>>>>> rest of
>>>>>>>> the page table hierarchy. If there is no amdgpu_vm, there is no
>>>>>>>> user
>>>>>>>> mode context that cares about the page tables.
>>>>>>>>
>>>>>>>> If we leave the page table pointers in the amdgpu_vm, then handling
>>>>>>>> faults after the VM is destroyed is pointless. We can't find the
>>>>>>>> page
>>>>>>>> tables and we can't update them, so there is nothing we can do in
>>>>>>>> response to VM faults.
>>>>>>>>
>>>>>>>> So I'm guessing that you want to move the page table pointers
>>>>>>>> into a
>>>>>>>> different structure that exists half-independently of the VM. It
>>>>>>>> would
>>>>>>>> be created when the VM is created (that's where we currently
>>>>>>>> allocate
>>>>>>>> the PASID) but can be destroyed slightly later.
>>>>>>>>
>>>>>>>> But why do you want to update page tables when there is no more
>>>>>>>> user
>>>>>>>> mode context that cares about them? Is this just to allow pending
>>>>>>>> work
>>>>>>>> to complete after the user mode context is gone? To prevent hangs?
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>>       Felix
>>>>>>>>
>>>>>>>> On 2018-09-10 07:20 AM, Christian König wrote:
>>>>>>>>>> Am I right that the handling of page fault need a valid VM?
>>>>>>>>> No, exactly that view is incorrect.
>>>>>>>>>
>>>>>>>>> The VM is meant to be a memory management structure of page
>>>>>>>>> tables and
>>>>>>>>> is completely pointless for fault processing because it represents the
>>>>>>>>> future state of the page tables and not the current one.
>>>>>>>>>
>>>>>>>>> What we need for fault processing is a new structure built around
>>>>>>>>> PASIDs which is fed with addresses when page tables are
>>>>>>>>> moved
>>>>>>>>> around.
>>>>>>>>>
>>>>>>>>> Alternatively I hope that we can use the SDMA to walk the page
>>>>>>>>> tables
>>>>>>>>> and update the required entries by just using the address.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>>>>>>>>> Hi Christian,
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> What is the value of delaying the destruction of the hash until
>>>>>>>>>> after the vm is destroyed? Since the vm is destroyed, you basically
>>>>>>>>>> don't have enough information to page in the correct page to gpuvm.
>>>>>>>>>> Am I right that the handling of a page fault needs a valid VM? For
>>>>>>>>>> example, you need the VM to get the corresponding physical address
>>>>>>>>>> of the faulty page.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> After the vm is destroyed, the retry interrupt can be directly
>>>>>>>>>> discarded, as you can't find the vm through its pasid. You can
>>>>>>>>>> think of this as also being one
>>>>>>>>>> kind
>>>>>>>>>> of prescreen.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> So I stand by the point that there is no need to delay the
>>>>>>>>>> destruction of the hash until after the vm is destroyed. The
>>>>>>>>>> prescreening hash
>>>>>>>>>> can be destroyed at the time of vm_fini.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Thanks,
>>>>>>>>>>
>>>>>>>>>> Oak
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>>>>>>>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng
>>>>>>>>>> <zengshanjun@gmail.com>;
>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>>>>>>>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>>>>>>>>>> amdgpu vm
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Hi Oak,
>>>>>>>>>>
>>>>>>>>>> yeah, but this is still a NAK. Making the hash per PASID was
>>>>>>>>>> not a
>>>>>>>>>> suggestion but rather a must have.
>>>>>>>>>>
>>>>>>>>>> The VM structures must be destroyed while the processing is still
>>>>>>>>>> ongoing, or otherwise we would not have a clean OOM handling.
>>>>>>>>>>
>>>>>>>>>> The IDR for PASID lockup can be put into amdgpu_ids.c, you can
>>>>>>>>>> just
>>>>>>>>>> replace the IDA already used there with an IDR.
>>>>>>>>>>
>>>>>>>>>> Since the PASID is already freed up delayed we should have the
>>>>>>>>>> grace
>>>>>>>>>> period for interrupt processing included. If that still isn't
>>>>>>>>>> sufficient we can still add some delayed work for this.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>>>>>>>>
>>>>>>>>>>         Hi Christian,
>>>>>>>>>>
>>>>>>>>>>         In this implementation, fault hash is made per vm, not per
>>>>>>>>>> pasid
>>>>>>>>>>         as suggested, based on below considerations:
>>>>>>>>>>
>>>>>>>>>>           * Delay the destroy of hash introduces more effort like
>>>>>>>>>> how to
>>>>>>>>>>             set the proper grace period after which no retry
>>>>>>>>>> interrupt
>>>>>>>>>>             will be generated. We want to avoid those complication
>>>>>>>>>> if we
>>>>>>>>>>             can.
>>>>>>>>>>           * The  problem of the per vm implementation is that it
>>>>>>>>>> is hard
>>>>>>>>>>             to delay the destroy of fault hash, because when vm is
>>>>>>>>>>             destroyed, prescreen function won't be able to
>>>>>>>>>> retrieve the
>>>>>>>>>>             fault hash. But in this case, the prescreen
>>>>>>>>>> function can
>>>>>>>>>>             either let the interrupt go through (to gmc) or ignore
>>>>>>>>>> it.
>>>>>>>>>>             From this perspective, it is not very necessary to
>>>>>>>>>> delay the
>>>>>>>>>>             destroy of hash table.
>>>>>>>>>>           * On the other hand, since ASICs after vega10 have the
>>>>>>>>>> HW CAM
>>>>>>>>>>             feature. So the SW IV prescreen is only useful for
>>>>>>>>>> vega10.
>>>>>>>>>>             For all the HW implemented CAM, we can consider the
>>>>>>>>>> delayed
>>>>>>>>>>             CAM acknowledgment.
>>>>>>>>>>           * Making it per pasid need to introduce another idr to
>>>>>>>>>>             correlate pasid and hash table - where to put the idr?
>>>>>>>>>> You
>>>>>>>>>>             will have to make it a global variable which is not
>>>>>>>>>> very
>>>>>>>>>>             desirable.
>>>>>>>>>>
>>>>>>>>>>         The main purpose of this patch is to solve the
>>>>>>>>>> inter-process
>>>>>>>>>> lock
>>>>>>>>>>         issue (when hash table is full), while still keep codes
>>>>>>>>>> simple.
>>>>>>>>>>
>>>>>>>>>>         Also with this patch, the faults kfifo is not necessary
>>>>>>>>>> any
>>>>>>>>>> more.
>>>>>>>>>>         See patch 2.
>>>>>>>>>>
>>>>>>>>>>         Regards,
>>>>>>>>>>
>>>>>>>>>>         Oak
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>         On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>>>>>>>>
>>>>>>>>>>             In stead of share one fault hash table per device,
>>>>>>>>>> make it
>>>>>>>>>>
>>>>>>>>>>             per vm. This can avoid inter-process lock issue when
>>>>>>>>>> fault
>>>>>>>>>>
>>>>>>>>>>             hash table is full.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>>>>>>>>
>>>>>>>>>>             Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>>>>>>>>>> <mailto:Oak.Zeng@amd.com>
>>>>>>>>>>
>>>>>>>>>>             Suggested-by: Christian Konig
>>>>>>>>>> <Christian.Koenig@amd.com>
>>>>>>>>>> <mailto:Christian.Koenig@amd.com>
>>>>>>>>>>
>>>>>>>>>>             Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>>>>>>> <mailto:Felix.Kuehling@amd.com>
>>>>>>>>>>
>>>>>>>>>>             ---
>>>>>>>>>>
>>>>>>>>>>              drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75
>>>>>>>>>> ------------------------
>>>>>>>>>>
>>>>>>>>>>              drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>>>>>>>>
>>>>>>>>>>              drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102
>>>>>>>>>> ++++++++++++++++++++++++++++++++-
>>>>>>>>>>
>>>>>>>>>>              drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>>>>>>>>
>>>>>>>>>>              drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38
>>>>>>>>>> +++++-------
>>>>>>>>>>
>>>>>>>>>>              5 files changed, 127 insertions(+), 110 deletions(-)
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>>
>>>>>>>>>>             index 06373d4..4ed8621 100644
>>>>>>>>>>
>>>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>>
>>>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>>
>>>>>>>>>>             @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct
>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>
>>>>>>>>>>                return IRQ_HANDLED;
>>>>>>>>>>
>>>>>>>>>>              }
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             -/**
>>>>>>>>>>
>>>>>>>>>>             - * amdgpu_ih_add_fault - Add a page fault record
>>>>>>>>>>
>>>>>>>>>>             - *
>>>>>>>>>>
>>>>>>>>>>             - * @adev: amdgpu device pointer
>>>>>>>>>>
>>>>>>>>>>             - * @key: 64-bit encoding of PASID and address
>>>>>>>>>>
>>>>>>>>>>             - *
>>>>>>>>>>
>>>>>>>>>>             - * This should be called when a retry page fault
>>>>>>>>>> interrupt is
>>>>>>>>>>
>>>>>>>>>>             - * received. If this is a new page fault, it will be
>>>>>>>>>> added to a hash
>>>>>>>>>>
>>>>>>>>>>             - * table. The return value indicates whether this
>>>>>>>>>> is a
>>>>>>>>>> new fault, or
>>>>>>>>>>
>>>>>>>>>>             - * a fault that was already known and is already
>>>>>>>>>> being
>>>>>>>>>> handled.
>>>>>>>>>>
>>>>>>>>>>             - *
>>>>>>>>>>
>>>>>>>>>>             - * If there are too many pending page faults, this
>>>>>>>>>> will
>>>>>>>>>> fail. Retry
>>>>>>>>>>
>>>>>>>>>>             - * interrupts should be ignored in this case until
>>>>>>>>>> there
>>>>>>>>>> is enough
>>>>>>>>>>
>>>>>>>>>>             - * free space.
>>>>>>>>>>
>>>>>>>>>>             - *
>>>>>>>>>>
>>>>>>>>>>             - * Returns 0 if the fault was added, 1 if the
>>>>>>>>>> fault was
>>>>>>>>>> already known,
>>>>>>>>>>
>>>>>>>>>>             - * -ENOSPC if there are too many pending faults.
>>>>>>>>>>
>>>>>>>>>>             - */
>>>>>>>>>>
>>>>>>>>>>             -int amdgpu_ih_add_fault(struct amdgpu_device
>>>>>>>>>> *adev, u64
>>>>>>>>>> key)
>>>>>>>>>>
>>>>>>>>>>             -{
>>>>>>>>>>
>>>>>>>>>>             -  unsigned long flags;
>>>>>>>>>>
>>>>>>>>>>             -  int r = -ENOSPC;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>>>>>>>>
>>>>>>>>>>             -          /* Should be allocated in
>>>>>>>>>> <IP>_ih_sw_init on
>>>>>>>>>> GPUs that
>>>>>>>>>>
>>>>>>>>>>             -           * support retry faults and require retry
>>>>>>>>>> filtering.
>>>>>>>>>>
>>>>>>>>>>             -           */
>>>>>>>>>>
>>>>>>>>>>             -          return r;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             - spin_lock_irqsave(&adev->irq.ih.faults->lock,
>>>>>>>>>> flags);
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  /* Only let the hash table fill up to 50% for best
>>>>>>>>>> performance */
>>>>>>>>>>
>>>>>>>>>>             -  if (adev->irq.ih.faults->count >= (1 <<
>>>>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>>>>
>>>>>>>>>>             -          goto unlock_out;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  r = chash_table_copy_in(&adev->irq.ih.faults->hash,
>>>>>>>>>> key, NULL);
>>>>>>>>>>
>>>>>>>>>>             -  if (!r)
>>>>>>>>>>
>>>>>>>>>>             -          adev->irq.ih.faults->count++;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  /* chash_table_copy_in should never fail unless
>>>>>>>>>> we're
>>>>>>>>>> losing count */
>>>>>>>>>>
>>>>>>>>>>             -  WARN_ON_ONCE(r < 0);
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -unlock_out:
>>>>>>>>>>
>>>>>>>>>>             - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>>>>> flags);
>>>>>>>>>>
>>>>>>>>>>             -  return r;
>>>>>>>>>>
>>>>>>>>>>             -}
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -/**
>>>>>>>>>>
>>>>>>>>>>             - * amdgpu_ih_clear_fault - Remove a page fault record
>>>>>>>>>>
>>>>>>>>>>             - *
>>>>>>>>>>
>>>>>>>>>>             - * @adev: amdgpu device pointer
>>>>>>>>>>
>>>>>>>>>>             - * @key: 64-bit encoding of PASID and address
>>>>>>>>>>
>>>>>>>>>>             - *
>>>>>>>>>>
>>>>>>>>>>             - * This should be called when a page fault has been
>>>>>>>>>> handled. Any
>>>>>>>>>>
>>>>>>>>>>             - * future interrupt with this key will be
>>>>>>>>>> processed as a
>>>>>>>>>> new
>>>>>>>>>>
>>>>>>>>>>             - * page fault.
>>>>>>>>>>
>>>>>>>>>>             - */
>>>>>>>>>>
>>>>>>>>>>             -void amdgpu_ih_clear_fault(struct amdgpu_device
>>>>>>>>>> *adev,
>>>>>>>>>> u64 key)
>>>>>>>>>>
>>>>>>>>>>             -{
>>>>>>>>>>
>>>>>>>>>>             -  unsigned long flags;
>>>>>>>>>>
>>>>>>>>>>             -  int r;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  if (!adev->irq.ih.faults)
>>>>>>>>>>
>>>>>>>>>>             -          return;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             - spin_lock_irqsave(&adev->irq.ih.faults->lock,
>>>>>>>>>> flags);
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  r = chash_table_remove(&adev->irq.ih.faults->hash,
>>>>>>>>>> key, NULL);
>>>>>>>>>>
>>>>>>>>>>             -  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>>>>
>>>>>>>>>>             -          adev->irq.ih.faults->count--;
>>>>>>>>>>
>>>>>>>>>>             - WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>>>>>>>>
>>>>>>>>>>             -  }
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>>>>> flags);
>>>>>>>>>>
>>>>>>>>>>             -}
>>>>>>>>>>
>>>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>>
>>>>>>>>>>             index a23e1c0..f411ffb 100644
>>>>>>>>>>
>>>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>>
>>>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>>
>>>>>>>>>>             @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>>>>>>>>
>>>>>>>>>>              #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>>>>>>>>
>>>>>>>>>>              #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>>>>
>>>>>>>>>>             -struct amdgpu_retryfault_hashtable {
>>>>>>>>>>
>>>>>>>>>>             -  DECLARE_CHASH_TABLE(hash,
>>>>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>>>>>> 8, 0);
>>>>>>>>>>
>>>>>>>>>>             -  spinlock_t     lock;
>>>>>>>>>>
>>>>>>>>>>             -  int            count;
>>>>>>>>>>
>>>>>>>>>>             -};
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>              /*
>>>>>>>>>>
>>>>>>>>>>               * R6xx+ IH ring
>>>>>>>>>>
>>>>>>>>>>               */
>>>>>>>>>>
>>>>>>>>>>             @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>>>>>>>>
>>>>>>>>>>                bool                   use_doorbell;
>>>>>>>>>>
>>>>>>>>>>                bool                   use_bus_addr;
>>>>>>>>>>
>>>>>>>>>>                dma_addr_t             rb_dma_addr; /* only used
>>>>>>>>>> when
>>>>>>>>>> use_bus_addr = true */
>>>>>>>>>>
>>>>>>>>>>             -  struct amdgpu_retryfault_hashtable *faults;
>>>>>>>>>>
>>>>>>>>>>              };
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>              #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>>>>>>>>
>>>>>>>>>>             @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct
>>>>>>>>>> amdgpu_device *adev, unsigned ring_size,
>>>>>>>>>>
>>>>>>>>>>                                bool use_bus_addr);
>>>>>>>>>>
>>>>>>>>>>              void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>>>>>>>>>
>>>>>>>>>>              int amdgpu_ih_process(struct amdgpu_device *adev);
>>>>>>>>>>
>>>>>>>>>>             -int amdgpu_ih_add_fault(struct amdgpu_device
>>>>>>>>>> *adev, u64
>>>>>>>>>> key);
>>>>>>>>>>
>>>>>>>>>>             -void amdgpu_ih_clear_fault(struct amdgpu_device
>>>>>>>>>> *adev,
>>>>>>>>>> u64 key);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>              #endif
>>>>>>>>>>
>>>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>>
>>>>>>>>>>             index 1d7e3c1..8b220e9 100644
>>>>>>>>>>
>>>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>>
>>>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>>
>>>>>>>>>>             @@ -2692,6 +2692,22 @@ void
>>>>>>>>>> amdgpu_vm_adjust_size(struct
>>>>>>>>>> amdgpu_device *adev, uint32_t min_vm_size,
>>>>>>>>>>
>>>>>>>>>>                         adev->vm_manager.fragment_size);
>>>>>>>>>>
>>>>>>>>>>              }
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             +static struct amdgpu_retryfault_hashtable
>>>>>>>>>> *init_fault_hash(void)
>>>>>>>>>>
>>>>>>>>>>             +{
>>>>>>>>>>
>>>>>>>>>>             +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  fault_hash = kmalloc(sizeof(*fault_hash),
>>>>>>>>>> GFP_KERNEL);
>>>>>>>>>>
>>>>>>>>>>             +  if (!fault_hash)
>>>>>>>>>>
>>>>>>>>>>             +          return fault_hash;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  INIT_CHASH_TABLE(fault_hash->hash,
>>>>>>>>>>
>>>>>>>>>>             +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>>>>
>>>>>>>>>>             +  spin_lock_init(&fault_hash->lock);
>>>>>>>>>>
>>>>>>>>>>             +  fault_hash->count = 0;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  return fault_hash;
>>>>>>>>>>
>>>>>>>>>>             +}
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>              /**
>>>>>>>>>>
>>>>>>>>>>               * amdgpu_vm_init - initialize a vm instance
>>>>>>>>>>
>>>>>>>>>>               *
>>>>>>>>>>
>>>>>>>>>>             @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct
>>>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>>>>>>>
>>>>>>>>>>                        vm->pasid = pasid;
>>>>>>>>>>
>>>>>>>>>>                }
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             +  vm->fault_hash = init_fault_hash();
>>>>>>>>>>
>>>>>>>>>>             +  if (!vm->fault_hash) {
>>>>>>>>>>
>>>>>>>>>>             +          r = -ENOMEM;
>>>>>>>>>>
>>>>>>>>>>             +          goto error_free_root;
>>>>>>>>>>
>>>>>>>>>>             +  }
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>                INIT_KFIFO(vm->faults);
>>>>>>>>>>
>>>>>>>>>>                vm->fault_credit = 16;
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct
>>>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>                /* Clear pending page faults from IH when the VM is
>>>>>>>>>> destroyed */
>>>>>>>>>>
>>>>>>>>>>                while (kfifo_get(&vm->faults, &fault))
>>>>>>>>>>
>>>>>>>>>>             -          amdgpu_ih_clear_fault(adev, fault);
>>>>>>>>>>
>>>>>>>>>>             +          amdgpu_vm_clear_fault(vm->fault_hash,
>>>>>>>>>> fault);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>                if (vm->pasid) {
>>>>>>>>>>
>>>>>>>>>>                        unsigned long flags;
>>>>>>>>>>
>>>>>>>>>>             @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct
>>>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>>>>
>>>>>>>>>> spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>>>>>>>>
>>>>>>>>>>                }
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             +  kfree(vm->fault_hash);
>>>>>>>>>>
>>>>>>>>>>             +  vm->fault_hash = NULL;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>                drm_sched_entity_destroy(&vm->entity);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>                if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>>>>>>>>
>>>>>>>>>>             @@ -3183,3 +3208,78 @@ void
>>>>>>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>>>>>>>>
>>>>>>>>>>                        }
>>>>>>>>>>
>>>>>>>>>>                }
>>>>>>>>>>
>>>>>>>>>>              }
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +/**
>>>>>>>>>>
>>>>>>>>>>             + * amdgpu_vm_add_fault - Add a page fault record to
>>>>>>>>>> fault hash table
>>>>>>>>>>
>>>>>>>>>>             + *
>>>>>>>>>>
>>>>>>>>>>             + * @fault_hash: fault hash table
>>>>>>>>>>
>>>>>>>>>>             + * @key: 64-bit encoding of PASID and address
>>>>>>>>>>
>>>>>>>>>>             + *
>>>>>>>>>>
>>>>>>>>>>             + * This should be called when a retry page fault
>>>>>>>>>> interrupt is
>>>>>>>>>>
>>>>>>>>>>             + * received. If this is a new page fault, it will be
>>>>>>>>>> added to a hash
>>>>>>>>>>
>>>>>>>>>>             + * table. The return value indicates whether this
>>>>>>>>>> is a
>>>>>>>>>> new fault, or
>>>>>>>>>>
>>>>>>>>>>             + * a fault that was already known and is already
>>>>>>>>>> being
>>>>>>>>>> handled.
>>>>>>>>>>
>>>>>>>>>>             + *
>>>>>>>>>>
>>>>>>>>>>             + * If there are too many pending page faults, this
>>>>>>>>>> will
>>>>>>>>>> fail. Retry
>>>>>>>>>>
>>>>>>>>>>             + * interrupts should be ignored in this case until
>>>>>>>>>> there
>>>>>>>>>> is enough
>>>>>>>>>>
>>>>>>>>>>             + * free space.
>>>>>>>>>>
>>>>>>>>>>             + *
>>>>>>>>>>
>>>>>>>>>>             + * Returns 0 if the fault was added, 1 if the
>>>>>>>>>> fault was
>>>>>>>>>> already known,
>>>>>>>>>>
>>>>>>>>>>             + * -ENOSPC if there are too many pending faults.
>>>>>>>>>>
>>>>>>>>>>             + */
>>>>>>>>>>
>>>>>>>>>>             +int amdgpu_vm_add_fault(struct
>>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>>>>
>>>>>>>>>>             +{
>>>>>>>>>>
>>>>>>>>>>             +  unsigned long flags;
>>>>>>>>>>
>>>>>>>>>>             +  int r = -ENOSPC;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  if (WARN_ON_ONCE(!fault_hash))
>>>>>>>>>>
>>>>>>>>>>             +          /* Should be allocated in amdgpu_vm_init
>>>>>>>>>>
>>>>>>>>>>             +           */
>>>>>>>>>>
>>>>>>>>>>             +          return r;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  /* Only let the hash table fill up to 50% for best
>>>>>>>>>> performance */
>>>>>>>>>>
>>>>>>>>>>             +  if (fault_hash->count >= (1 <<
>>>>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>>>>
>>>>>>>>>>             +          goto unlock_out;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  r = chash_table_copy_in(&fault_hash->hash, key,
>>>>>>>>>> NULL);
>>>>>>>>>>
>>>>>>>>>>             +  if (!r)
>>>>>>>>>>
>>>>>>>>>>             +          fault_hash->count++;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  /* chash_table_copy_in should never fail unless
>>>>>>>>>> we're
>>>>>>>>>> losing count */
>>>>>>>>>>
>>>>>>>>>>             +  WARN_ON_ONCE(r < 0);
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +unlock_out:
>>>>>>>>>>
>>>>>>>>>>             +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>>>>
>>>>>>>>>>             +  return r;
>>>>>>>>>>
>>>>>>>>>>             +}
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +/**
>>>>>>>>>>
>>>>>>>>>>             + * amdgpu_vm_clear_fault - Remove a page fault record
>>>>>>>>>>
>>>>>>>>>>             + *
>>>>>>>>>>
>>>>>>>>>>             + * @fault_hash: fault hash table
>>>>>>>>>>
>>>>>>>>>>             + * @key: 64-bit encoding of PASID and address
>>>>>>>>>>
>>>>>>>>>>             + *
>>>>>>>>>>
>>>>>>>>>>             + * This should be called when a page fault has been
>>>>>>>>>> handled. Any
>>>>>>>>>>
>>>>>>>>>>             + * future interrupt with this key will be
>>>>>>>>>> processed as a
>>>>>>>>>> new
>>>>>>>>>>
>>>>>>>>>>             + * page fault.
>>>>>>>>>>
>>>>>>>>>>             + */
>>>>>>>>>>
>>>>>>>>>>             +void amdgpu_vm_clear_fault(struct
>>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>>>>
>>>>>>>>>>             +{
>>>>>>>>>>
>>>>>>>>>>             +  unsigned long flags;
>>>>>>>>>>
>>>>>>>>>>             +  int r;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  if (!fault_hash)
>>>>>>>>>>
>>>>>>>>>>             +          return;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  r = chash_table_remove(&fault_hash->hash, key,
>>>>>>>>>> NULL);
>>>>>>>>>>
>>>>>>>>>>             +  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>>>>
>>>>>>>>>>             +          fault_hash->count--;
>>>>>>>>>>
>>>>>>>>>>             +          WARN_ON_ONCE(fault_hash->count < 0);
>>>>>>>>>>
>>>>>>>>>>             +  }
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>>>>
>>>>>>>>>>             +}
>>>>>>>>>>
>>>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>>
>>>>>>>>>>             index e275ee7..6eb1da1 100644
>>>>>>>>>>
>>>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>>
>>>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>>
>>>>>>>>>>             @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>>>>>>>>
>>>>>>>>>>                pid_t   tgid;
>>>>>>>>>>
>>>>>>>>>>              };
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>>>>
>>>>>>>>>>             +struct amdgpu_retryfault_hashtable {
>>>>>>>>>>
>>>>>>>>>>             +  DECLARE_CHASH_TABLE(hash,
>>>>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>>>>>> 8, 0);
>>>>>>>>>>
>>>>>>>>>>             +  spinlock_t     lock;
>>>>>>>>>>
>>>>>>>>>>             +  int            count;
>>>>>>>>>>
>>>>>>>>>>             +};
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>              struct amdgpu_vm {
>>>>>>>>>>
>>>>>>>>>>                /* tree of virtual addresses mapped */
>>>>>>>>>>
>>>>>>>>>>                struct rb_root_cached  va;
>>>>>>>>>>
>>>>>>>>>>             @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>>>>>>>>
>>>>>>>>>>                struct ttm_lru_bulk_move lru_bulk_move;
>>>>>>>>>>
>>>>>>>>>>                /* mark whether can do the bulk move */
>>>>>>>>>>
>>>>>>>>>>                bool                   bulk_moveable;
>>>>>>>>>>
>>>>>>>>>>             +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>>>>
>>>>>>>>>>              };
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>              struct amdgpu_vm_manager {
>>>>>>>>>>
>>>>>>>>>>             @@ -355,4 +363,8 @@ void
>>>>>>>>>> amdgpu_vm_set_task_info(struct
>>>>>>>>>> amdgpu_vm *vm);
>>>>>>>>>>
>>>>>>>>>>              void amdgpu_vm_move_to_lru_tail(struct amdgpu_device
>>>>>>>>>> *adev,
>>>>>>>>>>
>>>>>>>>>>                                       struct amdgpu_vm *vm);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             +int amdgpu_vm_add_fault(struct
>>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +void amdgpu_vm_clear_fault(struct
>>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>              #endif
>>>>>>>>>>
>>>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>
>>>>>>>>>>             index 5ae5ed2..acbe5a7 100644
>>>>>>>>>>
>>>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>
>>>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>
>>>>>>>>>>             @@ -265,35 +265,36 @@ static bool
>>>>>>>>>> vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>>>>>>>>
>>>>>>>>>>                        return true;
>>>>>>>>>>
>>>>>>>>>>                }
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>>>>
>>>>>>>>>>             -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>>>>
>>>>>>>>>>             -  r = amdgpu_ih_add_fault(adev, key);
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  /* Hash table is full or the fault is already being
>>>>>>>>>> processed,
>>>>>>>>>>
>>>>>>>>>>             -   * ignore further page faults
>>>>>>>>>>
>>>>>>>>>>             -   */
>>>>>>>>>>
>>>>>>>>>>             -  if (r != 0)
>>>>>>>>>>
>>>>>>>>>>             -          goto ignore_iv;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>                /* Track retry faults in per-VM fault FIFO. */
>>>>>>>>>>
>>>>>>>>>>                spin_lock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>
>>>>>>>>>>                vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>>>>>>>>
>>>>>>>>>>             +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>>>>
>>>>>>>>>>             +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>>>>
>>>>>>>>>>                if (!vm) {
>>>>>>>>>>
>>>>>>>>>>                        /* VM not found, process it normally */
>>>>>>>>>>
>>>>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>
>>>>>>>>>>             -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>>>>
>>>>>>>>>>                        return true;
>>>>>>>>>>
>>>>>>>>>>             +  } else {
>>>>>>>>>>
>>>>>>>>>>             +          r = amdgpu_vm_add_fault(vm->fault_hash,
>>>>>>>>>> key);
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +          /* Hash table is full or the fault is
>>>>>>>>>> already
>>>>>>>>>> being processed,
>>>>>>>>>>
>>>>>>>>>>             +           * ignore further page faults
>>>>>>>>>>
>>>>>>>>>>             +           */
>>>>>>>>>>
>>>>>>>>>>             +          if (r != 0) {
>>>>>>>>>>
>>>>>>>>>>             + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>
>>>>>>>>>>             +                  goto ignore_iv;
>>>>>>>>>>
>>>>>>>>>>             +          }
>>>>>>>>>>
>>>>>>>>>>                }
>>>>>>>>>>
>>>>>>>>>>                /* No locking required with single writer and
>>>>>>>>>> single
>>>>>>>>>> reader */
>>>>>>>>>>
>>>>>>>>>>                r = kfifo_put(&vm->faults, key);
>>>>>>>>>>
>>>>>>>>>>                if (!r) {
>>>>>>>>>>
>>>>>>>>>>                        /* FIFO is full. Ignore it until there is
>>>>>>>>>> space */
>>>>>>>>>>
>>>>>>>>>>             +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>>>>>>>>>>
>>>>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>
>>>>>>>>>>             -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>>>>
>>>>>>>>>>                        goto ignore_iv;
>>>>>>>>>>
>>>>>>>>>>                }
>>>>>>>>>>
>>>>>>>>>>             - spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>
>>>>>>>>>>                /* It's the first fault for this address,
>>>>>>>>>> process it
>>>>>>>>>> normally */
>>>>>>>>>>
>>>>>>>>>>                return true;
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void
>>>>>>>>>> *handle)
>>>>>>>>>>
>>>>>>>>>>                adev->irq.ih.use_doorbell = true;
>>>>>>>>>>
>>>>>>>>>>                adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH
>>>>>>>>>> << 1;
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             -  adev->irq.ih.faults =
>>>>>>>>>> kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>>>>>>>>
>>>>>>>>>>             -  if (!adev->irq.ih.faults)
>>>>>>>>>>
>>>>>>>>>>             -          return -ENOMEM;
>>>>>>>>>>
>>>>>>>>>>             - INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>>>>>>>>
>>>>>>>>>>             -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>>>>
>>>>>>>>>>             - spin_lock_init(&adev->irq.ih.faults->lock);
>>>>>>>>>>
>>>>>>>>>>             -  adev->irq.ih.faults->count = 0;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>                r = amdgpu_irq_init(adev);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>                return r;
>>>>>>>>>>
>>>>>>>>>>             @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void
>>>>>>>>>> *handle)
>>>>>>>>>>
>>>>>>>>>>                amdgpu_irq_fini(adev);
>>>>>>>>>>
>>>>>>>>>>                amdgpu_ih_ring_fini(adev);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             -  kfree(adev->irq.ih.faults);
>>>>>>>>>>
>>>>>>>>>>             -  adev->irq.ih.faults = NULL;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>                return 0;
>>>>>>>>>>
>>>>>>>>>>              }
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>         _______________________________________________
>>>>>>>>>>
>>>>>>>>>>         amd-gfx mailing list
>>>>>>>>>>
>>>>>>>>>>         amd-gfx@lists.freedesktop.org
>>>>>>>>>> <mailto:amd-gfx@lists.freedesktop.org>
>>>>>>>>>>
>>>>>>>>>>         https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>> _______________________________________________
>>>>>>>>> amd-gfx mailing list
>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>> _______________________________________________
>>>>>>> amd-gfx mailing list
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                                                     ` <0fa9086e-49c7-580e-2611-4878c92899f5-5C7GfCeVMHo@public.gmane.org>
@ 2018-09-13  4:45                                                       ` Kuehling, Felix
       [not found]                                                         ` <DM5PR12MB1707D50758797558D34F17A0921A0-2J9CzHegvk9TCtO+SvGBKwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Kuehling, Felix @ 2018-09-13  4:45 UTC (permalink / raw)
  To: Koenig, Christian, Zeng, Oak, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

I think I'm missing something. I thought the whole discussion we were having about tracking current and future locations of page tables was about swapping out page tables that are managed by HMM. We currently do swap out page tables that don't have fence on them and that are not managed by HMM. And pipelining that is not a problem today.

You're raising some potential problems that I think must be about swapping out page tables that don't have fences on them (otherwise they wouldn't be evicted) and that are managed by HMM. If there is no fence on the page table, no engine that can't handle page faults can depend on the page tables. So we can discard the contents and set the parent page directory entry to invalid. Setting the parent PDE needs to be synchronous so that the GPU doesn't try to access a page table that is no longer there. No pipelining, no problem.

Then you came back with an argument "what about engines that don't support page faults". Those engines can't use memory mapped by HMM anyway. And they can't be evicted while they have a fence on them that indicates active usage by such an engine.

You seem to see some problems that require not pipelining page table evictions. I still don't understand what the problem is. I think we should have a phone call to sort this out.

A few more comments inline, but I think we're misunderstanding each other ...

>>>>> 5. The new system memory location of the page table is noted in its BO.
>>>> You mean in the parent page table? You can just invalidate the entry in
>>>> the parent page table and let it fault.
>>> I'm repeating myself, but exactly that is what won't work.
>>>
>>> See we still have engines which can't handle page faults which uses
>>> the same VM at the same time. This means that we can't just fault in
>>> page tables.
>> And I don't understand why that is a problem. Those clients rely on
>> fences to keep their BOs resident, including the page tables. Are you
>> planning to change that?
> 
> No, but how do you want to swap out page tables when there is a fence added?

I don't. That's my point. If the page table has fences on it, it can't be swapped out. So there is no problem.

> 
> Or do you want to stop adding fences to page tables for engines with
> recoverable faults?

Yes, that was my assumption, coming from a KFD perspective where with HMM we want to get away from our eviction fences that enforce BO residency, including page tables. Right now signaling an eviction fence means stopping user mode queues. We want to stop doing that.

If page tables get evicted, that's fine as long as those virtual addresses aren't accessed. Once an access happens, the page table needs to be restored or recreated. Once that's done, the retrying hardware block will get a successful translation and continue executing.

> 
> I think that is completely unrealistic considering the performance
> penalty of faults.

I agree that evicting a page table that's in active use is bad. For amdgpu_cs you can prevent that with a fence, no problem.

But a fence is not a good way to prevent that for KFD, because it forces us to keep using our eviction fence and preempting user mode queues on evictions. You see, the eviction fence does not prevent the page table from being evicted, but it forces us to preempt all our queues when an eviction happens. That is a worse performance penalty than dealing with a page fault because the page fault is much more selective. A page fault can interrupt a compute shader at wave granularity. A preemption interrupts compute shaders with process granularity and at much longer time scales.

For KFD I would try to find a better way to avoid evictions of page tables, maybe by bumping them up in the LRU at appropriate times. But even without any such improvements, page faults are still better for KFD than the current eviction-fence-based approach.

Regards,
  Felix

> 
> At least for currently available hardware we should limit page faults to
> be used in as few cases as possible, e.g. SVM and userptr.
> 
> Regards,
> Christian.
________________________________________
From: Koenig, Christian
Sent: Wednesday, September 12, 2018 11:59 AM
To: Kuehling, Felix; Zeng, Oak; Oak Zeng; amd-gfx@lists.freedesktop.org; Yang, Philip
Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm

Am 12.09.2018 um 17:29 schrieb Felix Kuehling:
> On 2018-09-12 02:56 AM, Christian König wrote:
>> Am 12.09.2018 um 00:00 schrieb Felix Kuehling:
>>> On 2018-09-11 03:19 AM, Christian König wrote:
>>>> Hi Felix,
>>>>
>>>> let me try to explain the problem on an example:
>>>>
>>>> 1. We have a running job which needs recoverable page faults for
>>>> accessing the process address space.
>>>> 2. We run into memory pressure on VRAM and start to evict things.
>>>> 3. A page tables of the running job is picked up for eviction.
>>>> 4. We schedule a copy command to move the content of the page table to
>>>> system memory.
>>> I think we should change this. If you evict a page table, you don't need
>>> to preserve its contents. You should be able to restore the page table
>>> contents from scratch when you handle the page fault that restores it.
>> Yeah, already implemented. You actually don't need the page fault for
>> that.
>>
>>>> 5. The new system memory location of the page table is noted in its BO.
>>> You mean in the parent page table? You can just invalidate the entry in
>>> the parent page table and let it fault.
>> I'm repeating myself, but exactly that is what won't work.
>>
>> See we still have engines which can't handle page faults which uses
>> the same VM at the same time. This means that we can't just fault in
>> page tables.
> And I don't understand why that is a problem. Those clients rely on
> fences to keep their BOs resident, including the page tables. Are you
> planning to change that?

No, but how do you want to swap out page tables when there is a fence added?

Or do you want to stop adding fences to page tables for engines with
recoverable faults?

I think that is completely unrealistic considering the performance
penalty of faults.

At least for currently available hardware we should limit page faults to
be used in as few cases as possible, e.g. SVM and userptr.

Regards,
Christian.

>
> Regards,
>    Felix
>
>> What we could do is to separate the address space into a low and a
>> high range where we have different handling for both.
>>
>> I've already prototyped this and prepared mostly everything necessary
>> for that, but that is still not 100% completed.
>>
>> Regards,
>> Christian.
>>
>>>> 6. We get a fault from the job and swap in the page from the process
>>>> address space.
>>> No. Then you need to keep updating page tables that are swapped out.
>>> Instead just discard them and rebuild on fault.
>>>
>>>> 7. Now we need to enter the new page address into the page table ->
>>>> *BAM*
>>>>
>>>> The problem is now that we don't know the address of the page table
>>>> because the current location was replaced with the future location in
>>>> step #5.
>>> You solve that problem by discarding page tables that are swapped out.
>>> Then there is no future location.
>>>
>>>> We could now argue that we could update the page tables on the fly for
>>>> the evicted page and never wait for the current job to finish, but
>>>> this won't work because not all engines can handle that.
>>> Engines that can't handle page faults will depend on fences to keep
>>> their page tables resident, like we do today. So they won't have this
>>> problem.
>>>
>>> Regards,
>>>     Felix
>>>
>>>> I will circumvent this problem for now by blocking the eviction before
>>>> step #5. The issue with that is that this pipelining is responsible
>>>> for nearly 20% of the fps in some testcases.
>>>>
>>>> I hope that it won't be that bad when I limit disabling the pipelining
>>>> to only page tables, but it is certainly possible that we will get
>>>> pushback on this.
>>>>
>>>> If that happens we probably need to add some flags to limit this
>>>> workaround even more to only the root PD and all the PDs/PTs which are
>>>> involved in recoverable faults.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 10.09.2018 um 21:10 schrieb Felix Kuehling:
>>>>> I'm not sure why you need to distinguish current and future state when
>>>>> dealing with page faults. When you get a page fault, you know that the
>>>>> GPU is trying to access memory right now, in the present. So you're
>>>>> always working with the current state. When the CPU page table
>>>>> changes,
>>>>> you get an MMU notifier that allows you to invalidate the
>>>>> corresponding
>>>>> GPU PTEs. If you have a valid GPU PTE, it always represents the
>>>>> current
>>>>> states and is in sync with the CPU page table. If the GPU page
>>>>> table is
>>>>> ever outdated, it should have an invalid entry in that place.
>>>>>
>>>>> If SDMA is used to update the GPU PTEs, there is a delay. The MMU
>>>>> notifier is synchronous, so it shouldn't be a problem. You just
>>>>> wait for
>>>>> the SDMA job to complete before returning. When updating PTEs with new
>>>>> valid addresses, it's asynchronous. But the GPU will continue retrying
>>>>> on the invalid entry until SDMA finishes. So it's also implicitly
>>>>> synchronized on the GPU side.
>>>>>
>>>>> Regards,
>>>>>      Felix
>>>>>
>>>>>
>>>>> On 2018-09-10 05:42 AM, Christian König wrote:
>>>>>> Hi Felix & Oak,
>>>>>>
>>>>>> over the weekend I had the idea that we could just use the shadow BOs
>>>>>> to have the current state in a page fault. They are GTT BOs and CPU
>>>>>> accessible anyway.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 08.09.2018 um 09:34 schrieb Christian König:
>>>>>>> Hi Felix,
>>>>>>>
>>>>>>>> But why do you want to update page tables when there is no more
>>>>>>>> user
>>>>>>>> mode context that cares about them? Is this just to allow pending
>>>>>>>> work
>>>>>>>> to complete after the user mode context is gone? To prevent hangs?
>>>>>>> The problem I'm see is that we still don't have a reliable way of
>>>>>>> killing a command submission while making sure that no more
>>>>>>> faults of
>>>>>>> it are in the flight.
>>>>>>>
>>>>>>> E.g. the HWS has a function to kill a running job on the hardware,
>>>>>>> but that only works for compute and there is no way of telling when
>>>>>>> that operation has finished.
>>>>>>>
>>>>>>> But Oak has already convinced me that we should probably work on
>>>>>>> that
>>>>>>> instead of trying to keep the VM alive as long as possible.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
>>>>>>>> Hi Christian,
>>>>>>>>
>>>>>>>> I'm not sure I get your point here.
>>>>>>>>
>>>>>>>> As I understand it, the amdgpu_vm structure represents the user
>>>>>>>> mode VM
>>>>>>>> context. It has the pointers to the root page directory and the
>>>>>>>> rest of
>>>>>>>> the page table hierarchy. If there is no amdgpu_vm, there is no
>>>>>>>> user
>>>>>>>> mode context that cares about the page tables.
>>>>>>>>
>>>>>>>> If we leave the page table pointers in the amdgpu_vm, then handling
>>>>>>>> faults after the VM is destroyed is pointless. We can't find the
>>>>>>>> page
>>>>>>>> tables and we can't update them, so there is nothing we can do in
>>>>>>>> response to VM faults.
>>>>>>>>
>>>>>>>> So I'm guessing that you want to move the page table pointers
>>>>>>>> into a
>>>>>>>> different structure that exists half-independently of the VM. It
>>>>>>>> would
>>>>>>>> be created when the VM is created (that's where we currently
>>>>>>>> allocate
>>>>>>>> the PASID) but can be destroyed slightly later.
>>>>>>>>
>>>>>>>> But why do you want to update page tables when there is no more
>>>>>>>> user
>>>>>>>> mode context that cares about them? Is this just to allow pending
>>>>>>>> work
>>>>>>>> to complete after the user mode context is gone? To prevent hangs?
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>>       Felix
>>>>>>>>
>>>>>>>> On 2018-09-10 07:20 AM, Christian König wrote:
>>>>>>>>>> Am I right that the handling of page fault need a valid VM?
>>>>>>>>> No, exactly that view is incorrect.
>>>>>>>>>
>>>>>>>>> The VM is meant to be a memory management structure of page
>>>>>>>>> tables and
>>>>>>>>> is completely pointless fault processing because it represent the
>>>>>>>>> future state of the page tables and not the current one.
>>>>>>>>>
>>>>>>>>> What we need for fault processing is a new structure build around
>>>>>>>>> PASIDs which is feed by the with addresses when page tables are
>>>>>>>>> moved
>>>>>>>>> around.
>>>>>>>>>
>>>>>>>>> Alternatively I hope that we can use the SDMA to walk the page
>>>>>>>>> tables
>>>>>>>>> and update the required entries by just using the address.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>>>>>>>>> Hi Christian,
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> What is the value of delay the destroy of hash to after vm is
>>>>>>>>>> destroyed? Since vm is destroyed, you basically don’t have enough
>>>>>>>>>> information to paging in the correct page to gpuvm. Am I right
>>>>>>>>>> that
>>>>>>>>>> the handling of page fault need a valid VM? For example, you
>>>>>>>>>> need the
>>>>>>>>>> VM to get the corresponding physical address of the faulty page.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> After vm is destroyed, the retry interrupt can be directly
>>>>>>>>>> discard as
>>>>>>>>>> you can’t find vm through pasid. You can think this is also one
>>>>>>>>>> kind
>>>>>>>>>> of prescreen.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> So I am stand for the point that, there is no need to delay the
>>>>>>>>>> destroy of hash to after vm is destroyed. Prescreening hash
>>>>>>>>>> can be
>>>>>>>>>> destroyed at the time of vm_fini.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Thanks,
>>>>>>>>>>
>>>>>>>>>> Oak
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>>>>>>>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng
>>>>>>>>>> <zengshanjun@gmail.com>;
>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>>>>>>>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>>>>>>>>>> amdgpu vm
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Hi Oak,
>>>>>>>>>>
>>>>>>>>>> yeah, but this is still a NAK. Making the hash per PASID was
>>>>>>>>>> not a
>>>>>>>>>> suggestion but rather a must have.
>>>>>>>>>>
>>>>>>>>>> The VM structures must be destroyed while the processing is still
>>>>>>>>>> ongoing, or otherwise we would not have a clean OOM handling.
>>>>>>>>>>
>>>>>>>>>> The IDR for PASID lockup can be put into amdgpu_ids.c, you can
>>>>>>>>>> just
>>>>>>>>>> replace the IDA already used there with an IDR.
>>>>>>>>>>
>>>>>>>>>> Since the PASID is already freed up delayed we should have the
>>>>>>>>>> grace
>>>>>>>>>> period for interrupt processing included. If that still isn't
>>>>>>>>>> sufficient we can still add some delayed work for this.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>>>>>>>>
>>>>>>>>>>         Hi Christian,
>>>>>>>>>>
>>>>>>>>>>         In this implementation, fault hash is made per vm, not per
>>>>>>>>>> pasid
>>>>>>>>>>         as suggested, based on below considerations:
>>>>>>>>>>
>>>>>>>>>>           * Delay the destroy of hash introduces more effort like
>>>>>>>>>> how to
>>>>>>>>>>             set the proper grace period after which no retry
>>>>>>>>>> interrupt
>>>>>>>>>>             will be generated. We want to avoid those complication
>>>>>>>>>> if we
>>>>>>>>>>             can.
>>>>>>>>>>           * The  problem of the per vm implementation is that it
>>>>>>>>>> is hard
>>>>>>>>>>             to delay the destroy of fault hash, because when vm is
>>>>>>>>>>             destroyed, prescreen function won't be able to
>>>>>>>>>> retrieve the
>>>>>>>>>>             fault hash. But in this case, the prescreen
>>>>>>>>>> function can
>>>>>>>>>>             either let the interrupt go through (to gmc) or ignore
>>>>>>>>>> it.
>>>>>>>>>>             From this perspective, it is not very necessary to
>>>>>>>>>> delay the
>>>>>>>>>>             destroy of hash table.
>>>>>>>>>>           * On the other hand, since ASICs after vega10 have the
>>>>>>>>>> HW CAM
>>>>>>>>>>             feature. So the SW IV prescreen is only useful for
>>>>>>>>>> vega10.
>>>>>>>>>>             For all the HW implemented CAM, we can consider the
>>>>>>>>>> delayed
>>>>>>>>>>             CAM acknowledgment.
>>>>>>>>>>           * Making it per pasid need to introduce another idr to
>>>>>>>>>>             correlate pasid and hash table - where to put the idr?
>>>>>>>>>> You
>>>>>>>>>>             will have to make it a global variable which is not
>>>>>>>>>> very
>>>>>>>>>>             desirable.
>>>>>>>>>>
>>>>>>>>>>         The main purpose of this patch is to solve the
>>>>>>>>>> inter-process
>>>>>>>>>> lock
>>>>>>>>>>         issue (when hash table is full), while still keep codes
>>>>>>>>>> simple.
>>>>>>>>>>
>>>>>>>>>>         Also with this patch, the faults kfifo is not necessary
>>>>>>>>>> any
>>>>>>>>>> more.
>>>>>>>>>>         See patch 2.
>>>>>>>>>>
>>>>>>>>>>         Regards,
>>>>>>>>>>
>>>>>>>>>>         Oak
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>         On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>>>>>>>>
>>>>>>>>>>             In stead of share one fault hash table per device,
>>>>>>>>>> make it
>>>>>>>>>>
>>>>>>>>>>             per vm. This can avoid inter-process lock issue when
>>>>>>>>>> fault
>>>>>>>>>>
>>>>>>>>>>             hash table is full.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>>>>>>>>
>>>>>>>>>>             Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>>>>>>>>>> <mailto:Oak.Zeng@amd.com>
>>>>>>>>>>
>>>>>>>>>>             Suggested-by: Christian Konig
>>>>>>>>>> <Christian.Koenig@amd.com>
>>>>>>>>>> <mailto:Christian.Koenig@amd.com>
>>>>>>>>>>
>>>>>>>>>>             Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>>>>>>> <mailto:Felix.Kuehling@amd.com>
>>>>>>>>>>
>>>>>>>>>>             ---
>>>>>>>>>>
>>>>>>>>>>              drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75
>>>>>>>>>> ------------------------
>>>>>>>>>>
>>>>>>>>>>              drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>>>>>>>>
>>>>>>>>>>              drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102
>>>>>>>>>> ++++++++++++++++++++++++++++++++-
>>>>>>>>>>
>>>>>>>>>>              drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>>>>>>>>
>>>>>>>>>>              drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38
>>>>>>>>>> +++++-------
>>>>>>>>>>
>>>>>>>>>>              5 files changed, 127 insertions(+), 110 deletions(-)
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>>
>>>>>>>>>>             index 06373d4..4ed8621 100644
>>>>>>>>>>
>>>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>>
>>>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>>
>>>>>>>>>>             @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct
>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>
>>>>>>>>>>                return IRQ_HANDLED;
>>>>>>>>>>
>>>>>>>>>>              }
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             -/**
>>>>>>>>>>
>>>>>>>>>>             - * amdgpu_ih_add_fault - Add a page fault record
>>>>>>>>>>
>>>>>>>>>>             - *
>>>>>>>>>>
>>>>>>>>>>             - * @adev: amdgpu device pointer
>>>>>>>>>>
>>>>>>>>>>             - * @key: 64-bit encoding of PASID and address
>>>>>>>>>>
>>>>>>>>>>             - *
>>>>>>>>>>
>>>>>>>>>>             - * This should be called when a retry page fault
>>>>>>>>>> interrupt is
>>>>>>>>>>
>>>>>>>>>>             - * received. If this is a new page fault, it will be
>>>>>>>>>> added to a hash
>>>>>>>>>>
>>>>>>>>>>             - * table. The return value indicates whether this
>>>>>>>>>> is a
>>>>>>>>>> new fault, or
>>>>>>>>>>
>>>>>>>>>>             - * a fault that was already known and is already
>>>>>>>>>> being
>>>>>>>>>> handled.
>>>>>>>>>>
>>>>>>>>>>             - *
>>>>>>>>>>
>>>>>>>>>>             - * If there are too many pending page faults, this
>>>>>>>>>> will
>>>>>>>>>> fail. Retry
>>>>>>>>>>
>>>>>>>>>>             - * interrupts should be ignored in this case until
>>>>>>>>>> there
>>>>>>>>>> is enough
>>>>>>>>>>
>>>>>>>>>>             - * free space.
>>>>>>>>>>
>>>>>>>>>>             - *
>>>>>>>>>>
>>>>>>>>>>             - * Returns 0 if the fault was added, 1 if the
>>>>>>>>>> fault was
>>>>>>>>>> already known,
>>>>>>>>>>
>>>>>>>>>>             - * -ENOSPC if there are too many pending faults.
>>>>>>>>>>
>>>>>>>>>>             - */
>>>>>>>>>>
>>>>>>>>>>             -int amdgpu_ih_add_fault(struct amdgpu_device
>>>>>>>>>> *adev, u64
>>>>>>>>>> key)
>>>>>>>>>>
>>>>>>>>>>             -{
>>>>>>>>>>
>>>>>>>>>>             -  unsigned long flags;
>>>>>>>>>>
>>>>>>>>>>             -  int r = -ENOSPC;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>>>>>>>>
>>>>>>>>>>             -          /* Should be allocated in
>>>>>>>>>> <IP>_ih_sw_init on
>>>>>>>>>> GPUs that
>>>>>>>>>>
>>>>>>>>>>             -           * support retry faults and require retry
>>>>>>>>>> filtering.
>>>>>>>>>>
>>>>>>>>>>             -           */
>>>>>>>>>>
>>>>>>>>>>             -          return r;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             - spin_lock_irqsave(&adev->irq.ih.faults->lock,
>>>>>>>>>> flags);
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  /* Only let the hash table fill up to 50% for best
>>>>>>>>>> performance */
>>>>>>>>>>
>>>>>>>>>>             -  if (adev->irq.ih.faults->count >= (1 <<
>>>>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>>>>
>>>>>>>>>>             -          goto unlock_out;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  r = chash_table_copy_in(&adev->irq.ih.faults->hash,
>>>>>>>>>> key, NULL);
>>>>>>>>>>
>>>>>>>>>>             -  if (!r)
>>>>>>>>>>
>>>>>>>>>>             -          adev->irq.ih.faults->count++;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  /* chash_table_copy_in should never fail unless
>>>>>>>>>> we're
>>>>>>>>>> losing count */
>>>>>>>>>>
>>>>>>>>>>             -  WARN_ON_ONCE(r < 0);
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -unlock_out:
>>>>>>>>>>
>>>>>>>>>>             - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>>>>> flags);
>>>>>>>>>>
>>>>>>>>>>             -  return r;
>>>>>>>>>>
>>>>>>>>>>             -}
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -/**
>>>>>>>>>>
>>>>>>>>>>             - * amdgpu_ih_clear_fault - Remove a page fault record
>>>>>>>>>>
>>>>>>>>>>             - *
>>>>>>>>>>
>>>>>>>>>>             - * @adev: amdgpu device pointer
>>>>>>>>>>
>>>>>>>>>>             - * @key: 64-bit encoding of PASID and address
>>>>>>>>>>
>>>>>>>>>>             - *
>>>>>>>>>>
>>>>>>>>>>             - * This should be called when a page fault has been
>>>>>>>>>> handled. Any
>>>>>>>>>>
>>>>>>>>>>             - * future interrupt with this key will be
>>>>>>>>>> processed as a
>>>>>>>>>> new
>>>>>>>>>>
>>>>>>>>>>             - * page fault.
>>>>>>>>>>
>>>>>>>>>>             - */
>>>>>>>>>>
>>>>>>>>>>             -void amdgpu_ih_clear_fault(struct amdgpu_device
>>>>>>>>>> *adev,
>>>>>>>>>> u64 key)
>>>>>>>>>>
>>>>>>>>>>             -{
>>>>>>>>>>
>>>>>>>>>>             -  unsigned long flags;
>>>>>>>>>>
>>>>>>>>>>             -  int r;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  if (!adev->irq.ih.faults)
>>>>>>>>>>
>>>>>>>>>>             -          return;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             - spin_lock_irqsave(&adev->irq.ih.faults->lock,
>>>>>>>>>> flags);
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  r = chash_table_remove(&adev->irq.ih.faults->hash,
>>>>>>>>>> key, NULL);
>>>>>>>>>>
>>>>>>>>>>             -  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>>>>
>>>>>>>>>>             -          adev->irq.ih.faults->count--;
>>>>>>>>>>
>>>>>>>>>>             - WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>>>>>>>>
>>>>>>>>>>             -  }
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>>>>> flags);
>>>>>>>>>>
>>>>>>>>>>             -}
>>>>>>>>>>
>>>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>>
>>>>>>>>>>             index a23e1c0..f411ffb 100644
>>>>>>>>>>
>>>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>>
>>>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>>
>>>>>>>>>>             @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>>>>>>>>
>>>>>>>>>>              #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>>>>>>>>
>>>>>>>>>>              #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>>>>
>>>>>>>>>>             -struct amdgpu_retryfault_hashtable {
>>>>>>>>>>
>>>>>>>>>>             -  DECLARE_CHASH_TABLE(hash,
>>>>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>>>>>> 8, 0);
>>>>>>>>>>
>>>>>>>>>>             -  spinlock_t     lock;
>>>>>>>>>>
>>>>>>>>>>             -  int            count;
>>>>>>>>>>
>>>>>>>>>>             -};
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>              /*
>>>>>>>>>>
>>>>>>>>>>               * R6xx+ IH ring
>>>>>>>>>>
>>>>>>>>>>               */
>>>>>>>>>>
>>>>>>>>>>             @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>>>>>>>>
>>>>>>>>>>                bool                   use_doorbell;
>>>>>>>>>>
>>>>>>>>>>                bool                   use_bus_addr;
>>>>>>>>>>
>>>>>>>>>>                dma_addr_t             rb_dma_addr; /* only used
>>>>>>>>>> when
>>>>>>>>>> use_bus_addr = true */
>>>>>>>>>>
>>>>>>>>>>             -  struct amdgpu_retryfault_hashtable *faults;
>>>>>>>>>>
>>>>>>>>>>              };
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>              #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>>>>>>>>
>>>>>>>>>>             @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct
>>>>>>>>>> amdgpu_device *adev, unsigned ring_size,
>>>>>>>>>>
>>>>>>>>>>                                bool use_bus_addr);
>>>>>>>>>>
>>>>>>>>>>              void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>>>>>>>>>
>>>>>>>>>>              int amdgpu_ih_process(struct amdgpu_device *adev);
>>>>>>>>>>
>>>>>>>>>>             -int amdgpu_ih_add_fault(struct amdgpu_device
>>>>>>>>>> *adev, u64
>>>>>>>>>> key);
>>>>>>>>>>
>>>>>>>>>>             -void amdgpu_ih_clear_fault(struct amdgpu_device
>>>>>>>>>> *adev,
>>>>>>>>>> u64 key);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>              #endif
>>>>>>>>>>
>>>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>>
>>>>>>>>>>             index 1d7e3c1..8b220e9 100644
>>>>>>>>>>
>>>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>>
>>>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>>
>>>>>>>>>>             @@ -2692,6 +2692,22 @@ void
>>>>>>>>>> amdgpu_vm_adjust_size(struct
>>>>>>>>>> amdgpu_device *adev, uint32_t min_vm_size,
>>>>>>>>>>
>>>>>>>>>>                         adev->vm_manager.fragment_size);
>>>>>>>>>>
>>>>>>>>>>              }
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             +static struct amdgpu_retryfault_hashtable
>>>>>>>>>> *init_fault_hash(void)
>>>>>>>>>>
>>>>>>>>>>             +{
>>>>>>>>>>
>>>>>>>>>>             +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  fault_hash = kmalloc(sizeof(*fault_hash),
>>>>>>>>>> GFP_KERNEL);
>>>>>>>>>>
>>>>>>>>>>             +  if (!fault_hash)
>>>>>>>>>>
>>>>>>>>>>             +          return fault_hash;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  INIT_CHASH_TABLE(fault_hash->hash,
>>>>>>>>>>
>>>>>>>>>>             +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>>>>
>>>>>>>>>>             +  spin_lock_init(&fault_hash->lock);
>>>>>>>>>>
>>>>>>>>>>             +  fault_hash->count = 0;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  return fault_hash;
>>>>>>>>>>
>>>>>>>>>>             +}
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>              /**
>>>>>>>>>>
>>>>>>>>>>               * amdgpu_vm_init - initialize a vm instance
>>>>>>>>>>
>>>>>>>>>>               *
>>>>>>>>>>
>>>>>>>>>>             @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct
>>>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>>>>>>>
>>>>>>>>>>                        vm->pasid = pasid;
>>>>>>>>>>
>>>>>>>>>>                }
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             +  vm->fault_hash = init_fault_hash();
>>>>>>>>>>
>>>>>>>>>>             +  if (!vm->fault_hash) {
>>>>>>>>>>
>>>>>>>>>>             +          r = -ENOMEM;
>>>>>>>>>>
>>>>>>>>>>             +          goto error_free_root;
>>>>>>>>>>
>>>>>>>>>>             +  }
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>                INIT_KFIFO(vm->faults);
>>>>>>>>>>
>>>>>>>>>>                vm->fault_credit = 16;
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct
>>>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>                /* Clear pending page faults from IH when the VM is
>>>>>>>>>> destroyed */
>>>>>>>>>>
>>>>>>>>>>                while (kfifo_get(&vm->faults, &fault))
>>>>>>>>>>
>>>>>>>>>>             -          amdgpu_ih_clear_fault(adev, fault);
>>>>>>>>>>
>>>>>>>>>>             +          amdgpu_vm_clear_fault(vm->fault_hash,
>>>>>>>>>> fault);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>                if (vm->pasid) {
>>>>>>>>>>
>>>>>>>>>>                        unsigned long flags;
>>>>>>>>>>
>>>>>>>>>>             @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct
>>>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>>>>
>>>>>>>>>> spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>>>>>>>>
>>>>>>>>>>                }
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             +  kfree(vm->fault_hash);
>>>>>>>>>>
>>>>>>>>>>             +  vm->fault_hash = NULL;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>                drm_sched_entity_destroy(&vm->entity);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>                if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>>>>>>>>
>>>>>>>>>>             @@ -3183,3 +3208,78 @@ void
>>>>>>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>>>>>>>>
>>>>>>>>>>                        }
>>>>>>>>>>
>>>>>>>>>>                }
>>>>>>>>>>
>>>>>>>>>>              }
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +/**
>>>>>>>>>>
>>>>>>>>>>             + * amdgpu_vm_add_fault - Add a page fault record to
>>>>>>>>>> fault hash table
>>>>>>>>>>
>>>>>>>>>>             + *
>>>>>>>>>>
>>>>>>>>>>             + * @fault_hash: fault hash table
>>>>>>>>>>
>>>>>>>>>>             + * @key: 64-bit encoding of PASID and address
>>>>>>>>>>
>>>>>>>>>>             + *
>>>>>>>>>>
>>>>>>>>>>             + * This should be called when a retry page fault
>>>>>>>>>> interrupt is
>>>>>>>>>>
>>>>>>>>>>             + * received. If this is a new page fault, it will be
>>>>>>>>>> added to a hash
>>>>>>>>>>
>>>>>>>>>>             + * table. The return value indicates whether this
>>>>>>>>>> is a
>>>>>>>>>> new fault, or
>>>>>>>>>>
>>>>>>>>>>             + * a fault that was already known and is already
>>>>>>>>>> being
>>>>>>>>>> handled.
>>>>>>>>>>
>>>>>>>>>>             + *
>>>>>>>>>>
>>>>>>>>>>             + * If there are too many pending page faults, this
>>>>>>>>>> will
>>>>>>>>>> fail. Retry
>>>>>>>>>>
>>>>>>>>>>             + * interrupts should be ignored in this case until
>>>>>>>>>> there
>>>>>>>>>> is enough
>>>>>>>>>>
>>>>>>>>>>             + * free space.
>>>>>>>>>>
>>>>>>>>>>             + *
>>>>>>>>>>
>>>>>>>>>>             + * Returns 0 if the fault was added, 1 if the
>>>>>>>>>> fault was
>>>>>>>>>> already known,
>>>>>>>>>>
>>>>>>>>>>             + * -ENOSPC if there are too many pending faults.
>>>>>>>>>>
>>>>>>>>>>             + */
>>>>>>>>>>
>>>>>>>>>>             +int amdgpu_vm_add_fault(struct
>>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>>>>
>>>>>>>>>>             +{
>>>>>>>>>>
>>>>>>>>>>             +  unsigned long flags;
>>>>>>>>>>
>>>>>>>>>>             +  int r = -ENOSPC;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  if (WARN_ON_ONCE(!fault_hash))
>>>>>>>>>>
>>>>>>>>>>             +          /* Should be allocated in amdgpu_vm_init
>>>>>>>>>>
>>>>>>>>>>             +           */
>>>>>>>>>>
>>>>>>>>>>             +          return r;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  /* Only let the hash table fill up to 50% for best
>>>>>>>>>> performance */
>>>>>>>>>>
>>>>>>>>>>             +  if (fault_hash->count >= (1 <<
>>>>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>>>>
>>>>>>>>>>             +          goto unlock_out;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  r = chash_table_copy_in(&fault_hash->hash, key,
>>>>>>>>>> NULL);
>>>>>>>>>>
>>>>>>>>>>             +  if (!r)
>>>>>>>>>>
>>>>>>>>>>             +          fault_hash->count++;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  /* chash_table_copy_in should never fail unless
>>>>>>>>>> we're
>>>>>>>>>> losing count */
>>>>>>>>>>
>>>>>>>>>>             +  WARN_ON_ONCE(r < 0);
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +unlock_out:
>>>>>>>>>>
>>>>>>>>>>             +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>>>>
>>>>>>>>>>             +  return r;
>>>>>>>>>>
>>>>>>>>>>             +}
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +/**
>>>>>>>>>>
>>>>>>>>>>             + * amdgpu_vm_clear_fault - Remove a page fault record
>>>>>>>>>>
>>>>>>>>>>             + *
>>>>>>>>>>
>>>>>>>>>>             + * @fault_hash: fault hash table
>>>>>>>>>>
>>>>>>>>>>             + * @key: 64-bit encoding of PASID and address
>>>>>>>>>>
>>>>>>>>>>             + *
>>>>>>>>>>
>>>>>>>>>>             + * This should be called when a page fault has been
>>>>>>>>>> handled. Any
>>>>>>>>>>
>>>>>>>>>>             + * future interrupt with this key will be
>>>>>>>>>> processed as a
>>>>>>>>>> new
>>>>>>>>>>
>>>>>>>>>>             + * page fault.
>>>>>>>>>>
>>>>>>>>>>             + */
>>>>>>>>>>
>>>>>>>>>>             +void amdgpu_vm_clear_fault(struct
>>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>>>>
>>>>>>>>>>             +{
>>>>>>>>>>
>>>>>>>>>>             +  unsigned long flags;
>>>>>>>>>>
>>>>>>>>>>             +  int r;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  if (!fault_hash)
>>>>>>>>>>
>>>>>>>>>>             +          return;
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  r = chash_table_remove(&fault_hash->hash, key,
>>>>>>>>>> NULL);
>>>>>>>>>>
>>>>>>>>>>             +  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>>>>
>>>>>>>>>>             +          fault_hash->count--;
>>>>>>>>>>
>>>>>>>>>>             +          WARN_ON_ONCE(fault_hash->count < 0);
>>>>>>>>>>
>>>>>>>>>>             +  }
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>>>>
>>>>>>>>>>             +}
>>>>>>>>>>
>>>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>>
>>>>>>>>>>             index e275ee7..6eb1da1 100644
>>>>>>>>>>
>>>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>>
>>>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>>
>>>>>>>>>>             @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>>>>>>>>
>>>>>>>>>>                pid_t   tgid;
>>>>>>>>>>
>>>>>>>>>>              };
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>>>>
>>>>>>>>>>             +struct amdgpu_retryfault_hashtable {
>>>>>>>>>>
>>>>>>>>>>             +  DECLARE_CHASH_TABLE(hash,
>>>>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>>>>>> 8, 0);
>>>>>>>>>>
>>>>>>>>>>             +  spinlock_t     lock;
>>>>>>>>>>
>>>>>>>>>>             +  int            count;
>>>>>>>>>>
>>>>>>>>>>             +};
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>              struct amdgpu_vm {
>>>>>>>>>>
>>>>>>>>>>                /* tree of virtual addresses mapped */
>>>>>>>>>>
>>>>>>>>>>                struct rb_root_cached  va;
>>>>>>>>>>
>>>>>>>>>>             @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>>>>>>>>
>>>>>>>>>>                struct ttm_lru_bulk_move lru_bulk_move;
>>>>>>>>>>
>>>>>>>>>>                /* mark whether can do the bulk move */
>>>>>>>>>>
>>>>>>>>>>                bool                   bulk_moveable;
>>>>>>>>>>
>>>>>>>>>>             +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>>>>
>>>>>>>>>>              };
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>              struct amdgpu_vm_manager {
>>>>>>>>>>
>>>>>>>>>>             @@ -355,4 +363,8 @@ void
>>>>>>>>>> amdgpu_vm_set_task_info(struct
>>>>>>>>>> amdgpu_vm *vm);
>>>>>>>>>>
>>>>>>>>>>              void amdgpu_vm_move_to_lru_tail(struct amdgpu_device
>>>>>>>>>> *adev,
>>>>>>>>>>
>>>>>>>>>>                                       struct amdgpu_vm *vm);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             +int amdgpu_vm_add_fault(struct
>>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +void amdgpu_vm_clear_fault(struct
>>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>              #endif
>>>>>>>>>>
>>>>>>>>>>             diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>
>>>>>>>>>>             index 5ae5ed2..acbe5a7 100644
>>>>>>>>>>
>>>>>>>>>>             --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>
>>>>>>>>>>             +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>
>>>>>>>>>>             @@ -265,35 +265,36 @@ static bool
>>>>>>>>>> vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>>>>>>>>
>>>>>>>>>>                        return true;
>>>>>>>>>>
>>>>>>>>>>                }
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>>>>
>>>>>>>>>>             -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>>>>
>>>>>>>>>>             -  r = amdgpu_ih_add_fault(adev, key);
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>             -  /* Hash table is full or the fault is already being
>>>>>>>>>> processed,
>>>>>>>>>>
>>>>>>>>>>             -   * ignore further page faults
>>>>>>>>>>
>>>>>>>>>>             -   */
>>>>>>>>>>
>>>>>>>>>>             -  if (r != 0)
>>>>>>>>>>
>>>>>>>>>>             -          goto ignore_iv;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>                /* Track retry faults in per-VM fault FIFO. */
>>>>>>>>>>
>>>>>>>>>>                spin_lock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>
>>>>>>>>>>                vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>>>>>>>>
>>>>>>>>>>             +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>>>>
>>>>>>>>>>             +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>>>>
>>>>>>>>>>                if (!vm) {
>>>>>>>>>>
>>>>>>>>>>                        /* VM not found, process it normally */
>>>>>>>>>>
>>>>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>
>>>>>>>>>>             -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>>>>
>>>>>>>>>>                        return true;
>>>>>>>>>>
>>>>>>>>>>             +  } else {
>>>>>>>>>>
>>>>>>>>>>             +          r = amdgpu_vm_add_fault(vm->fault_hash,
>>>>>>>>>> key);
>>>>>>>>>>
>>>>>>>>>>             +
>>>>>>>>>>
>>>>>>>>>>             +          /* Hash table is full or the fault is
>>>>>>>>>> already
>>>>>>>>>> being processed,
>>>>>>>>>>
>>>>>>>>>>             +           * ignore further page faults
>>>>>>>>>>
>>>>>>>>>>             +           */
>>>>>>>>>>
>>>>>>>>>>             +          if (r != 0) {
>>>>>>>>>>
>>>>>>>>>>             + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>
>>>>>>>>>>             +                  goto ignore_iv;
>>>>>>>>>>
>>>>>>>>>>             +          }
>>>>>>>>>>
>>>>>>>>>>                }
>>>>>>>>>>
>>>>>>>>>>                /* No locking required with single writer and
>>>>>>>>>> single
>>>>>>>>>> reader */
>>>>>>>>>>
>>>>>>>>>>                r = kfifo_put(&vm->faults, key);
>>>>>>>>>>
>>>>>>>>>>                if (!r) {
>>>>>>>>>>
>>>>>>>>>>                        /* FIFO is full. Ignore it until there is
>>>>>>>>>> space */
>>>>>>>>>>
>>>>>>>>>>             +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>>>>>>>>>>
>>>>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>
>>>>>>>>>>             -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>>>>
>>>>>>>>>>                        goto ignore_iv;
>>>>>>>>>>
>>>>>>>>>>                }
>>>>>>>>>>
>>>>>>>>>>             - spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>
>>>>>>>>>>                /* It's the first fault for this address,
>>>>>>>>>> process it
>>>>>>>>>> normally */
>>>>>>>>>>
>>>>>>>>>>                return true;
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void
>>>>>>>>>> *handle)
>>>>>>>>>>
>>>>>>>>>>                adev->irq.ih.use_doorbell = true;
>>>>>>>>>>
>>>>>>>>>>                adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH
>>>>>>>>>> << 1;
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             -  adev->irq.ih.faults =
>>>>>>>>>> kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>>>>>>>>
>>>>>>>>>>             -  if (!adev->irq.ih.faults)
>>>>>>>>>>
>>>>>>>>>>             -          return -ENOMEM;
>>>>>>>>>>
>>>>>>>>>>             - INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>>>>>>>>
>>>>>>>>>>             -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>>>>
>>>>>>>>>>             - spin_lock_init(&adev->irq.ih.faults->lock);
>>>>>>>>>>
>>>>>>>>>>             -  adev->irq.ih.faults->count = 0;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>                r = amdgpu_irq_init(adev);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>                return r;
>>>>>>>>>>
>>>>>>>>>>             @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void
>>>>>>>>>> *handle)
>>>>>>>>>>
>>>>>>>>>>                amdgpu_irq_fini(adev);
>>>>>>>>>>
>>>>>>>>>>                amdgpu_ih_ring_fini(adev);
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>             -  kfree(adev->irq.ih.faults);
>>>>>>>>>>
>>>>>>>>>>             -  adev->irq.ih.faults = NULL;
>>>>>>>>>>
>>>>>>>>>>             -
>>>>>>>>>>
>>>>>>>>>>                return 0;
>>>>>>>>>>
>>>>>>>>>>              }
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>         _______________________________________________
>>>>>>>>>>
>>>>>>>>>>         amd-gfx mailing list
>>>>>>>>>>
>>>>>>>>>>         amd-gfx@lists.freedesktop.org
>>>>>>>>>> <mailto:amd-gfx@lists.freedesktop.org>
>>>>>>>>>>
>>>>>>>>>>         https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>> _______________________________________________
>>>>>>>>> amd-gfx mailing list
>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>> _______________________________________________
>>>>>>> amd-gfx mailing list
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
       [not found]                                                         ` <DM5PR12MB1707D50758797558D34F17A0921A0-2J9CzHegvk9TCtO+SvGBKwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2018-09-13  7:02                                                           ` Christian König
  0 siblings, 0 replies; 27+ messages in thread
From: Christian König @ 2018-09-13  7:02 UTC (permalink / raw)
  To: Kuehling, Felix, Zeng, Oak, Oak Zeng,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Yang, Philip

> I think I'm missing something.
Actually I'm thinking more that I'm missing something :) But the result 
is the same.

> You're raising some potential problems that I think must be about swapping out page tables that don't have fences on them (otherwise they wouldn't be evicted) and that are managed by HMM.
Wait a second, exactly that's the point. The page tables do have fences 
on them when they are evicted.

We currently just pipeline the eviction so that it happens only after 
those fences are signaled.

> I think we should have a phone call to sort this out.
Yeah, that is a really good idea.

Regards,
Christian.

Am 13.09.2018 um 06:45 schrieb Kuehling, Felix:
> I think I'm missing something. I thought the whole discussion we were having about tracking current and future locations of page tables was about swapping out page tables that are managed by HMM. We currently do swap out page tables that don't have a fence on them and that are not managed by HMM. And pipelining that is not a problem today.
>
> You're raising some potential problems that I think must be about swapping out page tables that don't have fences on them (otherwise they wouldn't be evicted) and that are managed by HMM. If there is no fence on the page table, no engine that can't handle page faults can depend on the page tables. So we can discard the contents and set the parent page directory entry to invalid. Setting the parent PDE needs to be synchronous so that the GPU doesn't try to access a page table that is no longer there. No pipelining, no problem.
>
> Then you came back with an argument "what about engines that don't support page faults". Those engines can't use memory mapped by HMM anyway. And they can't be evicted while they have a fence on them that indicates active usage by such an engine.
>
> You seem to see some problems that require not pipelining page table evictions. I still don't understand what the problem is. I think we should have a phone call to sort this out.
>
> A few more comments inline, but I think we're misunderstanding each other ...
>
>>>>>> 5. The new system memory location of the page table is noted in its BO.
>>>>> You mean in the parent page table? You can just invalidate the entry in
>>>>> the parent page table and let it fault.
>>>> I'm repeating myself, but exactly that is what won't work.
>>>>
>>>> See we still have engines which can't handle page faults which uses
>>>> the same VM at the same time. This means that we can't just fault in
>>>> page tables.
>>> And I don't understand why that is a problem. Those clients rely on
>>> fences to keep their BOs resident, including the page tables. Are you
>>> planning to change that?
>> No, but how do you want to swap out page tables when there is a fence added?
> I don't. That's my point. If the page table has fences on it, it can't be swapped out. So there is no problem.
>
>> Or do you want to stop adding fences to page tables for engines with
>> recoverable faults?
> Yes, that was my assumption, coming from a KFD perspective where with HMM we want to get away from our eviction fences that enforce BO residency, including page tables. Right now signaling an eviction fence means stopping user mode queues. We want to stop doing that.
>
> If page tables get evicted, that's fine as long as those virtual addresses aren't accessed. Once an access happens, the page table needs to be restored or recreated. Once that's done, the retrying hardware block will get a successful translation and continue executing.
>
>> I think that is completely unrealistic considering the performance
>> penalty of faults.
> I agree that evicting a page table that's in active use is bad. For amdgpu_cs you can prevent that with a fence, no problem.
>
> But a fence is not a good way to prevent that for KFD, because it forces us to keep using our eviction fence and preempting user mode queues on evictions. You see, the eviction fence does not prevent the page table from being evicted, but it forces us to preempt all our queues when an eviction happens. That is a worse performance penalty than dealing with a page fault because the page fault is much more selective. A page fault can interrupt a compute shader at wave granularity. An preemption interrupts compute shaders with process granularity and at much longer time scales.
>
> For KFD I would try to find a better way to avoid evictions of page tables, maybe by bumping them up in the LRU at appropriate times. But even without any such improvements, page faults are still better for KFD than the current eviction-fence-based approach.
>
> Regards,
>    Felix
>
>> At least for currently available hardware we should limit page faults to
>> be used in as few cases as possible, e.g. SVM and userptr.
>>
>> Regards,
>> Christian.
> ________________________________________
> From: Koenig, Christian
> Sent: Wednesday, September 12, 2018 11:59 AM
> To: Kuehling, Felix; Zeng, Oak; Oak Zeng; amd-gfx@lists.freedesktop.org; Yang, Philip
> Subject: Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm
>
> Am 12.09.2018 um 17:29 schrieb Felix Kuehling:
>> On 2018-09-12 02:56 AM, Christian König wrote:
>>> Am 12.09.2018 um 00:00 schrieb Felix Kuehling:
>>>> On 2018-09-11 03:19 AM, Christian König wrote:
>>>>> Hi Felix,
>>>>>
>>>>> let me try to explain the problem on an example:
>>>>>
>>>>> 1. We have a running job which needs recoverable page faults for
>>>>> accessing the process address space.
>>>>> 2. We run into memory pressure on VRAM and start to evict things.
>>>>> 3. A page tables of the running job is picked up for eviction.
>>>>> 4. We schedule a copy command to move the content of the page table to
>>>>> system memory.
>>>> I think we should change this. If you evict a page table, you don't need
>>>> to preserve its contents. You should be able to restore the page table
>>>> contents from scratch when you handle the page fault that restores it.
>>> Yeah, already implemented. You actually don't need the page fault for
>>> that.
>>>
>>>>> 5. The new system memory location of the page table is noted in its BO.
>>>> You mean in the parent page table? You can just invalidate the entry in
>>>> the parent page table and let it fault.
>>> I'm repeating myself, but exactly that is what won't work.
>>>
>>> See we still have engines which can't handle page faults which uses
>>> the same VM at the same time. This means that we can't just fault in
>>> page tables.
>> And I don't understand why that is a problem. Those clients rely on
>> fences to keep their BOs resident, including the page tables. Are you
>> planning to change that?
> No, but how do you want to swap out page tables when there is a fence added?
>
> Or do you want to stop adding fences to page tables for engines with
> recoverable faults?
>
> I think that is completely unrealistic considering the performance
> penalty of faults.
>
> At least for currently available hardware we should limit page faults to
> be used in as few cases as possible, e.g. SVM and userptr.
>
> Regards,
> Christian.
>
>> Regards,
>>     Felix
>>
>>> What we could do is to separate the address space into a low and a
>>> high range where we have different handling for both.
>>>
>>> I've already prototyped this and prepared mostly everything necessary
>>> for that, but that is still not 100% completed.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>> 6. We get a fault from the job and swap in the page from the process
>>>>> address space.
>>>> No. Then you need to keep updating page tables that are swapped out.
>>>> Instead just discard them and rebuild on fault.
>>>>
>>>>> 7. Now we need to enter the new page address into the page table ->
>>>>> *BAM*
>>>>>
>>>>> The problem is now that we don't know the address of the page table
>>>>> because the current location was replaced with the future location in
>>>>> step #5.
>>>> You solve that problem by discarding page tables that are swapped out.
>>>> Then there is no future location.
>>>>
>>>>> We could now argue that we could update the page tables on the fly for
>>>>> the evicted page and never wait for the current job to finish, but
>>>>> this won't work because not all engines can handle that.
>>>> Engines that can't handle page faults will depend on fences to keep
>>>> their page tables resident, like we do today. So they won't have this
>>>> problem.
>>>>
>>>> Regards,
>>>>      Felix
>>>>
>>>>> I will circumvent this problem for now by blocking the eviction before
>>>>> step #5. The issue with that is that this pipelining is responsible
>>>>> for nearly 20% of the fps in some testcases.
>>>>>
>>>>> I hope that it won't be that bad when I limit disabling the pipelining
>>>>> to only page tables, but it is certainly possible that we will get
>>>>> pushback on this.
>>>>>
>>>>> If that happens we probably need to add some flags to limit this
>>>>> workaround even more to only the root PD and all the PDs/PTs which are
>>>>> involved in recoverable faults.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 10.09.2018 um 21:10 schrieb Felix Kuehling:
>>>>>> I'm not sure why you need to distinguish current and future state when
>>>>>> dealing with page faults. When you get a page fault, you know that the
>>>>>> GPU is trying to access memory right now, in the present. So you're
>>>>>> always working with the current state. When the CPU page table
>>>>>> changes,
>>>>>> you get an MMU notifier that allows you to invalidate the
>>>>>> corresponding
>>>>>> GPU PTEs. If you have a valid GPU PTE, it always represents the
>>>>>> current
>>>>>> states and is in sync with the CPU page table. If the GPU page
>>>>>> table is
>>>>>> ever outdated, it should have an invalid entry in that place.
>>>>>>
>>>>>> If SDMA is used to update the GPU PTEs, there is a delay. The MMU
>>>>>> notifier is synchronous, so it shouldn't be a problem. You just
>>>>>> wait for
>>>>>> the SDMA job to complete before returning. When updating PTEs with new
>>>>>> valid addresses, it's asynchronous. But the GPU will continue retrying
>>>>>> on the invalid entry until SDMA finishes. So it's also implicitly
>>>>>> synchronized on the GPU side.
>>>>>>
>>>>>> Regards,
>>>>>>       Felix
>>>>>>
>>>>>>
>>>>>> On 2018-09-10 05:42 AM, Christian König wrote:
>>>>>>> Hi Felix & Oak,
>>>>>>>
>>>>>>> over the weekend I had the idea that we could just use the shadow BOs
>>>>>>> to have the current state in a page fault. They are GTT BOs and CPU
>>>>>>> accessible anyway.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 08.09.2018 um 09:34 schrieb Christian König:
>>>>>>>> Hi Felix,
>>>>>>>>
>>>>>>>>> But why do you want to update page tables when there is no more
>>>>>>>>> user
>>>>>>>>> mode context that cares about them? Is this just to allow pending
>>>>>>>>> work
>>>>>>>>> to complete after the user mode context is gone? To prevent hangs?
>>>>>>>> The problem I'm see is that we still don't have a reliable way of
>>>>>>>> killing a command submission while making sure that no more
>>>>>>>> faults of
>>>>>>>> it are in the flight.
>>>>>>>>
>>>>>>>> E.g. the HWS has a function to kill a running job on the hardware,
>>>>>>>> but that only works for compute and there is no way of telling when
>>>>>>>> that operation has finished.
>>>>>>>>
>>>>>>>> But Oak has already convinced me that we should probably work on
>>>>>>>> that
>>>>>>>> instead of trying to keep the VM alive as long as possible.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 08.09.2018 um 02:27 schrieb Felix Kuehling:
>>>>>>>>> Hi Christian,
>>>>>>>>>
>>>>>>>>> I'm not sure I get your point here.
>>>>>>>>>
>>>>>>>>> As I understand it, the amdgpu_vm structure represents the user
>>>>>>>>> mode VM
>>>>>>>>> context. It has the pointers to the root page directory and the
>>>>>>>>> rest of
>>>>>>>>> the page table hierarchy. If there is no amdgpu_vm, there is no
>>>>>>>>> user
>>>>>>>>> mode context that cares about the page tables.
>>>>>>>>>
>>>>>>>>> If we leave the page table pointers in the amdgpu_vm, then handling
>>>>>>>>> faults after the VM is destroyed is pointless. We can't find the
>>>>>>>>> page
>>>>>>>>> tables and we can't update them, so there is nothing we can do in
>>>>>>>>> response to VM faults.
>>>>>>>>>
>>>>>>>>> So I'm guessing that you want to move the page table pointers
>>>>>>>>> into a
>>>>>>>>> different structure that exists half-independently of the VM. It
>>>>>>>>> would
>>>>>>>>> be created when the VM is created (that's where we currently
>>>>>>>>> allocate
>>>>>>>>> the PASID) but can be destroyed slightly later.
>>>>>>>>>
>>>>>>>>> But why do you want to update page tables when there is no more
>>>>>>>>> user
>>>>>>>>> mode context that cares about them? Is this just to allow pending
>>>>>>>>> work
>>>>>>>>> to complete after the user mode context is gone? To prevent hangs?
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>>        Felix
>>>>>>>>>
>>>>>>>>> On 2018-09-10 07:20 AM, Christian König wrote:
>>>>>>>>>>> Am I right that the handling of page fault need a valid VM?
>>>>>>>>>> No, exactly that view is incorrect.
>>>>>>>>>>
>>>>>>>>>> The VM is meant to be a memory management structure of page
>>>>>>>>>> tables and
>>>>>>>>>> is completely pointless for fault processing because it represents the
>>>>>>>>>> future state of the page tables and not the current one.
>>>>>>>>>>
>>>>>>>>>> What we need for fault processing is a new structure built around
>>>>>>>>>> PASIDs which is fed with addresses when page tables are
>>>>>>>>>> moved
>>>>>>>>>> around.
>>>>>>>>>>
>>>>>>>>>> Alternatively I hope that we can use the SDMA to walk the page
>>>>>>>>>> tables
>>>>>>>>>> and update the required entries by just using the address.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>> Am 07.09.2018 um 16:55 schrieb Zeng, Oak:
>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> What is the value of delay the destroy of hash to after vm is
>>>>>>>>>>> destroyed? Since vm is destroyed, you basically don’t have enough
>>>>>>>>>>> information to page in the correct page to gpuvm. Am I right
>>>>>>>>>>> that
>>>>>>>>>>> the handling of page fault need a valid VM? For example, you
>>>>>>>>>>> need the
>>>>>>>>>>> VM to get the corresponding physical address of the faulty page.
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> After vm is destroyed, the retry interrupt can be directly
>>>>>>>>>>> discard as
>>>>>>>>>>> you can’t find vm through pasid. You can think this is also one
>>>>>>>>>>> kind
>>>>>>>>>>> of prescreen.
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> So I am stand for the point that, there is no need to delay the
>>>>>>>>>>> destroy of hash to after vm is destroyed. Prescreening hash
>>>>>>>>>>> can be
>>>>>>>>>>> destroyed at the time of vm_fini.
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Thanks,
>>>>>>>>>>>
>>>>>>>>>>> Oak
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> *From:*Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>> *Sent:* Friday, September 07, 2018 4:39 AM
>>>>>>>>>>> *To:* Zeng, Oak <Oak.Zeng@amd.com>; Oak Zeng
>>>>>>>>>>> <zengshanjun@gmail.com>;
>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>> *Cc:* Zeng, Oak <Oak.Zeng@amd.com>
>>>>>>>>>>> *Subject:* Re: [PATCH 1/2] drm/amdgpu: Moved fault hash table to
>>>>>>>>>>> amdgpu vm
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Hi Oak,
>>>>>>>>>>>
>>>>>>>>>>> yeah, but this is still a NAK. Making the hash per PASID was
>>>>>>>>>>> not a
>>>>>>>>>>> suggestion but rather a must have.
>>>>>>>>>>>
>>>>>>>>>>> The VM structures must be destroyed while the processing is still
>>>>>>>>>>> ongoing, or otherwise we would not have a clean OOM handling.
>>>>>>>>>>>
>>>>>>>>>>> The IDR for PASID lookup can be put into amdgpu_ids.c, you can
>>>>>>>>>>> just
>>>>>>>>>>> replace the IDA already used there with an IDR.
>>>>>>>>>>>
>>>>>>>>>>> Since the PASID is already freed up delayed we should have the
>>>>>>>>>>> grace
>>>>>>>>>>> period for interrupt processing included. If that still isn't
>>>>>>>>>>> sufficient we can still add some delayed work for this.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>> Am 07.09.2018 um 06:16 schrieb ozeng:
>>>>>>>>>>>
>>>>>>>>>>>          Hi Christian,
>>>>>>>>>>>
>>>>>>>>>>>          In this implementation, fault hash is made per vm, not per
>>>>>>>>>>> pasid
>>>>>>>>>>>          as suggested, based on below considerations:
>>>>>>>>>>>
>>>>>>>>>>>            * Delay the destroy of hash introduces more effort like
>>>>>>>>>>> how to
>>>>>>>>>>>              set the proper grace period after which no retry
>>>>>>>>>>> interrupt
>>>>>>>>>>>              will be generated. We want to avoid those complication
>>>>>>>>>>> if we
>>>>>>>>>>>              can.
>>>>>>>>>>>            * The  problem of the per vm implementation is that it
>>>>>>>>>>> is hard
>>>>>>>>>>>              to delay the destroy of fault hash, because when vm is
>>>>>>>>>>>              destroyed, prescreen function won't be able to
>>>>>>>>>>> retrieve the
>>>>>>>>>>>              fault hash. But in this case, the prescreen
>>>>>>>>>>> function can
>>>>>>>>>>>              either let the interrupt go through (to gmc) or ignore
>>>>>>>>>>> it.
>>>>>>>>>>>              From this perspective, it is not very necessary to
>>>>>>>>>>> delay the
>>>>>>>>>>>              destroy of hash table.
>>>>>>>>>>>            * On the other hand, since ASICs after vega10 have the
>>>>>>>>>>> HW CAM
>>>>>>>>>>>              feature. So the SW IV prescreen is only useful for
>>>>>>>>>>> vega10.
>>>>>>>>>>>              For all the HW implemented CAM, we can consider the
>>>>>>>>>>> delayed
>>>>>>>>>>>              CAM acknowledgment.
>>>>>>>>>>>            * Making it per pasid need to introduce another idr to
>>>>>>>>>>>              correlate pasid and hash table - where to put the idr?
>>>>>>>>>>> You
>>>>>>>>>>>              will have to make it a global variable which is not
>>>>>>>>>>> very
>>>>>>>>>>>              desirable.
>>>>>>>>>>>
>>>>>>>>>>>          The main purpose of this patch is to solve the
>>>>>>>>>>>          inter-process lock issue (when the hash table is
>>>>>>>>>>>          full), while still keeping the code simple.
>>>>>>>>>>>          Also, with this patch, the faults kfifo is no longer
>>>>>>>>>>>          necessary. See patch 2.
>>>>>>>>>>>
>>>>>>>>>>>          Regards,
>>>>>>>>>>>
>>>>>>>>>>>          Oak
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>          On 09/06/2018 11:28 PM, Oak Zeng wrote:
>>>>>>>>>>>
>>>>>>>>>>>              Instead of sharing one fault hash table per device,
>>>>>>>>>>>
>>>>>>>>>>>              make it per vm. This can avoid an inter-process
>>>>>>>>>>>              lock issue when the fault
>>>>>>>>>>>
>>>>>>>>>>>              hash table is full.
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              Change-Id: I5d1281b7c41eddc8e26113e010516557588d3708
>>>>>>>>>>>
>>>>>>>>>>>              Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>>>>>>>>>>> <mailto:Oak.Zeng@amd.com>
>>>>>>>>>>>
>>>>>>>>>>>              Suggested-by: Christian Konig
>>>>>>>>>>> <Christian.Koenig@amd.com>
>>>>>>>>>>> <mailto:Christian.Koenig@amd.com>
>>>>>>>>>>>
>>>>>>>>>>>              Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>>>>>>>> <mailto:Felix.Kuehling@amd.com>
>>>>>>>>>>>
>>>>>>>>>>>              ---
>>>>>>>>>>>
>>>>>>>>>>>               drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c |  75
>>>>>>>>>>> ------------------------
>>>>>>>>>>>
>>>>>>>>>>>               drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  10 ----
>>>>>>>>>>>
>>>>>>>>>>>               drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 102
>>>>>>>>>>> ++++++++++++++++++++++++++++++++-
>>>>>>>>>>>
>>>>>>>>>>>               drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  12 ++++
>>>>>>>>>>>
>>>>>>>>>>>               drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  38
>>>>>>>>>>> +++++-------
>>>>>>>>>>>
>>>>>>>>>>>               5 files changed, 127 insertions(+), 110 deletions(-)
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>>>
>>>>>>>>>>>              index 06373d4..4ed8621 100644
>>>>>>>>>>>
>>>>>>>>>>>              --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>>>
>>>>>>>>>>>              +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
>>>>>>>>>>>
>>>>>>>>>>>              @@ -197,78 +197,3 @@ int amdgpu_ih_process(struct
>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>
>>>>>>>>>>>                 return IRQ_HANDLED;
>>>>>>>>>>>
>>>>>>>>>>>               }
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              -/**
>>>>>>>>>>>
>>>>>>>>>>>              - * amdgpu_ih_add_fault - Add a page fault record
>>>>>>>>>>>
>>>>>>>>>>>              - *
>>>>>>>>>>>
>>>>>>>>>>>              - * @adev: amdgpu device pointer
>>>>>>>>>>>
>>>>>>>>>>>              - * @key: 64-bit encoding of PASID and address
>>>>>>>>>>>
>>>>>>>>>>>              - *
>>>>>>>>>>>
>>>>>>>>>>>              - * This should be called when a retry page fault
>>>>>>>>>>> interrupt is
>>>>>>>>>>>
>>>>>>>>>>>              - * received. If this is a new page fault, it will be
>>>>>>>>>>> added to a hash
>>>>>>>>>>>
>>>>>>>>>>>              - * table. The return value indicates whether this
>>>>>>>>>>> is a
>>>>>>>>>>> new fault, or
>>>>>>>>>>>
>>>>>>>>>>>              - * a fault that was already known and is already
>>>>>>>>>>> being
>>>>>>>>>>> handled.
>>>>>>>>>>>
>>>>>>>>>>>              - *
>>>>>>>>>>>
>>>>>>>>>>>              - * If there are too many pending page faults, this
>>>>>>>>>>> will
>>>>>>>>>>> fail. Retry
>>>>>>>>>>>
>>>>>>>>>>>              - * interrupts should be ignored in this case until
>>>>>>>>>>> there
>>>>>>>>>>> is enough
>>>>>>>>>>>
>>>>>>>>>>>              - * free space.
>>>>>>>>>>>
>>>>>>>>>>>              - *
>>>>>>>>>>>
>>>>>>>>>>>              - * Returns 0 if the fault was added, 1 if the
>>>>>>>>>>> fault was
>>>>>>>>>>> already known,
>>>>>>>>>>>
>>>>>>>>>>>              - * -ENOSPC if there are too many pending faults.
>>>>>>>>>>>
>>>>>>>>>>>              - */
>>>>>>>>>>>
>>>>>>>>>>>              -int amdgpu_ih_add_fault(struct amdgpu_device
>>>>>>>>>>> *adev, u64
>>>>>>>>>>> key)
>>>>>>>>>>>
>>>>>>>>>>>              -{
>>>>>>>>>>>
>>>>>>>>>>>              -  unsigned long flags;
>>>>>>>>>>>
>>>>>>>>>>>              -  int r = -ENOSPC;
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>              -  if (WARN_ON_ONCE(!adev->irq.ih.faults))
>>>>>>>>>>>
>>>>>>>>>>>              -          /* Should be allocated in
>>>>>>>>>>> <IP>_ih_sw_init on
>>>>>>>>>>> GPUs that
>>>>>>>>>>>
>>>>>>>>>>>              -           * support retry faults and require retry
>>>>>>>>>>> filtering.
>>>>>>>>>>>
>>>>>>>>>>>              -           */
>>>>>>>>>>>
>>>>>>>>>>>              -          return r;
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>              - spin_lock_irqsave(&adev->irq.ih.faults->lock,
>>>>>>>>>>> flags);
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>              -  /* Only let the hash table fill up to 50% for best
>>>>>>>>>>> performance */
>>>>>>>>>>>
>>>>>>>>>>>              -  if (adev->irq.ih.faults->count >= (1 <<
>>>>>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>>>>>
>>>>>>>>>>>              -          goto unlock_out;
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>              -  r = chash_table_copy_in(&adev->irq.ih.faults->hash,
>>>>>>>>>>> key, NULL);
>>>>>>>>>>>
>>>>>>>>>>>              -  if (!r)
>>>>>>>>>>>
>>>>>>>>>>>              -          adev->irq.ih.faults->count++;
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>              -  /* chash_table_copy_in should never fail unless
>>>>>>>>>>> we're
>>>>>>>>>>> losing count */
>>>>>>>>>>>
>>>>>>>>>>>              -  WARN_ON_ONCE(r < 0);
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>              -unlock_out:
>>>>>>>>>>>
>>>>>>>>>>>              - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>>>>>> flags);
>>>>>>>>>>>
>>>>>>>>>>>              -  return r;
>>>>>>>>>>>
>>>>>>>>>>>              -}
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>              -/**
>>>>>>>>>>>
>>>>>>>>>>>              - * amdgpu_ih_clear_fault - Remove a page fault record
>>>>>>>>>>>
>>>>>>>>>>>              - *
>>>>>>>>>>>
>>>>>>>>>>>              - * @adev: amdgpu device pointer
>>>>>>>>>>>
>>>>>>>>>>>              - * @key: 64-bit encoding of PASID and address
>>>>>>>>>>>
>>>>>>>>>>>              - *
>>>>>>>>>>>
>>>>>>>>>>>              - * This should be called when a page fault has been
>>>>>>>>>>> handled. Any
>>>>>>>>>>>
>>>>>>>>>>>              - * future interrupt with this key will be
>>>>>>>>>>> processed as a
>>>>>>>>>>> new
>>>>>>>>>>>
>>>>>>>>>>>              - * page fault.
>>>>>>>>>>>
>>>>>>>>>>>              - */
>>>>>>>>>>>
>>>>>>>>>>>              -void amdgpu_ih_clear_fault(struct amdgpu_device
>>>>>>>>>>> *adev,
>>>>>>>>>>> u64 key)
>>>>>>>>>>>
>>>>>>>>>>>              -{
>>>>>>>>>>>
>>>>>>>>>>>              -  unsigned long flags;
>>>>>>>>>>>
>>>>>>>>>>>              -  int r;
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>              -  if (!adev->irq.ih.faults)
>>>>>>>>>>>
>>>>>>>>>>>              -          return;
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>              - spin_lock_irqsave(&adev->irq.ih.faults->lock,
>>>>>>>>>>> flags);
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>              -  r = chash_table_remove(&adev->irq.ih.faults->hash,
>>>>>>>>>>> key, NULL);
>>>>>>>>>>>
>>>>>>>>>>>              -  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>>>>>
>>>>>>>>>>>              -          adev->irq.ih.faults->count--;
>>>>>>>>>>>
>>>>>>>>>>>              - WARN_ON_ONCE(adev->irq.ih.faults->count < 0);
>>>>>>>>>>>
>>>>>>>>>>>              -  }
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>              - spin_unlock_irqrestore(&adev->irq.ih.faults->lock,
>>>>>>>>>>> flags);
>>>>>>>>>>>
>>>>>>>>>>>              -}
>>>>>>>>>>>
>>>>>>>>>>>              diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>>>
>>>>>>>>>>>              index a23e1c0..f411ffb 100644
>>>>>>>>>>>
>>>>>>>>>>>              --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>>>
>>>>>>>>>>>              +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
>>>>>>>>>>>
>>>>>>>>>>>              @@ -32,13 +32,6 @@ struct amdgpu_device;
>>>>>>>>>>>
>>>>>>>>>>>               #define AMDGPU_IH_CLIENTID_LEGACY 0
>>>>>>>>>>>
>>>>>>>>>>>               #define AMDGPU_IH_CLIENTID_MAX SOC15_IH_CLIENTID_MAX
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              -#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>>>>>
>>>>>>>>>>>              -struct amdgpu_retryfault_hashtable {
>>>>>>>>>>>
>>>>>>>>>>>              -  DECLARE_CHASH_TABLE(hash,
>>>>>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>>>>>>> 8, 0);
>>>>>>>>>>>
>>>>>>>>>>>              -  spinlock_t     lock;
>>>>>>>>>>>
>>>>>>>>>>>              -  int            count;
>>>>>>>>>>>
>>>>>>>>>>>              -};
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>               /*
>>>>>>>>>>>
>>>>>>>>>>>                * R6xx+ IH ring
>>>>>>>>>>>
>>>>>>>>>>>                */
>>>>>>>>>>>
>>>>>>>>>>>              @@ -57,7 +50,6 @@ struct amdgpu_ih_ring {
>>>>>>>>>>>
>>>>>>>>>>>                 bool                   use_doorbell;
>>>>>>>>>>>
>>>>>>>>>>>                 bool                   use_bus_addr;
>>>>>>>>>>>
>>>>>>>>>>>                 dma_addr_t             rb_dma_addr; /* only used
>>>>>>>>>>> when
>>>>>>>>>>> use_bus_addr = true */
>>>>>>>>>>>
>>>>>>>>>>>              -  struct amdgpu_retryfault_hashtable *faults;
>>>>>>>>>>>
>>>>>>>>>>>               };
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>               #define AMDGPU_IH_SRC_DATA_MAX_SIZE_DW 4
>>>>>>>>>>>
>>>>>>>>>>>              @@ -95,7 +87,5 @@ int amdgpu_ih_ring_init(struct
>>>>>>>>>>> amdgpu_device *adev, unsigned ring_size,
>>>>>>>>>>>
>>>>>>>>>>>                                 bool use_bus_addr);
>>>>>>>>>>>
>>>>>>>>>>>               void amdgpu_ih_ring_fini(struct amdgpu_device *adev);
>>>>>>>>>>>
>>>>>>>>>>>               int amdgpu_ih_process(struct amdgpu_device *adev);
>>>>>>>>>>>
>>>>>>>>>>>              -int amdgpu_ih_add_fault(struct amdgpu_device
>>>>>>>>>>> *adev, u64
>>>>>>>>>>> key);
>>>>>>>>>>>
>>>>>>>>>>>              -void amdgpu_ih_clear_fault(struct amdgpu_device
>>>>>>>>>>> *adev,
>>>>>>>>>>> u64 key);
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>               #endif
>>>>>>>>>>>
>>>>>>>>>>>              diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>>>
>>>>>>>>>>>              index 1d7e3c1..8b220e9 100644
>>>>>>>>>>>
>>>>>>>>>>>              --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>>>
>>>>>>>>>>>              +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>>>
>>>>>>>>>>>              @@ -2692,6 +2692,22 @@ void
>>>>>>>>>>> amdgpu_vm_adjust_size(struct
>>>>>>>>>>> amdgpu_device *adev, uint32_t min_vm_size,
>>>>>>>>>>>
>>>>>>>>>>>                          adev->vm_manager.fragment_size);
>>>>>>>>>>>
>>>>>>>>>>>               }
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              +static struct amdgpu_retryfault_hashtable
>>>>>>>>>>> *init_fault_hash(void)
>>>>>>>>>>>
>>>>>>>>>>>              +{
>>>>>>>>>>>
>>>>>>>>>>>              +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +  fault_hash = kmalloc(sizeof(*fault_hash),
>>>>>>>>>>> GFP_KERNEL);
>>>>>>>>>>>
>>>>>>>>>>>              +  if (!fault_hash)
>>>>>>>>>>>
>>>>>>>>>>>              +          return fault_hash;
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +  INIT_CHASH_TABLE(fault_hash->hash,
>>>>>>>>>>>
>>>>>>>>>>>              +                  AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>>>>>
>>>>>>>>>>>              +  spin_lock_init(&fault_hash->lock);
>>>>>>>>>>>
>>>>>>>>>>>              +  fault_hash->count = 0;
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +  return fault_hash;
>>>>>>>>>>>
>>>>>>>>>>>              +}
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>               /**
>>>>>>>>>>>
>>>>>>>>>>>                * amdgpu_vm_init - initialize a vm instance
>>>>>>>>>>>
>>>>>>>>>>>                *
>>>>>>>>>>>
>>>>>>>>>>>              @@ -2780,6 +2796,12 @@ int amdgpu_vm_init(struct
>>>>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>>>>>>>>
>>>>>>>>>>>                         vm->pasid = pasid;
>>>>>>>>>>>
>>>>>>>>>>>                 }
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              +  vm->fault_hash = init_fault_hash();
>>>>>>>>>>>
>>>>>>>>>>>              +  if (!vm->fault_hash) {
>>>>>>>>>>>
>>>>>>>>>>>              +          r = -ENOMEM;
>>>>>>>>>>>
>>>>>>>>>>>              +          goto error_free_root;
>>>>>>>>>>>
>>>>>>>>>>>              +  }
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>                 INIT_KFIFO(vm->faults);
>>>>>>>>>>>
>>>>>>>>>>>                 vm->fault_credit = 16;
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              @@ -2973,7 +2995,7 @@ void amdgpu_vm_fini(struct
>>>>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>                 /* Clear pending page faults from IH when the VM is
>>>>>>>>>>> destroyed */
>>>>>>>>>>>
>>>>>>>>>>>                 while (kfifo_get(&vm->faults, &fault))
>>>>>>>>>>>
>>>>>>>>>>>              -          amdgpu_ih_clear_fault(adev, fault);
>>>>>>>>>>>
>>>>>>>>>>>              +          amdgpu_vm_clear_fault(vm->fault_hash,
>>>>>>>>>>> fault);
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>                 if (vm->pasid) {
>>>>>>>>>>>
>>>>>>>>>>>                         unsigned long flags;
>>>>>>>>>>>
>>>>>>>>>>>              @@ -2983,6 +3005,9 @@ void amdgpu_vm_fini(struct
>>>>>>>>>>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>>>>>>>>>>
>>>>>>>>>>> spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>>>>>>>>>>>
>>>>>>>>>>>                 }
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              +  kfree(vm->fault_hash);
>>>>>>>>>>>
>>>>>>>>>>>              +  vm->fault_hash = NULL;
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>                 drm_sched_entity_destroy(&vm->entity);
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>                 if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>>>>>>>>>>>
>>>>>>>>>>>              @@ -3183,3 +3208,78 @@ void
>>>>>>>>>>> amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>>>>>>>>>
>>>>>>>>>>>                         }
>>>>>>>>>>>
>>>>>>>>>>>                 }
>>>>>>>>>>>
>>>>>>>>>>>               }
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +/**
>>>>>>>>>>>
>>>>>>>>>>>              + * amdgpu_vm_add_fault - Add a page fault record to
>>>>>>>>>>> fault hash table
>>>>>>>>>>>
>>>>>>>>>>>              + *
>>>>>>>>>>>
>>>>>>>>>>>              + * @fault_hash: fault hash table
>>>>>>>>>>>
>>>>>>>>>>>              + * @key: 64-bit encoding of PASID and address
>>>>>>>>>>>
>>>>>>>>>>>              + *
>>>>>>>>>>>
>>>>>>>>>>>              + * This should be called when a retry page fault
>>>>>>>>>>> interrupt is
>>>>>>>>>>>
>>>>>>>>>>>              + * received. If this is a new page fault, it will be
>>>>>>>>>>> added to a hash
>>>>>>>>>>>
>>>>>>>>>>>              + * table. The return value indicates whether this
>>>>>>>>>>> is a
>>>>>>>>>>> new fault, or
>>>>>>>>>>>
>>>>>>>>>>>              + * a fault that was already known and is already
>>>>>>>>>>> being
>>>>>>>>>>> handled.
>>>>>>>>>>>
>>>>>>>>>>>              + *
>>>>>>>>>>>
>>>>>>>>>>>              + * If there are too many pending page faults, this
>>>>>>>>>>> will
>>>>>>>>>>> fail. Retry
>>>>>>>>>>>
>>>>>>>>>>>              + * interrupts should be ignored in this case until
>>>>>>>>>>> there
>>>>>>>>>>> is enough
>>>>>>>>>>>
>>>>>>>>>>>              + * free space.
>>>>>>>>>>>
>>>>>>>>>>>              + *
>>>>>>>>>>>
>>>>>>>>>>>              + * Returns 0 if the fault was added, 1 if the
>>>>>>>>>>> fault was
>>>>>>>>>>> already known,
>>>>>>>>>>>
>>>>>>>>>>>              + * -ENOSPC if there are too many pending faults.
>>>>>>>>>>>
>>>>>>>>>>>              + */
>>>>>>>>>>>
>>>>>>>>>>>              +int amdgpu_vm_add_fault(struct
>>>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>>>>>
>>>>>>>>>>>              +{
>>>>>>>>>>>
>>>>>>>>>>>              +  unsigned long flags;
>>>>>>>>>>>
>>>>>>>>>>>              +  int r = -ENOSPC;
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +  if (WARN_ON_ONCE(!fault_hash))
>>>>>>>>>>>
>>>>>>>>>>>              +          /* Should be allocated in amdgpu_vm_init
>>>>>>>>>>>
>>>>>>>>>>>              +           */
>>>>>>>>>>>
>>>>>>>>>>>              +          return r;
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +  /* Only let the hash table fill up to 50% for best
>>>>>>>>>>> performance */
>>>>>>>>>>>
>>>>>>>>>>>              +  if (fault_hash->count >= (1 <<
>>>>>>>>>>> (AMDGPU_PAGEFAULT_HASH_BITS-1)))
>>>>>>>>>>>
>>>>>>>>>>>              +          goto unlock_out;
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +  r = chash_table_copy_in(&fault_hash->hash, key,
>>>>>>>>>>> NULL);
>>>>>>>>>>>
>>>>>>>>>>>              +  if (!r)
>>>>>>>>>>>
>>>>>>>>>>>              +          fault_hash->count++;
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +  /* chash_table_copy_in should never fail unless
>>>>>>>>>>> we're
>>>>>>>>>>> losing count */
>>>>>>>>>>>
>>>>>>>>>>>              +  WARN_ON_ONCE(r < 0);
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +unlock_out:
>>>>>>>>>>>
>>>>>>>>>>>              +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>>>>>
>>>>>>>>>>>              +  return r;
>>>>>>>>>>>
>>>>>>>>>>>              +}
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +/**
>>>>>>>>>>>
>>>>>>>>>>>              + * amdgpu_vm_clear_fault - Remove a page fault record
>>>>>>>>>>>
>>>>>>>>>>>              + *
>>>>>>>>>>>
>>>>>>>>>>>              + * @fault_hash: fault hash table
>>>>>>>>>>>
>>>>>>>>>>>              + * @key: 64-bit encoding of PASID and address
>>>>>>>>>>>
>>>>>>>>>>>              + *
>>>>>>>>>>>
>>>>>>>>>>>              + * This should be called when a page fault has been
>>>>>>>>>>> handled. Any
>>>>>>>>>>>
>>>>>>>>>>>              + * future interrupt with this key will be
>>>>>>>>>>> processed as a
>>>>>>>>>>> new
>>>>>>>>>>>
>>>>>>>>>>>              + * page fault.
>>>>>>>>>>>
>>>>>>>>>>>              + */
>>>>>>>>>>>
>>>>>>>>>>>              +void amdgpu_vm_clear_fault(struct
>>>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key)
>>>>>>>>>>>
>>>>>>>>>>>              +{
>>>>>>>>>>>
>>>>>>>>>>>              +  unsigned long flags;
>>>>>>>>>>>
>>>>>>>>>>>              +  int r;
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +  if (!fault_hash)
>>>>>>>>>>>
>>>>>>>>>>>              +          return;
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +  spin_lock_irqsave(&fault_hash->lock, flags);
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +  r = chash_table_remove(&fault_hash->hash, key,
>>>>>>>>>>> NULL);
>>>>>>>>>>>
>>>>>>>>>>>              +  if (!WARN_ON_ONCE(r < 0)) {
>>>>>>>>>>>
>>>>>>>>>>>              +          fault_hash->count--;
>>>>>>>>>>>
>>>>>>>>>>>              +          WARN_ON_ONCE(fault_hash->count < 0);
>>>>>>>>>>>
>>>>>>>>>>>              +  }
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +  spin_unlock_irqrestore(&fault_hash->lock, flags);
>>>>>>>>>>>
>>>>>>>>>>>              +}
>>>>>>>>>>>
>>>>>>>>>>>              diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>>>
>>>>>>>>>>>              index e275ee7..6eb1da1 100644
>>>>>>>>>>>
>>>>>>>>>>>              --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>>>
>>>>>>>>>>>              +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>>>>>>>>
>>>>>>>>>>>              @@ -178,6 +178,13 @@ struct amdgpu_task_info {
>>>>>>>>>>>
>>>>>>>>>>>                 pid_t   tgid;
>>>>>>>>>>>
>>>>>>>>>>>               };
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              +#define AMDGPU_PAGEFAULT_HASH_BITS 8
>>>>>>>>>>>
>>>>>>>>>>>              +struct amdgpu_retryfault_hashtable {
>>>>>>>>>>>
>>>>>>>>>>>              +  DECLARE_CHASH_TABLE(hash,
>>>>>>>>>>> AMDGPU_PAGEFAULT_HASH_BITS,
>>>>>>>>>>> 8, 0);
>>>>>>>>>>>
>>>>>>>>>>>              +  spinlock_t     lock;
>>>>>>>>>>>
>>>>>>>>>>>              +  int            count;
>>>>>>>>>>>
>>>>>>>>>>>              +};
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>               struct amdgpu_vm {
>>>>>>>>>>>
>>>>>>>>>>>                 /* tree of virtual addresses mapped */
>>>>>>>>>>>
>>>>>>>>>>>                 struct rb_root_cached  va;
>>>>>>>>>>>
>>>>>>>>>>>              @@ -240,6 +247,7 @@ struct amdgpu_vm {
>>>>>>>>>>>
>>>>>>>>>>>                 struct ttm_lru_bulk_move lru_bulk_move;
>>>>>>>>>>>
>>>>>>>>>>>                 /* mark whether can do the bulk move */
>>>>>>>>>>>
>>>>>>>>>>>                 bool                   bulk_moveable;
>>>>>>>>>>>
>>>>>>>>>>>              +  struct amdgpu_retryfault_hashtable *fault_hash;
>>>>>>>>>>>
>>>>>>>>>>>               };
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>               struct amdgpu_vm_manager {
>>>>>>>>>>>
>>>>>>>>>>>              @@ -355,4 +363,8 @@ void
>>>>>>>>>>> amdgpu_vm_set_task_info(struct
>>>>>>>>>>> amdgpu_vm *vm);
>>>>>>>>>>>
>>>>>>>>>>>               void amdgpu_vm_move_to_lru_tail(struct amdgpu_device
>>>>>>>>>>> *adev,
>>>>>>>>>>>
>>>>>>>>>>>                                        struct amdgpu_vm *vm);
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              +int amdgpu_vm_add_fault(struct
>>>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +void amdgpu_vm_clear_fault(struct
>>>>>>>>>>> amdgpu_retryfault_hashtable *fault_hash, u64 key);
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>               #endif
>>>>>>>>>>>
>>>>>>>>>>>              diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>>
>>>>>>>>>>>              index 5ae5ed2..acbe5a7 100644
>>>>>>>>>>>
>>>>>>>>>>>              --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>>
>>>>>>>>>>>              +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>>>>>>>>
>>>>>>>>>>>              @@ -265,35 +265,36 @@ static bool
>>>>>>>>>>> vega10_ih_prescreen_iv(struct amdgpu_device *adev)
>>>>>>>>>>>
>>>>>>>>>>>                         return true;
>>>>>>>>>>>
>>>>>>>>>>>                 }
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              -  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>>>>>
>>>>>>>>>>>              -  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>>>>>
>>>>>>>>>>>              -  r = amdgpu_ih_add_fault(adev, key);
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>              -  /* Hash table is full or the fault is already being
>>>>>>>>>>> processed,
>>>>>>>>>>>
>>>>>>>>>>>              -   * ignore further page faults
>>>>>>>>>>>
>>>>>>>>>>>              -   */
>>>>>>>>>>>
>>>>>>>>>>>              -  if (r != 0)
>>>>>>>>>>>
>>>>>>>>>>>              -          goto ignore_iv;
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>                 /* Track retry faults in per-VM fault FIFO. */
>>>>>>>>>>>
>>>>>>>>>>>                 spin_lock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>>
>>>>>>>>>>>                 vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
>>>>>>>>>>>
>>>>>>>>>>>              +  addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
>>>>>>>>>>>
>>>>>>>>>>>              +  key = AMDGPU_VM_FAULT(pasid, addr);
>>>>>>>>>>>
>>>>>>>>>>>                 if (!vm) {
>>>>>>>>>>>
>>>>>>>>>>>                         /* VM not found, process it normally */
>>>>>>>>>>>
>>>>>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>>
>>>>>>>>>>>              -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>>>>>
>>>>>>>>>>>                         return true;
>>>>>>>>>>>
>>>>>>>>>>>              +  } else {
>>>>>>>>>>>
>>>>>>>>>>>              +          r = amdgpu_vm_add_fault(vm->fault_hash,
>>>>>>>>>>> key);
>>>>>>>>>>>
>>>>>>>>>>>              +
>>>>>>>>>>>
>>>>>>>>>>>              +          /* Hash table is full or the fault is
>>>>>>>>>>> already
>>>>>>>>>>> being processed,
>>>>>>>>>>>
>>>>>>>>>>>              +           * ignore further page faults
>>>>>>>>>>>
>>>>>>>>>>>              +           */
>>>>>>>>>>>
>>>>>>>>>>>              +          if (r != 0) {
>>>>>>>>>>>
>>>>>>>>>>>              + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>>
>>>>>>>>>>>              +                  goto ignore_iv;
>>>>>>>>>>>
>>>>>>>>>>>              +          }
>>>>>>>>>>>
>>>>>>>>>>>                 }
>>>>>>>>>>>
>>>>>>>>>>>                 /* No locking required with single writer and
>>>>>>>>>>> single
>>>>>>>>>>> reader */
>>>>>>>>>>>
>>>>>>>>>>>                 r = kfifo_put(&vm->faults, key);
>>>>>>>>>>>
>>>>>>>>>>>                 if (!r) {
>>>>>>>>>>>
>>>>>>>>>>>                         /* FIFO is full. Ignore it until there is
>>>>>>>>>>> space */
>>>>>>>>>>>
>>>>>>>>>>>              +          amdgpu_vm_clear_fault(vm->fault_hash, key);
>>>>>>>>>>>
>>>>>>>>>>> spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>>
>>>>>>>>>>>              -          amdgpu_ih_clear_fault(adev, key);
>>>>>>>>>>>
>>>>>>>>>>>                         goto ignore_iv;
>>>>>>>>>>>
>>>>>>>>>>>                 }
>>>>>>>>>>>
>>>>>>>>>>>              - spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              + spin_unlock(&adev->vm_manager.pasid_lock);
>>>>>>>>>>>
>>>>>>>>>>>                 /* It's the first fault for this address,
>>>>>>>>>>> process it
>>>>>>>>>>> normally */
>>>>>>>>>>>
>>>>>>>>>>>                 return true;
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              @@ -386,14 +387,6 @@ static int vega10_ih_sw_init(void
>>>>>>>>>>> *handle)
>>>>>>>>>>>
>>>>>>>>>>>                 adev->irq.ih.use_doorbell = true;
>>>>>>>>>>>
>>>>>>>>>>>                 adev->irq.ih.doorbell_index = AMDGPU_DOORBELL64_IH
>>>>>>>>>>> << 1;
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              -  adev->irq.ih.faults =
>>>>>>>>>>> kmalloc(sizeof(*adev->irq.ih.faults), GFP_KERNEL);
>>>>>>>>>>>
>>>>>>>>>>>              -  if (!adev->irq.ih.faults)
>>>>>>>>>>>
>>>>>>>>>>>              -          return -ENOMEM;
>>>>>>>>>>>
>>>>>>>>>>>              - INIT_CHASH_TABLE(adev->irq.ih.faults->hash,
>>>>>>>>>>>
>>>>>>>>>>>              -                   AMDGPU_PAGEFAULT_HASH_BITS, 8, 0);
>>>>>>>>>>>
>>>>>>>>>>>              - spin_lock_init(&adev->irq.ih.faults->lock);
>>>>>>>>>>>
>>>>>>>>>>>              -  adev->irq.ih.faults->count = 0;
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>                 r = amdgpu_irq_init(adev);
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>                 return r;
>>>>>>>>>>>
>>>>>>>>>>>              @@ -406,9 +399,6 @@ static int vega10_ih_sw_fini(void
>>>>>>>>>>> *handle)
>>>>>>>>>>>
>>>>>>>>>>>                 amdgpu_irq_fini(adev);
>>>>>>>>>>>
>>>>>>>>>>>                 amdgpu_ih_ring_fini(adev);
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>              -  kfree(adev->irq.ih.faults);
>>>>>>>>>>>
>>>>>>>>>>>              -  adev->irq.ih.faults = NULL;
>>>>>>>>>>>
>>>>>>>>>>>              -
>>>>>>>>>>>
>>>>>>>>>>>                 return 0;
>>>>>>>>>>>
>>>>>>>>>>>               }
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>          _______________________________________________
>>>>>>>>>>>
>>>>>>>>>>>          amd-gfx mailing list
>>>>>>>>>>>
>>>>>>>>>>>          amd-gfx@lists.freedesktop.org
>>>>>>>>>>> <mailto:amd-gfx@lists.freedesktop.org>
>>>>>>>>>>>
>>>>>>>>>>>          https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>> _______________________________________________
>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>> _______________________________________________
>>>>>>>> amd-gfx mailing list
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>> _______________________________________________
>>>>>> amd-gfx mailing list
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2018-09-13  7:02 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-07  3:28 [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm Oak Zeng
     [not found] ` <1536290910-6476-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2018-09-07  3:28   ` [PATCH 2/2] drm/amdgpu: Deleted faults kfifo Oak Zeng
2018-09-07  4:16   ` [PATCH 1/2] drm/amdgpu: Moved fault hash table to amdgpu vm ozeng
     [not found]     ` <12913b70-925d-2e17-2dc2-491ed8592610-5C7GfCeVMHo@public.gmane.org>
2018-09-07  8:38       ` Christian König
     [not found]         ` <546c9911-b7af-03c8-b5e2-f2bb07932c0b-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2018-09-07 14:55           ` Zeng, Oak
     [not found]             ` <BN6PR12MB1651779BF0B3AE84E1C4430C80000-/b2+HYfkarRSqX7PDniLCgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2018-09-07 15:18               ` Christian König
     [not found]                 ` <89e7515f-53f1-9f10-aeb6-7fc1b8df2702-5C7GfCeVMHo@public.gmane.org>
2018-09-07 15:30                   ` Zeng, Oak
     [not found]                     ` <BN6PR12MB1651CA24220B91670A75890C80000-/b2+HYfkarRSqX7PDniLCgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2018-09-07 15:41                       ` Christian König
     [not found]                         ` <0e52fe27-716c-f9d3-9b2c-5540825ffcef-5C7GfCeVMHo@public.gmane.org>
2018-09-07 21:52                           ` Zeng, Oak
     [not found]                             ` <BN6PR12MB1651CCA8BB73DE64815EB0B280000-/b2+HYfkarRSqX7PDniLCgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2018-09-08  7:16                               ` Christian König
     [not found]                                 ` <bf233da4-5cd9-d7e4-1e9a-fbef3613bfd6-5C7GfCeVMHo@public.gmane.org>
2018-09-10 21:09                                   ` Zeng, Oak
2018-09-08  0:27                   ` Felix Kuehling
     [not found]                     ` <ab2c26bc-e168-eaae-3d80-826a47b3e0d3-5C7GfCeVMHo@public.gmane.org>
2018-09-08  7:34                       ` Christian König
     [not found]                         ` <9b6b78aa-c98a-a741-7ce2-0e7811e1f806-5C7GfCeVMHo@public.gmane.org>
2018-09-10  9:42                           ` Christian König
     [not found]                             ` <143b57c2-12d1-8cb1-ddea-a8fe7226e1c4-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2018-09-10 19:10                               ` Felix Kuehling
     [not found]                                 ` <8cee8563-2df9-1075-3113-66135cbd14fd-5C7GfCeVMHo@public.gmane.org>
2018-09-11  7:19                                   ` Christian König
     [not found]                                     ` <82fb54c4-8cf2-9033-9fb4-f76e747d7b87-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2018-09-11 14:09                                       ` Zeng, Oak
     [not found]                                         ` <DM5PR12MB1660F46176F685755F5BC48880040-2J9CzHegvk9mP71xM+BsFwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2018-09-11 14:23                                           ` Christian König
     [not found]                                             ` <375d7ba0-1da4-b5b3-6eb9-0d947290ee74-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2018-09-11 15:02                                               ` Zeng, Oak
     [not found]                                                 ` <DM5PR12MB16608D6DAB8D9BFAAE348C4780040-2J9CzHegvk9mP71xM+BsFwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2018-09-11 15:15                                                   ` Christian König
2018-09-11 22:00                                       ` Felix Kuehling
     [not found]                                         ` <5b9dc941-3d29-21ca-9f92-ba3512900660-5C7GfCeVMHo@public.gmane.org>
2018-09-12  6:56                                           ` Christian König
     [not found]                                             ` <ea317e02-546b-b296-2456-542e86d62b6e-5C7GfCeVMHo@public.gmane.org>
2018-09-12 15:29                                               ` Felix Kuehling
     [not found]                                                 ` <2e37f4d8-9f64-5412-8fa3-9ef1a12c4001-5C7GfCeVMHo@public.gmane.org>
2018-09-12 15:59                                                   ` Christian König
     [not found]                                                     ` <0fa9086e-49c7-580e-2611-4878c92899f5-5C7GfCeVMHo@public.gmane.org>
2018-09-13  4:45                                                       ` Kuehling, Felix
     [not found]                                                         ` <DM5PR12MB1707D50758797558D34F17A0921A0-2J9CzHegvk9TCtO+SvGBKwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2018-09-13  7:02                                                           ` Christian König
2018-09-12  9:03   ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.