* [Intel-gfx] [PATCH 1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class
@ 2020-06-22  9:59 Chris Wilson
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 2/7] drm/i915: Reuse the reservation_ww_class for acquiring vma backing storage Chris Wilson
                   ` (6 more replies)
  0 siblings, 7 replies; 48+ messages in thread
From: Chris Wilson @ 2020-06-22  9:59 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

Our goal is to pull all memory reservations (in the next iteration,
obj->ops->get_pages()) under a ww_mutex, and to align those reservations
with other drivers, i.e. control all such allocations with the
reservation_ww_class. Currently, this is under the purview of
obj->mm.mutex, and while obj->mm remains an embedded struct we can
"simply" switch to using the reservation_ww_class via
obj->base.resv->lock.

The major consequence is the impact on the shrinker paths: the
reservation_ww_class is used to wrap allocations, and since a ww_mutex
does not support subclassing we cannot rely on our usual trick of
knowing that we never recurse inside the shrinker; instead we have to
finish the reclaim with a trylock. This may result in us failing to
release the pages even after having released the vma. This will have to
do until a better idea comes along.
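
For illustration only (the real hunk is in the shrinker below), and
assuming i915_gem_object_trylock()/unlock() are thin wrappers around
dma_resv_trylock()/unlock() on obj->base.resv, the reclaim step boils
down to roughly:

        /* sketch: best-effort page release once the vma bindings are gone */
        if (i915_gem_object_unbind(obj, flags) == 0 &&
            i915_gem_object_trylock(obj)) {
                /* trylock may fail under ww contention; then we skip reclaim */
                __i915_gem_object_put_pages(obj);
                i915_gem_object_unlock(obj);
        }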

However, this step only converts the mutex over and continues to treat
everything as a single allocation, pinning the pages. With the ww_mutex
in place we can remove the temporary pinning, as we can then reserve all
storage en masse.
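
As a sketch of the resulting caller pattern (compare
i915_gem_object_prepare_read() in the hunks below; the helper name here
is made up for illustration), page acquisition now nests under the
object's reservation lock:

        static int use_pages_locked(struct drm_i915_gem_object *obj)
        {
                int err;

                err = i915_gem_object_lock_interruptible(obj);
                if (err)
                        return err;

                err = __i915_gem_object_get_pages_locked(obj); /* pins on success */
                if (err == 0) {
                        /* ... operate on obj->mm.pages ... */
                        i915_gem_object_unpin_pages(obj);
                }

                i915_gem_object_unlock(obj);
                return err;
        }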

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c    | 18 +------
 drivers/gpu/drm/i915/gem/i915_gem_domain.c    | 36 ++++---------
 drivers/gpu/drm/i915/gem/i915_gem_object.c    |  8 +--
 drivers/gpu/drm/i915/gem/i915_gem_object.h    | 21 +-------
 .../gpu/drm/i915/gem/i915_gem_object_types.h  |  1 -
 drivers/gpu/drm/i915/gem/i915_gem_pages.c     | 51 ++++++++++---------
 drivers/gpu/drm/i915/gem/i915_gem_phys.c      |  6 +--
 drivers/gpu/drm/i915/gem/i915_gem_shrinker.c  | 15 +++---
 drivers/gpu/drm/i915/gem/i915_gem_tiling.c    |  2 -
 drivers/gpu/drm/i915/gem/i915_gem_userptr.c   | 15 ++++--
 .../gpu/drm/i915/gem/selftests/huge_pages.c   | 32 +++++++-----
 .../i915/gem/selftests/i915_gem_coherency.c   | 14 +++--
 .../drm/i915/gem/selftests/i915_gem_context.c | 10 +++-
 .../drm/i915/gem/selftests/i915_gem_mman.c    |  2 +
 drivers/gpu/drm/i915/i915_gem.c               | 16 ++++--
 .../drm/i915/selftests/intel_memory_region.c  | 17 +++++--
 16 files changed, 128 insertions(+), 136 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
index 2679380159fc..049a15e6b496 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
@@ -124,19 +124,12 @@ static int i915_gem_begin_cpu_access(struct dma_buf *dma_buf, enum dma_data_dire
 	bool write = (direction == DMA_BIDIRECTIONAL || direction == DMA_TO_DEVICE);
 	int err;
 
-	err = i915_gem_object_pin_pages(obj);
-	if (err)
-		return err;
-
 	err = i915_gem_object_lock_interruptible(obj);
 	if (err)
-		goto out;
+		return err;
 
 	err = i915_gem_object_set_to_cpu_domain(obj, write);
 	i915_gem_object_unlock(obj);
-
-out:
-	i915_gem_object_unpin_pages(obj);
 	return err;
 }
 
@@ -145,19 +138,12 @@ static int i915_gem_end_cpu_access(struct dma_buf *dma_buf, enum dma_data_direct
 	struct drm_i915_gem_object *obj = dma_buf_to_obj(dma_buf);
 	int err;
 
-	err = i915_gem_object_pin_pages(obj);
-	if (err)
-		return err;
-
 	err = i915_gem_object_lock_interruptible(obj);
 	if (err)
-		goto out;
+		return err;
 
 	err = i915_gem_object_set_to_gtt_domain(obj, false);
 	i915_gem_object_unlock(obj);
-
-out:
-	i915_gem_object_unpin_pages(obj);
 	return err;
 }
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
index 7f76fc68f498..8b93941f7f93 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
@@ -70,7 +70,7 @@ i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
 	 * continue to assume that the obj remained out of the CPU cached
 	 * domain.
 	 */
-	ret = i915_gem_object_pin_pages(obj);
+	ret = __i915_gem_object_get_pages_locked(obj);
 	if (ret)
 		return ret;
 
@@ -131,7 +131,7 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
 	 * continue to assume that the obj remained out of the CPU cached
 	 * domain.
 	 */
-	ret = i915_gem_object_pin_pages(obj);
+	ret = __i915_gem_object_get_pages_locked(obj);
 	if (ret)
 		return ret;
 
@@ -532,13 +532,9 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 	 * continue to assume that the obj remained out of the CPU cached
 	 * domain.
 	 */
-	err = i915_gem_object_pin_pages(obj);
-	if (err)
-		goto out;
-
 	err = i915_gem_object_lock_interruptible(obj);
 	if (err)
-		goto out_unpin;
+		goto out;
 
 	if (read_domains & I915_GEM_DOMAIN_WC)
 		err = i915_gem_object_set_to_wc_domain(obj, write_domain);
@@ -555,8 +551,6 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 	if (write_domain)
 		i915_gem_object_invalidate_frontbuffer(obj, ORIGIN_CPU);
 
-out_unpin:
-	i915_gem_object_unpin_pages(obj);
 out:
 	i915_gem_object_put(obj);
 	return err;
@@ -572,11 +566,13 @@ int i915_gem_object_prepare_read(struct drm_i915_gem_object *obj,
 {
 	int ret;
 
+	assert_object_held(obj);
+
 	*needs_clflush = 0;
 	if (!i915_gem_object_has_struct_page(obj))
 		return -ENODEV;
 
-	ret = i915_gem_object_lock_interruptible(obj);
+	ret = __i915_gem_object_get_pages_locked(obj);
 	if (ret)
 		return ret;
 
@@ -584,11 +580,7 @@ int i915_gem_object_prepare_read(struct drm_i915_gem_object *obj,
 				   I915_WAIT_INTERRUPTIBLE,
 				   MAX_SCHEDULE_TIMEOUT);
 	if (ret)
-		goto err_unlock;
-
-	ret = i915_gem_object_pin_pages(obj);
-	if (ret)
-		goto err_unlock;
+		goto err_unpin;
 
 	if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
 	    !static_cpu_has(X86_FEATURE_CLFLUSH)) {
@@ -616,8 +608,6 @@ int i915_gem_object_prepare_read(struct drm_i915_gem_object *obj,
 
 err_unpin:
 	i915_gem_object_unpin_pages(obj);
-err_unlock:
-	i915_gem_object_unlock(obj);
 	return ret;
 }
 
@@ -626,11 +616,13 @@ int i915_gem_object_prepare_write(struct drm_i915_gem_object *obj,
 {
 	int ret;
 
+	assert_object_held(obj);
+
 	*needs_clflush = 0;
 	if (!i915_gem_object_has_struct_page(obj))
 		return -ENODEV;
 
-	ret = i915_gem_object_lock_interruptible(obj);
+	ret = __i915_gem_object_get_pages_locked(obj);
 	if (ret)
 		return ret;
 
@@ -639,11 +631,7 @@ int i915_gem_object_prepare_write(struct drm_i915_gem_object *obj,
 				   I915_WAIT_ALL,
 				   MAX_SCHEDULE_TIMEOUT);
 	if (ret)
-		goto err_unlock;
-
-	ret = i915_gem_object_pin_pages(obj);
-	if (ret)
-		goto err_unlock;
+		goto err_unpin;
 
 	if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
 	    !static_cpu_has(X86_FEATURE_CLFLUSH)) {
@@ -680,7 +668,5 @@ int i915_gem_object_prepare_write(struct drm_i915_gem_object *obj,
 
 err_unpin:
 	i915_gem_object_unpin_pages(obj);
-err_unlock:
-	i915_gem_object_unlock(obj);
 	return ret;
 }
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index b6ec5b50d93b..37b3fb0eb943 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -53,8 +53,6 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 			  const struct drm_i915_gem_object_ops *ops,
 			  struct lock_class_key *key)
 {
-	__mutex_init(&obj->mm.lock, ops->name ?: "obj->mm.lock", key);
-
 	spin_lock_init(&obj->vma.lock);
 	INIT_LIST_HEAD(&obj->vma.list);
 
@@ -72,10 +70,6 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 	obj->mm.madv = I915_MADV_WILLNEED;
 	INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
 	mutex_init(&obj->mm.get_page.lock);
-
-	if (IS_ENABLED(CONFIG_LOCKDEP) && i915_gem_object_is_shrinkable(obj))
-		i915_gem_shrinker_taints_mutex(to_i915(obj->base.dev),
-					       &obj->mm.lock);
 }
 
 /**
@@ -209,10 +203,12 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 		GEM_BUG_ON(obj->userfault_count);
 		GEM_BUG_ON(!list_empty(&obj->lut_list));
 
+		i915_gem_object_lock(obj);
 		atomic_set(&obj->mm.pages_pin_count, 0);
 		__i915_gem_object_put_pages(obj);
 		GEM_BUG_ON(i915_gem_object_has_pages(obj));
 		bitmap_free(obj->bit_17);
+		i915_gem_object_unlock(obj);
 
 		if (obj->base.import_attach)
 			drm_prime_gem_destroy(&obj->base, NULL);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h b/drivers/gpu/drm/i915/gem/i915_gem_object.h
index 7bcd2661de4c..03a1b859aeef 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h
@@ -277,28 +277,12 @@ void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
 
 int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj);
 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj);
-
-enum i915_mm_subclass { /* lockdep subclass for obj->mm.lock/struct_mutex */
-	I915_MM_NORMAL = 0,
-	/*
-	 * Only used by struct_mutex, when called "recursively" from
-	 * direct-reclaim-esque. Safe because there is only every one
-	 * struct_mutex in the entire system.
-	 */
-	I915_MM_SHRINKER = 1,
-	/*
-	 * Used for obj->mm.lock when allocating pages. Safe because the object
-	 * isn't yet on any LRU, and therefore the shrinker can't deadlock on
-	 * it. As soon as the object has pages, obj->mm.lock nests within
-	 * fs_reclaim.
-	 */
-	I915_MM_GET_PAGES = 1,
-};
+int __i915_gem_object_get_pages_locked(struct drm_i915_gem_object *obj);
 
 static inline int __must_check
 i915_gem_object_pin_pages(struct drm_i915_gem_object *obj)
 {
-	might_lock_nested(&obj->mm.lock, I915_MM_GET_PAGES);
+	might_lock(&obj->base.resv->lock.base);
 
 	if (atomic_inc_not_zero(&obj->mm.pages_pin_count))
 		return 0;
@@ -410,7 +394,6 @@ static inline void
 i915_gem_object_finish_access(struct drm_i915_gem_object *obj)
 {
 	i915_gem_object_unpin_pages(obj);
-	i915_gem_object_unlock(obj);
 }
 
 static inline struct intel_engine_cs *
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index b1f82a11aef2..dbb33aac7828 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -186,7 +186,6 @@ struct drm_i915_gem_object {
 		 * Protects the pages and their use. Do not use directly, but
 		 * instead go through the pin/unpin interfaces.
 		 */
-		struct mutex lock;
 		atomic_t pages_pin_count;
 		atomic_t shrink_pin;
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index af9e48ee4a33..2ff1036ef91f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -18,7 +18,7 @@ void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
 	unsigned long supported = INTEL_INFO(i915)->page_sizes;
 	int i;
 
-	lockdep_assert_held(&obj->mm.lock);
+	assert_object_held(obj);
 
 	if (i915_gem_object_is_volatile(obj))
 		obj->mm.madv = I915_MADV_DONTNEED;
@@ -86,6 +86,8 @@ int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
 	struct drm_i915_private *i915 = to_i915(obj->base.dev);
 	int err;
 
+	assert_object_held(obj);
+
 	if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
 		drm_dbg(&i915->drm,
 			"Attempting to obtain a purgeable object\n");
@@ -105,27 +107,34 @@ int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
  * either as a result of memory pressure (reaping pages under the shrinker)
  * or as the object is itself released.
  */
-int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
+int __i915_gem_object_get_pages_locked(struct drm_i915_gem_object *obj)
 {
 	int err;
 
-	err = mutex_lock_interruptible_nested(&obj->mm.lock, I915_MM_GET_PAGES);
-	if (err)
-		return err;
-
 	if (unlikely(!i915_gem_object_has_pages(obj))) {
 		GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
 
 		err = ____i915_gem_object_get_pages(obj);
 		if (err)
-			goto unlock;
+			return err;
 
 		smp_mb__before_atomic();
 	}
 	atomic_inc(&obj->mm.pages_pin_count);
 
-unlock:
-	mutex_unlock(&obj->mm.lock);
+	return 0;
+}
+
+int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
+{
+	int err;
+
+	err = i915_gem_object_lock_interruptible(obj);
+	if (err)
+		return err;
+
+	err = __i915_gem_object_get_pages_locked(obj);
+	i915_gem_object_unlock(obj);
 	return err;
 }
 
@@ -140,7 +149,7 @@ void i915_gem_object_truncate(struct drm_i915_gem_object *obj)
 /* Try to discard unwanted pages */
 void i915_gem_object_writeback(struct drm_i915_gem_object *obj)
 {
-	lockdep_assert_held(&obj->mm.lock);
+	assert_object_held(obj);
 	GEM_BUG_ON(i915_gem_object_has_pages(obj));
 
 	if (obj->ops->writeback)
@@ -194,17 +203,15 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
 int __i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
 {
 	struct sg_table *pages;
-	int err;
+
+	/* May be called by shrinker from within get_pages() (on another bo) */
+	assert_object_held(obj);
 
 	if (i915_gem_object_has_pinned_pages(obj))
 		return -EBUSY;
 
-	/* May be called by shrinker from within get_pages() (on another bo) */
-	mutex_lock(&obj->mm.lock);
-	if (unlikely(atomic_read(&obj->mm.pages_pin_count))) {
-		err = -EBUSY;
-		goto unlock;
-	}
+	if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
+		return -EBUSY;
 
 	i915_gem_object_release_mmap_offset(obj);
 
@@ -227,11 +234,7 @@ int __i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
 	if (!IS_ERR(pages))
 		obj->ops->put_pages(obj, pages);
 
-	err = 0;
-unlock:
-	mutex_unlock(&obj->mm.lock);
-
-	return err;
+	return 0;
 }
 
 static inline pte_t iomap_pte(resource_size_t base,
@@ -325,7 +328,7 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
 	if (!i915_gem_object_type_has(obj, flags))
 		return ERR_PTR(-ENXIO);
 
-	err = mutex_lock_interruptible_nested(&obj->mm.lock, I915_MM_GET_PAGES);
+	err = i915_gem_object_lock_interruptible(obj);
 	if (err)
 		return ERR_PTR(err);
 
@@ -370,7 +373,7 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
 	}
 
 out_unlock:
-	mutex_unlock(&obj->mm.lock);
+	i915_gem_object_unlock(obj);
 	return ptr;
 
 err_unpin:
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_phys.c b/drivers/gpu/drm/i915/gem/i915_gem_phys.c
index 28147aab47b9..099bcfa8f978 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_phys.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_phys.c
@@ -165,7 +165,7 @@ int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
 	if (err)
 		return err;
 
-	mutex_lock_nested(&obj->mm.lock, I915_MM_GET_PAGES);
+	i915_gem_object_lock(obj);
 
 	if (obj->mm.madv != I915_MADV_WILLNEED) {
 		err = -EFAULT;
@@ -198,7 +198,7 @@ int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
 
 	i915_gem_object_release_memory_region(obj);
 
-	mutex_unlock(&obj->mm.lock);
+	i915_gem_object_unlock(obj);
 	return 0;
 
 err_xfer:
@@ -209,7 +209,7 @@ int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
 		__i915_gem_object_set_pages(obj, pages, sg_page_sizes);
 	}
 err_unlock:
-	mutex_unlock(&obj->mm.lock);
+	i915_gem_object_unlock(obj);
 	return err;
 }
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
index 5b65ce738b16..5fe2c1dd82ee 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
@@ -45,10 +45,7 @@ static bool unsafe_drop_pages(struct drm_i915_gem_object *obj,
 	if (!(shrink & I915_SHRINK_BOUND))
 		flags = I915_GEM_OBJECT_UNBIND_TEST;
 
-	if (i915_gem_object_unbind(obj, flags) == 0)
-		__i915_gem_object_put_pages(obj);
-
-	return !i915_gem_object_has_pages(obj);
+	return i915_gem_object_unbind(obj, flags) == 0;
 }
 
 static void try_to_writeback(struct drm_i915_gem_object *obj,
@@ -192,14 +189,14 @@ i915_gem_shrink(struct drm_i915_private *i915,
 
 			spin_unlock_irqrestore(&i915->mm.obj_lock, flags);
 
-			if (unsafe_drop_pages(obj, shrink)) {
-				/* May arrive from get_pages on another bo */
-				mutex_lock(&obj->mm.lock);
+			if (unsafe_drop_pages(obj, shrink) &&
+			    i915_gem_object_trylock(obj)) {
+				__i915_gem_object_put_pages(obj);
 				if (!i915_gem_object_has_pages(obj)) {
 					try_to_writeback(obj, shrink);
 					count += obj->base.size >> PAGE_SHIFT;
 				}
-				mutex_unlock(&obj->mm.lock);
+				i915_gem_object_unlock(obj);
 			}
 
 			scanned += obj->base.size >> PAGE_SHIFT;
@@ -415,7 +412,7 @@ void i915_gem_shrinker_taints_mutex(struct drm_i915_private *i915,
 
 	if (!lockdep_is_held_type(&i915->drm.struct_mutex, -1)) {
 		mutex_acquire(&i915->drm.struct_mutex.dep_map,
-			      I915_MM_NORMAL, 0, _RET_IP_);
+			      0, 0, _RET_IP_);
 		unlock = true;
 	}
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_tiling.c b/drivers/gpu/drm/i915/gem/i915_gem_tiling.c
index 0158e49bf9bb..a5a272c2e43b 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_tiling.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_tiling.c
@@ -265,7 +265,6 @@ i915_gem_object_set_tiling(struct drm_i915_gem_object *obj,
 	 * pages to prevent them being swapped out and causing corruption
 	 * due to the change in swizzling.
 	 */
-	mutex_lock(&obj->mm.lock);
 	if (i915_gem_object_has_pages(obj) &&
 	    obj->mm.madv == I915_MADV_WILLNEED &&
 	    i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
@@ -280,7 +279,6 @@ i915_gem_object_set_tiling(struct drm_i915_gem_object *obj,
 			obj->mm.quirked = true;
 		}
 	}
-	mutex_unlock(&obj->mm.lock);
 
 	spin_lock(&obj->vma.lock);
 	for_each_ggtt_vma(vma, obj) {
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index e946032b13e4..4cd79f425eac 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -129,8 +129,15 @@ userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 		ret = i915_gem_object_unbind(obj,
 					     I915_GEM_OBJECT_UNBIND_ACTIVE |
 					     I915_GEM_OBJECT_UNBIND_BARRIER);
-		if (ret == 0)
-			ret = __i915_gem_object_put_pages(obj);
+		if (ret == 0) {
+			/* XXX ww_mutex is fs_reclaim tainted */
+			if (i915_gem_object_trylock(obj)) {
+				ret = __i915_gem_object_put_pages(obj);
+				i915_gem_object_unlock(obj);
+			} else {
+				ret = -EAGAIN;
+			}
+		}
 		i915_gem_object_put(obj);
 		if (ret)
 			return ret;
@@ -485,7 +492,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
 		}
 	}
 
-	mutex_lock_nested(&obj->mm.lock, I915_MM_GET_PAGES);
+	i915_gem_object_lock(obj);
 	if (obj->userptr.work == &work->work) {
 		struct sg_table *pages = ERR_PTR(ret);
 
@@ -502,7 +509,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
 		if (IS_ERR(pages))
 			__i915_gem_userptr_set_active(obj, false);
 	}
-	mutex_unlock(&obj->mm.lock);
+	i915_gem_object_unlock(obj);
 
 	unpin_user_pages(pvec, pinned);
 	kvfree(pvec);
diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
index 8291ede6902c..6ff7c402556e 100644
--- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
+++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
@@ -452,6 +452,15 @@ static int igt_mock_exhaust_device_supported_pages(void *arg)
 	return err;
 }
 
+static void close_object(struct drm_i915_gem_object *obj)
+{
+	i915_gem_object_lock(obj);
+	__i915_gem_object_put_pages(obj);
+	i915_gem_object_unlock(obj);
+
+	i915_gem_object_put(obj);
+}
+
 static int igt_mock_memory_region_huge_pages(void *arg)
 {
 	const unsigned int flags[] = { 0, I915_BO_ALLOC_CONTIGUOUS };
@@ -514,8 +523,7 @@ static int igt_mock_memory_region_huge_pages(void *arg)
 			}
 
 			i915_vma_unpin(vma);
-			__i915_gem_object_put_pages(obj);
-			i915_gem_object_put(obj);
+			close_object(obj);
 		}
 	}
 
@@ -633,8 +641,7 @@ static int igt_mock_ppgtt_misaligned_dma(void *arg)
 		}
 
 		i915_gem_object_unpin_pages(obj);
-		__i915_gem_object_put_pages(obj);
-		i915_gem_object_put(obj);
+		close_object(obj);
 	}
 
 	return 0;
@@ -655,8 +662,7 @@ static void close_object_list(struct list_head *objects,
 	list_for_each_entry_safe(obj, on, objects, st_link) {
 		list_del(&obj->st_link);
 		i915_gem_object_unpin_pages(obj);
-		__i915_gem_object_put_pages(obj);
-		i915_gem_object_put(obj);
+		close_object(obj);
 	}
 }
 
@@ -923,8 +929,7 @@ static int igt_mock_ppgtt_64K(void *arg)
 
 			i915_vma_unpin(vma);
 			i915_gem_object_unpin_pages(obj);
-			__i915_gem_object_put_pages(obj);
-			i915_gem_object_put(obj);
+			close_object(obj);
 		}
 	}
 
@@ -964,9 +969,10 @@ __cpu_check_shmem(struct drm_i915_gem_object *obj, u32 dword, u32 val)
 	unsigned long n;
 	int err;
 
+	i915_gem_object_lock(obj);
 	err = i915_gem_object_prepare_read(obj, &needs_flush);
 	if (err)
-		return err;
+		goto unlock;
 
 	for (n = 0; n < obj->base.size >> PAGE_SHIFT; ++n) {
 		u32 *ptr = kmap_atomic(i915_gem_object_get_page(obj, n));
@@ -986,7 +992,8 @@ __cpu_check_shmem(struct drm_i915_gem_object *obj, u32 dword, u32 val)
 	}
 
 	i915_gem_object_finish_access(obj);
-
+unlock:
+	i915_gem_object_unlock(obj);
 	return err;
 }
 
@@ -1304,7 +1311,9 @@ static int igt_ppgtt_smoke_huge(void *arg)
 		}
 out_unpin:
 		i915_gem_object_unpin_pages(obj);
+		i915_gem_object_lock(obj);
 		__i915_gem_object_put_pages(obj);
+		i915_gem_object_unlock(obj);
 out_put:
 		i915_gem_object_put(obj);
 
@@ -1392,8 +1401,7 @@ static int igt_ppgtt_sanity_check(void *arg)
 			err = igt_write_huge(ctx, obj);
 
 			i915_gem_object_unpin_pages(obj);
-			__i915_gem_object_put_pages(obj);
-			i915_gem_object_put(obj);
+			close_object(obj);
 
 			if (err) {
 				pr_err("%s write-huge failed with size=%u pages=%u i=%d, j=%d\n",
diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_coherency.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_coherency.c
index 87d7d8aa080f..b8dd6fabe70a 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_coherency.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_coherency.c
@@ -27,9 +27,10 @@ static int cpu_set(struct context *ctx, unsigned long offset, u32 v)
 	u32 *cpu;
 	int err;
 
+	i915_gem_object_lock(ctx->obj);
 	err = i915_gem_object_prepare_write(ctx->obj, &needs_clflush);
 	if (err)
-		return err;
+		goto unlock;
 
 	page = i915_gem_object_get_page(ctx->obj, offset >> PAGE_SHIFT);
 	map = kmap_atomic(page);
@@ -46,7 +47,9 @@ static int cpu_set(struct context *ctx, unsigned long offset, u32 v)
 	kunmap_atomic(map);
 	i915_gem_object_finish_access(ctx->obj);
 
-	return 0;
+unlock:
+	i915_gem_object_unlock(ctx->obj);
+	return err;
 }
 
 static int cpu_get(struct context *ctx, unsigned long offset, u32 *v)
@@ -57,9 +60,10 @@ static int cpu_get(struct context *ctx, unsigned long offset, u32 *v)
 	u32 *cpu;
 	int err;
 
+	i915_gem_object_lock(ctx->obj);
 	err = i915_gem_object_prepare_read(ctx->obj, &needs_clflush);
 	if (err)
-		return err;
+		goto unlock;
 
 	page = i915_gem_object_get_page(ctx->obj, offset >> PAGE_SHIFT);
 	map = kmap_atomic(page);
@@ -73,7 +77,9 @@ static int cpu_get(struct context *ctx, unsigned long offset, u32 *v)
 	kunmap_atomic(map);
 	i915_gem_object_finish_access(ctx->obj);
 
-	return 0;
+unlock:
+	i915_gem_object_unlock(ctx->obj);
+	return err;
 }
 
 static int gtt_set(struct context *ctx, unsigned long offset, u32 v)
diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
index bb57687aea99..7e373a5b5c3c 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
@@ -461,9 +461,10 @@ static int cpu_fill(struct drm_i915_gem_object *obj, u32 value)
 	unsigned int n, m, need_flush;
 	int err;
 
+	i915_gem_object_lock(obj);
 	err = i915_gem_object_prepare_write(obj, &need_flush);
 	if (err)
-		return err;
+		goto unlock;
 
 	for (n = 0; n < real_page_count(obj); n++) {
 		u32 *map;
@@ -479,6 +480,8 @@ static int cpu_fill(struct drm_i915_gem_object *obj, u32 value)
 	i915_gem_object_finish_access(obj);
 	obj->read_domains = I915_GEM_DOMAIN_GTT | I915_GEM_DOMAIN_CPU;
 	obj->write_domain = 0;
+unlock:
+	i915_gem_object_unlock(obj);
 	return 0;
 }
 
@@ -488,9 +491,10 @@ static noinline int cpu_check(struct drm_i915_gem_object *obj,
 	unsigned int n, m, needs_flush;
 	int err;
 
+	i915_gem_object_lock(obj);
 	err = i915_gem_object_prepare_read(obj, &needs_flush);
 	if (err)
-		return err;
+		goto unlock;
 
 	for (n = 0; n < real_page_count(obj); n++) {
 		u32 *map;
@@ -527,6 +531,8 @@ static noinline int cpu_check(struct drm_i915_gem_object *obj,
 	}
 
 	i915_gem_object_finish_access(obj);
+unlock:
+	i915_gem_object_unlock(obj);
 	return err;
 }
 
diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c
index 9c7402ce5bf9..11f734fea3ab 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c
@@ -1297,7 +1297,9 @@ static int __igt_mmap_revoke(struct drm_i915_private *i915,
 	}
 
 	if (type != I915_MMAP_TYPE_GTT) {
+		i915_gem_object_lock(obj);
 		__i915_gem_object_put_pages(obj);
+		i915_gem_object_unlock(obj);
 		if (i915_gem_object_has_pages(obj)) {
 			pr_err("Failed to put-pages object!\n");
 			err = -EINVAL;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e998f25f30a3..0fbe438c4523 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -335,12 +335,16 @@ i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
 	u64 remain;
 	int ret;
 
+	i915_gem_object_lock(obj);
 	ret = i915_gem_object_prepare_read(obj, &needs_clflush);
-	if (ret)
+	if (ret) {
+		i915_gem_object_unlock(obj);
 		return ret;
+	}
 
 	fence = i915_gem_object_lock_fence(obj);
 	i915_gem_object_finish_access(obj);
+	i915_gem_object_unlock(obj);
 	if (!fence)
 		return -ENOMEM;
 
@@ -734,12 +738,16 @@ i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
 	u64 remain;
 	int ret;
 
+	i915_gem_object_lock(obj);
 	ret = i915_gem_object_prepare_write(obj, &needs_clflush);
-	if (ret)
+	if (ret) {
+		i915_gem_object_unlock(obj);
 		return ret;
+	}
 
 	fence = i915_gem_object_lock_fence(obj);
 	i915_gem_object_finish_access(obj);
+	i915_gem_object_unlock(obj);
 	if (!fence)
 		return -ENOMEM;
 
@@ -1063,7 +1071,7 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
 	if (!obj)
 		return -ENOENT;
 
-	err = mutex_lock_interruptible(&obj->mm.lock);
+	err = i915_gem_object_lock_interruptible(obj);
 	if (err)
 		goto out;
 
@@ -1109,7 +1117,7 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
 		i915_gem_object_truncate(obj);
 
 	args->retained = obj->mm.madv != __I915_MADV_PURGED;
-	mutex_unlock(&obj->mm.lock);
+	i915_gem_object_unlock(obj);
 
 out:
 	i915_gem_object_put(obj);
diff --git a/drivers/gpu/drm/i915/selftests/intel_memory_region.c b/drivers/gpu/drm/i915/selftests/intel_memory_region.c
index 6e80d99048e4..8d9fdf591514 100644
--- a/drivers/gpu/drm/i915/selftests/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/selftests/intel_memory_region.c
@@ -24,6 +24,15 @@
 #include "selftests/igt_flush_test.h"
 #include "selftests/i915_random.h"
 
+static void close_object(struct drm_i915_gem_object *obj)
+{
+	i915_gem_object_lock(obj);
+	__i915_gem_object_put_pages(obj);
+	i915_gem_object_unlock(obj);
+
+	i915_gem_object_put(obj);
+}
+
 static void close_objects(struct intel_memory_region *mem,
 			  struct list_head *objects)
 {
@@ -33,10 +42,9 @@ static void close_objects(struct intel_memory_region *mem,
 	list_for_each_entry_safe(obj, on, objects, st_link) {
 		if (i915_gem_object_has_pinned_pages(obj))
 			i915_gem_object_unpin_pages(obj);
-		/* No polluting the memory region between tests */
-		__i915_gem_object_put_pages(obj);
 		list_del(&obj->st_link);
-		i915_gem_object_put(obj);
+		/* No polluting the memory region between tests */
+		close_object(obj);
 	}
 
 	cond_resched();
@@ -124,9 +132,8 @@ igt_object_create(struct intel_memory_region *mem,
 static void igt_object_release(struct drm_i915_gem_object *obj)
 {
 	i915_gem_object_unpin_pages(obj);
-	__i915_gem_object_put_pages(obj);
 	list_del(&obj->st_link);
-	i915_gem_object_put(obj);
+	close_object(obj);
 }
 
 static int igt_mock_contiguous(void *arg)
-- 
2.20.1


* [Intel-gfx] [PATCH 2/7] drm/i915: Reuse the reservation_ww_class for acquiring vma backing storage
  2020-06-22  9:59 [Intel-gfx] [PATCH 1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class Chris Wilson
@ 2020-06-22  9:59 ` Chris Wilson
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 3/7] drm/i915/gem: Track the fences for object allocations Chris Wilson
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 48+ messages in thread
From: Chris Wilson @ 2020-06-22  9:59 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

i915_vma_pin() remains as a convenience function to grab a single range
of address space on the GPU; it must not only acquire the backing
storage of the associated buffer/pages, but must also acquire backing
storage for the page directory. As such, it will eventually want to
receive those allocations, but in the meantime, convert the custom
vma->pages_mutex to reuse the reservation_ww_class from the associated
backing store (the object, for now).
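
Condensed from the hunks below (not verbatim; the ordering of the page
pin versus the bind differs slightly between the execbuf and vma paths),
the bind step ends up looking roughly like:

        int err = 0;

        if (!vma->pages)
                err = vma->ops->set_pages(vma);
        if (err == 0)
                err = vma->ops->bind_vma(vma, cache_level, bind_flags);
        if (err == 0) {
                /* one page pin per freshly bound level (GGTT/PPGTT) */
                unsigned int count = hweight32(bind_flags & I915_VMA_BIND_MASK);

                while (count--)
                        __i915_gem_object_pin_pages(vma->obj);
        }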

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |  21 +-
 drivers/gpu/drm/i915/gt/gen6_ppgtt.c          |   2 -
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c          |   1 -
 drivers/gpu/drm/i915/gt/intel_ggtt.c          |  23 +--
 drivers/gpu/drm/i915/gt/intel_gtt.h           |   2 -
 drivers/gpu/drm/i915/gt/intel_ppgtt.c         |   3 +-
 drivers/gpu/drm/i915/i915_vma.c               | 189 ++++++------------
 drivers/gpu/drm/i915/i915_vma.h               |   3 -
 drivers/gpu/drm/i915/i915_vma_types.h         |   7 -
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |   4 +-
 drivers/gpu/drm/i915/selftests/mock_gtt.c     |   1 -
 11 files changed, 92 insertions(+), 164 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index f7f34954a920..678e7f82f6c9 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -332,7 +332,7 @@ static inline void eb_unreserve_vma(struct eb_vma *ev)
 		__i915_vma_unpin(vma);
 
 	if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
-		i915_vma_put_pages(vma);
+		i915_gem_object_unpin_pages(vma->obj);
 
 	ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
 		       __EXEC_OBJECT_HAS_FENCE |
@@ -1095,7 +1095,6 @@ static int eb_reserve_vma(struct eb_vm_work *work, struct eb_vma *ev)
 		if (unlikely(err))
 			return err;
 
-		atomic_add(I915_VMA_PAGES_ACTIVE, &vma->pages_count);
 		atomic_or(bind_flags, &vma->flags);
 
 		if (i915_vma_is_ggtt(vma))
@@ -1160,6 +1159,13 @@ static int wait_for_timeline(struct intel_timeline *tl)
 	} while (1);
 }
 
+static void eb_pin_vma_pages(struct i915_vma *vma, unsigned int count)
+{
+	count = hweight32(count);
+	while (count--)
+		__i915_gem_object_pin_pages(vma->obj);
+}
+
 static int __eb_bind_vma(struct eb_vm_work *work, int err)
 {
 	struct i915_address_space *vm = work->vm;
@@ -1201,12 +1207,15 @@ static int __eb_bind_vma(struct eb_vm_work *work, int err)
 		GEM_BUG_ON(vma->vm != vm);
 		GEM_BUG_ON(!i915_vma_is_active(vma));
 
+		if (!vma->pages)
+			err = vma->ops->set_pages(vma);
 		if (err == 0)
 			err = vma->ops->bind_vma(vma,
 						 vma->obj->cache_level,
-						 ev->bind_flags |
-						 I915_VMA_ALLOC);
-		if (err)
+						 ev->bind_flags);
+		if (err == 0)
+			eb_pin_vma_pages(vma, ev->bind_flags);
+		else
 			atomic_and(~ev->bind_flags, &vma->flags);
 
 		if (drm_mm_node_allocated(&ev->hole)) {
@@ -1318,7 +1327,7 @@ static int eb_prepare_vma(struct eb_vma *ev)
 	ev->bind_flags = 0;
 
 	if (!(ev->flags &  __EXEC_OBJECT_HAS_PAGES)) {
-		err = i915_vma_get_pages(vma);
+		err = i915_gem_object_pin_pages(vma->obj);
 		if (err)
 			return err;
 
diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c
index 2c5ac598ade2..34f66a9ccf2d 100644
--- a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c
@@ -360,7 +360,6 @@ static struct i915_vma *pd_vma_create(struct gen6_ppgtt *ppgtt, int size)
 	i915_active_init(&vma->active, NULL, NULL);
 
 	kref_init(&vma->ref);
-	mutex_init(&vma->pages_mutex);
 	vma->vm = i915_vm_get(&ggtt->vm);
 	vma->ops = &pd_vma_ops;
 	vma->private = ppgtt;
@@ -447,7 +446,6 @@ struct i915_ppgtt *gen6_ppgtt_create(struct intel_gt *gt)
 	ppgtt_init(&ppgtt->base, gt);
 	ppgtt->base.vm.top = 1;
 
-	ppgtt->base.vm.bind_async_flags = I915_VMA_LOCAL_BIND;
 	ppgtt->base.vm.allocate_va_range = gen6_alloc_va_range;
 	ppgtt->base.vm.clear_range = gen6_ppgtt_clear_range;
 	ppgtt->base.vm.insert_entries = gen6_ppgtt_insert_entries;
diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index 699125928272..1aea30238aa4 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -737,7 +737,6 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt)
 			goto err_free_pd;
 	}
 
-	ppgtt->vm.bind_async_flags = I915_VMA_LOCAL_BIND;
 	ppgtt->vm.insert_entries = gen8_ppgtt_insert;
 	ppgtt->vm.allocate_va_range = gen8_ppgtt_alloc;
 	ppgtt->vm.clear_range = gen8_ppgtt_clear;
diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index eaacf369d304..a7160e142c1c 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -582,8 +582,7 @@ static int aliasing_gtt_bind_vma(struct i915_vma *vma,
 	if (flags & I915_VMA_LOCAL_BIND) {
 		struct i915_ppgtt *alias = i915_vm_to_ggtt(vma->vm)->alias;
 
-		if (flags & I915_VMA_ALLOC &&
-		    !test_bit(I915_VMA_ALLOC_BIT, __i915_vma_flags(vma))) {
+		if (!test_bit(I915_VMA_ALLOC_BIT, __i915_vma_flags(vma))) {
 			ret = alias->vm.allocate_va_range(&alias->vm,
 							  vma->node.start,
 							  vma->size);
@@ -646,7 +645,6 @@ static int init_aliasing_ppgtt(struct i915_ggtt *ggtt)
 		goto err_ppgtt;
 
 	ggtt->alias = ppgtt;
-	ggtt->vm.bind_async_flags |= ppgtt->vm.bind_async_flags;
 
 	GEM_BUG_ON(ggtt->vm.vma_ops.bind_vma != ggtt_bind_vma);
 	ggtt->vm.vma_ops.bind_vma = aliasing_gtt_bind_vma;
@@ -882,8 +880,6 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)
 	    IS_CHERRYVIEW(i915) /* fails with concurrent use/update */) {
 		ggtt->vm.insert_entries = bxt_vtd_ggtt_insert_entries__BKL;
 		ggtt->vm.insert_page    = bxt_vtd_ggtt_insert_page__BKL;
-		ggtt->vm.bind_async_flags =
-			I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND;
 	}
 
 	ggtt->invalidate = gen8_ggtt_invalidate;
@@ -1181,11 +1177,6 @@ void i915_ggtt_disable_guc(struct i915_ggtt *ggtt)
 	ggtt->invalidate(ggtt);
 }
 
-static unsigned int clear_bind(struct i915_vma *vma)
-{
-	return atomic_fetch_and(~I915_VMA_BIND_MASK, &vma->flags);
-}
-
 void i915_ggtt_resume(struct i915_ggtt *ggtt)
 {
 	struct i915_vma *vma;
@@ -1203,11 +1194,13 @@ void i915_ggtt_resume(struct i915_ggtt *ggtt)
 	/* clflush objects bound into the GGTT and rebind them. */
 	list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link) {
 		struct drm_i915_gem_object *obj = vma->obj;
-		unsigned int was_bound = clear_bind(vma);
+		unsigned int was_bound =
+			atomic_read(&vma->flags) & I915_VMA_BIND_MASK;
 
-		WARN_ON(i915_vma_bind(vma,
-				      obj ? obj->cache_level : 0,
-				      was_bound, NULL));
+		GEM_BUG_ON(!was_bound);
+		GEM_WARN_ON(vma->ops->bind_vma(vma,
+					       obj ? obj->cache_level : 0,
+					       was_bound));
 		if (obj) { /* only used during resume => exclusive access */
 			flush |= fetch_and_zero(&obj->write_domain);
 			obj->read_domains |= I915_GEM_DOMAIN_GTT;
@@ -1448,7 +1441,7 @@ i915_get_ggtt_vma_pages(struct i915_vma *vma)
 	 * must be the vma->pages. A simple rule is that vma->pages must only
 	 * be accessed when the obj->mm.pages are pinned.
 	 */
-	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(vma->obj));
+	GEM_BUG_ON(!i915_gem_object_has_pages(vma->obj));
 
 	switch (vma->ggtt_view.type) {
 	default:
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h
index 773fc76dfa1b..5cbaf55e4941 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.h
@@ -239,8 +239,6 @@ struct i915_address_space {
 	u64 total;		/* size addr space maps (ex. 2GB for ggtt) */
 	u64 reserved;		/* size addr space reserved */
 
-	unsigned int bind_async_flags;
-
 	/*
 	 * Each active user context has its own address space (in full-ppgtt).
 	 * Since the vm may be shared between multiple contexts, we count how
diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
index ecdd58f4b993..fd1abdee2b27 100644
--- a/drivers/gpu/drm/i915/gt/intel_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
@@ -162,8 +162,7 @@ static int ppgtt_bind_vma(struct i915_vma *vma,
 	u32 pte_flags;
 	int err;
 
-	if (flags & I915_VMA_ALLOC &&
-	    !test_bit(I915_VMA_ALLOC_BIT, __i915_vma_flags(vma))) {
+	if (!test_bit(I915_VMA_ALLOC_BIT, __i915_vma_flags(vma))) {
 		err = vma->vm->allocate_va_range(vma->vm,
 						 vma->node.start, vma->size);
 		if (err)
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index efb9eacf59b9..dc656c7d3191 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -115,7 +115,6 @@ vma_create(struct drm_i915_gem_object *obj,
 		return ERR_PTR(-ENOMEM);
 
 	kref_init(&vma->ref);
-	mutex_init(&vma->pages_mutex);
 	vma->vm = i915_vm_get(vm);
 	vma->ops = &vm->vma_ops;
 	vma->obj = obj;
@@ -302,13 +301,27 @@ struct i915_vma_work {
 	unsigned int flags;
 };
 
+static void pin_pages(struct i915_vma *vma, unsigned int bind)
+{
+	bind = hweight32(bind & I915_VMA_BIND_MASK);
+	while (bind--)
+		__i915_gem_object_pin_pages(vma->obj);
+}
+
 static int __vma_bind(struct dma_fence_work *work)
 {
 	struct i915_vma_work *vw = container_of(work, typeof(*vw), base);
 	struct i915_vma *vma = vw->vma;
 	int err;
 
-	err = vma->ops->bind_vma(vma, vw->cache_level, vw->flags);
+	if (vma->obj) /* fixup the pin-pages for bind-flags */
+		pin_pages(vma, vw->flags);
+
+	err = 0;
+	if (!vma->pages)
+		err = vma->ops->set_pages(vma);
+	if (err == 0)
+		err = vma->ops->bind_vma(vma, vw->cache_level, vw->flags);
 	if (err)
 		atomic_or(I915_VMA_ERROR, &vma->flags);
 
@@ -390,9 +403,9 @@ int i915_vma_bind(struct i915_vma *vma,
 		  u32 flags,
 		  struct i915_vma_work *work)
 {
+	struct dma_fence *prev;
 	u32 bind_flags;
 	u32 vma_flags;
-	int ret;
 
 	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
 	GEM_BUG_ON(vma->size > vma->node.size);
@@ -415,43 +428,34 @@ int i915_vma_bind(struct i915_vma *vma,
 	if (bind_flags == 0)
 		return 0;
 
-	GEM_BUG_ON(!vma->pages);
-
 	trace_i915_vma_bind(vma, bind_flags);
-	if (work && bind_flags & vma->vm->bind_async_flags) {
-		struct dma_fence *prev;
 
-		work->vma = vma;
-		work->cache_level = cache_level;
-		work->flags = bind_flags | I915_VMA_ALLOC;
+	work->vma = vma;
+	work->cache_level = cache_level;
+	work->flags = bind_flags;
 
-		/*
-		 * Note we only want to chain up to the migration fence on
-		 * the pages (not the object itself). As we don't track that,
-		 * yet, we have to use the exclusive fence instead.
-		 *
-		 * Also note that we do not want to track the async vma as
-		 * part of the obj->resv->excl_fence as it only affects
-		 * execution and not content or object's backing store lifetime.
-		 */
-		prev = i915_active_set_exclusive(&vma->active, &work->base.dma);
-		if (prev) {
-			__i915_sw_fence_await_dma_fence(&work->base.chain,
-							prev,
-							&work->cb);
-			dma_fence_put(prev);
-		}
+	/*
+	 * Note we only want to chain up to the migration fence on
+	 * the pages (not the object itself). As we don't track that,
+	 * yet, we have to use the exclusive fence instead.
+	 *
+	 * Also note that we do not want to track the async vma as
+	 * part of the obj->resv->excl_fence as it only affects
+	 * execution and not content or object's backing store lifetime.
+	 */
+	prev = i915_active_set_exclusive(&vma->active, &work->base.dma);
+	if (prev) {
+		__i915_sw_fence_await_dma_fence(&work->base.chain,
+						prev,
+						&work->cb);
+		dma_fence_put(prev);
+	}
 
-		work->base.dma.error = 0; /* enable the queue_work() */
+	work->base.dma.error = 0; /* enable the queue_work() */
 
-		if (vma->obj) {
-			__i915_gem_object_pin_pages(vma->obj);
-			work->pinned = vma->obj;
-		}
-	} else {
-		ret = vma->ops->bind_vma(vma, cache_level, bind_flags);
-		if (ret)
-			return ret;
+	if (vma->obj) {
+		__i915_gem_object_pin_pages(vma->obj);
+		work->pinned = vma->obj;
 	}
 
 	atomic_or(bind_flags, &vma->flags);
@@ -694,6 +698,9 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 		if (ret)
 			return ret;
 	} else {
+		const unsigned long page_sizes =
+			INTEL_INFO(vma->vm->i915)->page_sizes;
+
 		/*
 		 * We only support huge gtt pages through the 48b PPGTT,
 		 * however we also don't want to force any alignment for
@@ -703,7 +710,7 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 		 * forseeable future. See also i915_ggtt_offset().
 		 */
 		if (upper_32_bits(end - 1) &&
-		    vma->page_sizes.sg > I915_GTT_PAGE_SIZE) {
+		    page_sizes > I915_GTT_PAGE_SIZE) {
 			/*
 			 * We can't mix 64K and 4K PTEs in the same page-table
 			 * (2M block), and so to avoid the ugliness and
@@ -711,7 +718,7 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 			 * objects to 2M.
 			 */
 			u64 page_alignment =
-				rounddown_pow_of_two(vma->page_sizes.sg |
+				rounddown_pow_of_two(page_sizes |
 						     I915_GTT_PAGE_SIZE_2M);
 
 			/*
@@ -723,7 +730,7 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 
 			alignment = max(alignment, page_alignment);
 
-			if (vma->page_sizes.sg & I915_GTT_PAGE_SIZE_64K)
+			if (page_sizes & I915_GTT_PAGE_SIZE_64K)
 				size = round_up(size, I915_GTT_PAGE_SIZE_2M);
 		}
 
@@ -798,74 +805,6 @@ static bool try_qad_pin(struct i915_vma *vma, unsigned int flags)
 	return pinned;
 }
 
-int i915_vma_get_pages(struct i915_vma *vma)
-{
-	int err = 0;
-
-	if (atomic_add_unless(&vma->pages_count, 1, 0))
-		return 0;
-
-	/* Allocations ahoy! */
-	if (mutex_lock_interruptible(&vma->pages_mutex))
-		return -EINTR;
-
-	if (!atomic_read(&vma->pages_count)) {
-		if (vma->obj) {
-			err = i915_gem_object_pin_pages(vma->obj);
-			if (err)
-				goto unlock;
-		}
-
-		err = vma->ops->set_pages(vma);
-		if (err) {
-			if (vma->obj)
-				i915_gem_object_unpin_pages(vma->obj);
-			goto unlock;
-		}
-	}
-	atomic_inc(&vma->pages_count);
-
-unlock:
-	mutex_unlock(&vma->pages_mutex);
-
-	return err;
-}
-
-static void __vma_put_pages(struct i915_vma *vma, unsigned int count)
-{
-	/* We allocate under vma_get_pages, so beware the shrinker */
-	mutex_lock_nested(&vma->pages_mutex, SINGLE_DEPTH_NESTING);
-	GEM_BUG_ON(atomic_read(&vma->pages_count) < count);
-	if (atomic_sub_return(count, &vma->pages_count) == 0) {
-		vma->ops->clear_pages(vma);
-		GEM_BUG_ON(vma->pages);
-		if (vma->obj)
-			i915_gem_object_unpin_pages(vma->obj);
-	}
-	mutex_unlock(&vma->pages_mutex);
-}
-
-void i915_vma_put_pages(struct i915_vma *vma)
-{
-	if (atomic_add_unless(&vma->pages_count, -1, 1))
-		return;
-
-	__vma_put_pages(vma, 1);
-}
-
-static void vma_unbind_pages(struct i915_vma *vma)
-{
-	unsigned int count;
-
-	lockdep_assert_held(&vma->vm->mutex);
-
-	/* The upper portion of pages_count is the number of bindings */
-	count = atomic_read(&vma->pages_count);
-	count >>= I915_VMA_PAGES_BIAS;
-	if (count)
-		__vma_put_pages(vma, count | count << I915_VMA_PAGES_BIAS);
-}
-
 static int __wait_for_unbind(struct i915_vma *vma, unsigned int flags)
 {
 	return __i915_vma_wait_excl(vma, false, flags);
@@ -887,20 +826,20 @@ int i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 	if (try_qad_pin(vma, flags & I915_VMA_BIND_MASK))
 		return 0;
 
-	err = i915_vma_get_pages(vma);
-	if (err)
-		return err;
+	if (vma->obj) {
+		err = i915_gem_object_pin_pages(vma->obj);
+		if (err)
+			return err;
+	}
 
 	err = __wait_for_unbind(vma, flags);
 	if (err)
 		goto err_pages;
 
-	if (flags & vma->vm->bind_async_flags) {
-		work = i915_vma_work();
-		if (!work) {
-			err = -ENOMEM;
-			goto err_pages;
-		}
+	work = i915_vma_work();
+	if (!work) {
+		err = -ENOMEM;
+		goto err_pages;
 	}
 
 	if (flags & PIN_GLOBAL)
@@ -968,16 +907,12 @@ int i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 			__i915_vma_set_map_and_fenceable(vma);
 	}
 
-	GEM_BUG_ON(!vma->pages);
 	err = i915_vma_bind(vma,
 			    vma->obj ? vma->obj->cache_level : 0,
 			    flags, work);
 	if (err)
 		goto err_remove;
 
-	/* There should only be at most 2 active bindings (user, global) */
-	GEM_BUG_ON(bound + I915_VMA_PAGES_ACTIVE < bound);
-	atomic_add(I915_VMA_PAGES_ACTIVE, &vma->pages_count);
 	list_move_tail(&vma->vm_link, &vma->vm->bound_list);
 	GEM_BUG_ON(!i915_vma_is_active(vma));
 
@@ -996,12 +931,12 @@ int i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 err_unlock:
 	mutex_unlock(&vma->vm->mutex);
 err_fence:
-	if (work)
-		dma_fence_work_commit_imm(&work->base);
+	dma_fence_work_commit_imm(&work->base);
 	if (wakeref)
 		intel_runtime_pm_put(&vma->vm->i915->runtime_pm, wakeref);
 err_pages:
-	i915_vma_put_pages(vma);
+	if (vma->obj)
+		i915_gem_object_unpin_pages(vma->obj);
 	return err;
 }
 
@@ -1257,6 +1192,8 @@ int i915_vma_move_to_active(struct i915_vma *vma,
 
 void __i915_vma_evict(struct i915_vma *vma)
 {
+	int count;
+
 	GEM_BUG_ON(i915_vma_is_pinned(vma));
 
 	if (i915_vma_is_map_and_fenceable(vma)) {
@@ -1291,11 +1228,17 @@ void __i915_vma_evict(struct i915_vma *vma)
 		trace_i915_vma_unbind(vma);
 		vma->ops->unbind_vma(vma);
 	}
+	count = hweight32(atomic_read(&vma->flags) & I915_VMA_BIND_MASK);
 	atomic_and(~(I915_VMA_BIND_MASK | I915_VMA_ERROR | I915_VMA_GGTT_WRITE),
 		   &vma->flags);
 
 	i915_vma_detach(vma);
-	vma_unbind_pages(vma);
+
+	vma->ops->clear_pages(vma);
+	if (vma->obj) {
+		while (count--)
+			__i915_gem_object_unpin_pages(vma->obj);
+	}
 }
 
 int __i915_vma_unbind(struct i915_vma *vma)
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 478e8679f331..8ec204817986 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -240,9 +240,6 @@ int __must_check
 i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags);
 int i915_ggtt_pin(struct i915_vma *vma, u32 align, unsigned int flags);
 
-int i915_vma_get_pages(struct i915_vma *vma);
-void i915_vma_put_pages(struct i915_vma *vma);
-
 static inline int i915_vma_pin_count(const struct i915_vma *vma)
 {
 	return atomic_read(&vma->flags) & I915_VMA_PIN_MASK;
diff --git a/drivers/gpu/drm/i915/i915_vma_types.h b/drivers/gpu/drm/i915/i915_vma_types.h
index 63831cdb7402..02c1640bb034 100644
--- a/drivers/gpu/drm/i915/i915_vma_types.h
+++ b/drivers/gpu/drm/i915/i915_vma_types.h
@@ -235,7 +235,6 @@ struct i915_vma {
 #define I915_VMA_BIND_MASK (I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND)
 
 #define I915_VMA_ALLOC_BIT	12
-#define I915_VMA_ALLOC		((int)BIT(I915_VMA_ALLOC_BIT))
 
 #define I915_VMA_ERROR_BIT	13
 #define I915_VMA_ERROR		((int)BIT(I915_VMA_ERROR_BIT))
@@ -252,11 +251,6 @@ struct i915_vma {
 
 	struct i915_active active;
 
-#define I915_VMA_PAGES_BIAS 24
-#define I915_VMA_PAGES_ACTIVE (BIT(24) | 1)
-	atomic_t pages_count; /* number of active binds to the pages */
-	struct mutex pages_mutex; /* protect acquire/release of backing pages */
-
 	/**
 	 * Support different GGTT views into the same object.
 	 * This means there can be multiple VMA mappings per object and per VM.
@@ -280,4 +274,3 @@ struct i915_vma {
 };
 
 #endif
-
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
index 0016ffc7d914..e840093e205f 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
@@ -1221,9 +1221,9 @@ static void track_vma_bind(struct i915_vma *vma)
 	__i915_gem_object_pin_pages(obj);
 
 	GEM_BUG_ON(vma->pages);
-	atomic_set(&vma->pages_count, I915_VMA_PAGES_ACTIVE);
-	__i915_gem_object_pin_pages(obj);
 	vma->pages = obj->mm.pages;
+	__i915_gem_object_pin_pages(obj);
+	atomic_or(I915_VMA_GLOBAL_BIND, &vma->flags);
 
 	mutex_lock(&vma->vm->mutex);
 	list_add_tail(&vma->vm_link, &vma->vm->bound_list);
diff --git a/drivers/gpu/drm/i915/selftests/mock_gtt.c b/drivers/gpu/drm/i915/selftests/mock_gtt.c
index edc5e3dda8ca..54825a322852 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gtt.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gtt.c
@@ -92,7 +92,6 @@ static int mock_bind_ggtt(struct i915_vma *vma,
 			  enum i915_cache_level cache_level,
 			  u32 flags)
 {
-	atomic_or(I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND, &vma->flags);
 	return 0;
 }
 
-- 
2.20.1


* [Intel-gfx] [PATCH 3/7] drm/i915/gem: Track the fences for object allocations
  2020-06-22  9:59 [Intel-gfx] [PATCH 1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class Chris Wilson
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 2/7] drm/i915: Reuse the reservation_ww_class for acquiring vma backing storage Chris Wilson
@ 2020-06-22  9:59 ` Chris Wilson
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 4/7] drm/i915: Update vma to use async page allocations Chris Wilson
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 48+ messages in thread
From: Chris Wilson @ 2020-06-22  9:59 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

While the allocation fence is often part of the implicit fences used for
accessing the object, we also want to identify the page-allocation fence
individually, as different stages of the pipeline will only want to be
queued on the pages.
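
For a consumer that only cares about the backing store, the wait then
reduces to the helper added below (sketched here with a made-up name):

        static int wait_for_backing_store(struct drm_i915_gem_object *obj)
        {
                int err;

                err = i915_active_wait_for_exclusive(&obj->mm.active);
                if (err)
                        return err;

                /* the async get_pages may have failed; the error lives in mm.pages */
                return IS_ERR(obj->mm.pages) ? PTR_ERR(obj->mm.pages) : 0;
        }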

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gem/i915_gem_object.c    | 23 +++++++++
 .../gpu/drm/i915/gem/i915_gem_object_types.h  |  2 +
 drivers/gpu/drm/i915/gem/i915_gem_pages.c     | 50 ++++++++++++++++---
 drivers/gpu/drm/i915/i915_active.c            | 18 +++++++
 drivers/gpu/drm/i915/i915_active.h            |  7 +++
 5 files changed, 93 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index 37b3fb0eb943..b7fc0da239f5 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -26,6 +26,7 @@
 
 #include "display/intel_frontbuffer.h"
 #include "gt/intel_gt.h"
+#include "i915_active.h"
 #include "i915_drv.h"
 #include "i915_gem_clflush.h"
 #include "i915_gem_context.h"
@@ -49,6 +50,24 @@ void i915_gem_object_free(struct drm_i915_gem_object *obj)
 	return kmem_cache_free(global.slab_objects, obj);
 }
 
+static int i915_mm_active(struct i915_active *ref)
+{
+	struct drm_i915_gem_object *obj =
+		container_of(ref, typeof(*obj), mm.active);
+
+	i915_gem_object_get(obj);
+	return 0;
+}
+
+__i915_active_call
+static void i915_mm_retire(struct i915_active *ref)
+{
+	struct drm_i915_gem_object *obj =
+		container_of(ref, typeof(*obj), mm.active);
+
+	i915_gem_object_put(obj);
+}
+
 void i915_gem_object_init(struct drm_i915_gem_object *obj,
 			  const struct drm_i915_gem_object_ops *ops,
 			  struct lock_class_key *key)
@@ -70,6 +89,8 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 	obj->mm.madv = I915_MADV_WILLNEED;
 	INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
 	mutex_init(&obj->mm.get_page.lock);
+
+	i915_active_init(&obj->mm.active, i915_mm_active, i915_mm_retire);
 }
 
 /**
@@ -149,6 +170,8 @@ static void __i915_gem_free_object_rcu(struct rcu_head *head)
 		container_of(head, typeof(*obj), rcu);
 	struct drm_i915_private *i915 = to_i915(obj->base.dev);
 
+	i915_active_fini(&obj->mm.active);
+
 	dma_resv_fini(&obj->base._resv);
 	i915_gem_object_free(obj);
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index dbb33aac7828..668b249fd109 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -189,6 +189,8 @@ struct drm_i915_gem_object {
 		atomic_t pages_pin_count;
 		atomic_t shrink_pin;
 
+		struct i915_active active;
+
 		/**
 		 * Memory region for this object.
 		 */
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index 2ff1036ef91f..d0cdf1c93a67 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -81,25 +81,59 @@ void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
 	}
 }
 
-int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
+static int __i915_gem_object_wait_for_pages(struct drm_i915_gem_object *obj)
+{
+	int err;
+
+	err = i915_active_wait_for_exclusive(&obj->mm.active);
+	if (err)
+		return err;
+
+	if (IS_ERR(obj->mm.pages))
+		return PTR_ERR(obj->mm.pages);
+
+	return 0;
+}
+
+static int ____i915_gem_object_get_pages_async(struct drm_i915_gem_object *obj)
 {
-	struct drm_i915_private *i915 = to_i915(obj->base.dev);
 	int err;
 
 	assert_object_held(obj);
 
+	if (i915_active_has_exclusive(&obj->mm.active))
+		return 0;
+
+	if (i915_gem_object_has_pages(obj))
+		return PTR_ERR_OR_ZERO(obj->mm.pages);
+
 	if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
-		drm_dbg(&i915->drm,
+		drm_dbg(obj->base.dev,
 			"Attempting to obtain a purgeable object\n");
 		return -EFAULT;
 	}
 
+	err = i915_active_acquire(&obj->mm.active);
+	if (err)
+		return err;
+
 	err = obj->ops->get_pages(obj);
-	GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
+	i915_active_release(&obj->mm.active);
 
 	return err;
 }
 
+int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
+{
+	int err;
+
+	err = ____i915_gem_object_get_pages_async(obj);
+	if (err)
+		return err;
+
+	return __i915_gem_object_wait_for_pages(obj);
+}
+
 /* Ensure that the associated pages are gathered from the backing storage
  * and pinned into our object. i915_gem_object_pin_pages() may be called
  * multiple times before they are released by a single call to
@@ -203,14 +237,16 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
 int __i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
 {
 	struct sg_table *pages;
+	int err;
 
 	/* May be called by shrinker from within get_pages() (on another bo) */
 	assert_object_held(obj);
 
-	if (i915_gem_object_has_pinned_pages(obj))
-		return -EBUSY;
+	err = i915_active_wait(&obj->mm.active);
+	if (err)
+		return err;
 
-	if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
+	if (i915_gem_object_has_pinned_pages(obj))
 		return -EBUSY;
 
 	i915_gem_object_release_mmap_offset(obj);
diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
index 3f595446fd44..886685d6e1e2 100644
--- a/drivers/gpu/drm/i915/i915_active.c
+++ b/drivers/gpu/drm/i915/i915_active.c
@@ -1073,6 +1073,24 @@ int i915_active_fence_set(struct i915_active_fence *active,
 	return err;
 }
 
+int i915_active_fence_wait(struct i915_active_fence *active)
+{
+	struct dma_fence *fence;
+	int err;
+
+	if (GEM_WARN_ON(is_barrier(active)))
+		return -EBUSY;
+
+	fence = i915_active_fence_get(active);
+	if (!fence)
+		return 0;
+
+	err = dma_fence_wait(fence, true);
+	dma_fence_put(fence);
+
+	return err;
+}
+
 void i915_active_noop(struct dma_fence *fence, struct dma_fence_cb *cb)
 {
 	active_fence_cb(fence, cb);
diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
index 2e0bcb3289ec..eefcc2344509 100644
--- a/drivers/gpu/drm/i915/i915_active.h
+++ b/drivers/gpu/drm/i915/i915_active.h
@@ -123,6 +123,8 @@ i915_active_fence_isset(const struct i915_active_fence *active)
 	return rcu_access_pointer(active->fence);
 }
 
+int i915_active_fence_wait(struct i915_active_fence *active);
+
 /*
  * GPU activity tracking
  *
@@ -191,6 +193,11 @@ static inline int i915_active_wait(struct i915_active *ref)
 	return __i915_active_wait(ref, TASK_INTERRUPTIBLE);
 }
 
+static inline int i915_active_wait_for_exclusive(struct i915_active *ref)
+{
+	return i915_active_fence_wait(&ref->excl);
+}
+
 int i915_sw_fence_await_active(struct i915_sw_fence *fence,
 			       struct i915_active *ref,
 			       unsigned int flags);
-- 
2.20.1


* [Intel-gfx] [PATCH 4/7] drm/i915: Update vma to use async page allocations
  2020-06-22  9:59 [Intel-gfx] [PATCH 1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class Chris Wilson
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 2/7] drm/i915: Reuse the reservation_ww_class for acquiring vma backing storage Chris Wilson
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 3/7] drm/i915/gem: Track the fences for object allocations Chris Wilson
@ 2020-06-22  9:59 ` Chris Wilson
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 5/7] drm/i915/gem: Convert the userptr-worker to use a fence Chris Wilson
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 48+ messages in thread
From: Chris Wilson @ 2020-06-22  9:59 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

Since we have asynchronous vma bindings, we are ready to utilise
asynchronous page allocations. All we have to do is ask get_pages not
to wait on our behalf; the bind worker will do that waiting for us.
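
As a sketch of the resulting flow (reusing
____i915_gem_object_get_pages_async() and obj->mm.active from the earlier
patches in this series, so illustrative rather than a literal excerpt),
the caller starts the allocation and the bind worker inherits the wait:

static int sketch_bind_after_alloc(struct i915_vma *vma,
				   struct i915_vma_work *work)
{
	struct dma_fence *alloc;
	int err;

	/* Start (or join) the asynchronous get_pages for the object */
	err = ____i915_gem_object_get_pages_async(vma->obj);
	if (err)
		return err;

	/*
	 * Chain the allocation fence into the bind worker, so that the
	 * worker, not this caller, ends up doing the waiting.
	 */
	alloc = i915_active_fence_get(&vma->obj->mm.active.excl);
	if (alloc) {
		err = i915_sw_fence_await_dma_fence(&work->base.chain,
						    alloc, 0, GFP_KERNEL);
		dma_fence_put(alloc);
	}

	return err < 0 ? err : 0;
}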

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |  2 +
 drivers/gpu/drm/i915/gem/i915_gem_object.h    |  1 +
 drivers/gpu/drm/i915/gem/i915_gem_pages.c     |  2 +-
 drivers/gpu/drm/i915/i915_vma.c               | 42 +++++++++----------
 drivers/gpu/drm/i915/i915_vma_types.h         |  1 +
 5 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 678e7f82f6c9..59750edd617f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -699,6 +699,8 @@ static int set_bind_fence(struct i915_vma *vma, struct eb_vm_work *work)
 
 	lockdep_assert_held(&vma->vm->mutex);
 	prev = i915_active_set_exclusive(&vma->active, &work->base.dma);
+	if (!prev)
+		prev = i915_active_fence_get(&vma->obj->mm.active.excl);
 	if (unlikely(prev)) {
 		err = i915_sw_fence_await_dma_fence(&work->base.chain, prev, 0,
 						    GFP_NOWAIT | __GFP_NOWARN);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h b/drivers/gpu/drm/i915/gem/i915_gem_object.h
index 03a1b859aeef..3bb0939dce99 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h
@@ -275,6 +275,7 @@ void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
 				 struct sg_table *pages,
 				 unsigned int sg_page_sizes);
 
+int ____i915_gem_object_get_pages_async(struct drm_i915_gem_object *obj);
 int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj);
 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj);
 int __i915_gem_object_get_pages_locked(struct drm_i915_gem_object *obj);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index d0cdf1c93a67..4efd1aeedc2d 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -95,7 +95,7 @@ static int __i915_gem_object_wait_for_pages(struct drm_i915_gem_object *obj)
 	return 0;
 }
 
-static int ____i915_gem_object_get_pages_async(struct drm_i915_gem_object *obj)
+int ____i915_gem_object_get_pages_async(struct drm_i915_gem_object *obj)
 {
 	int err;
 
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index dc656c7d3191..dc8fdb656e8b 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -123,6 +123,7 @@ vma_create(struct drm_i915_gem_object *obj,
 	vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
 
 	i915_active_init(&vma->active, __i915_vma_active, __i915_vma_retire);
+	vma->fence_context = dma_fence_context_alloc(1);
 
 	/* Declare ourselves safe for use inside shrinkers */
 	if (IS_ENABLED(CONFIG_LOCKDEP)) {
@@ -295,7 +296,6 @@ i915_vma_instance(struct drm_i915_gem_object *obj,
 struct i915_vma_work {
 	struct dma_fence_work base;
 	struct i915_vma *vma;
-	struct drm_i915_gem_object *pinned;
 	struct i915_sw_dma_fence_cb cb;
 	enum i915_cache_level cache_level;
 	unsigned int flags;
@@ -331,9 +331,6 @@ static int __vma_bind(struct dma_fence_work *work)
 static void __vma_release(struct dma_fence_work *work)
 {
 	struct i915_vma_work *vw = container_of(work, typeof(*vw), base);
-
-	if (vw->pinned)
-		__i915_gem_object_unpin_pages(vw->pinned);
 }
 
 static const struct dma_fence_work_ops bind_ops = {
@@ -444,6 +441,8 @@ int i915_vma_bind(struct i915_vma *vma,
 	 * execution and not content or object's backing store lifetime.
 	 */
 	prev = i915_active_set_exclusive(&vma->active, &work->base.dma);
+	if (!prev && vma->obj)
+		prev = i915_active_fence_get(&vma->obj->mm.active.excl);
 	if (prev) {
 		__i915_sw_fence_await_dma_fence(&work->base.chain,
 						prev,
@@ -453,11 +452,6 @@ int i915_vma_bind(struct i915_vma *vma,
 
 	work->base.dma.error = 0; /* enable the queue_work() */
 
-	if (vma->obj) {
-		__i915_gem_object_pin_pages(vma->obj);
-		work->pinned = vma->obj;
-	}
-
 	atomic_or(bind_flags, &vma->flags);
 	return 0;
 }
@@ -826,20 +820,27 @@ int i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 	if (try_qad_pin(vma, flags & I915_VMA_BIND_MASK))
 		return 0;
 
-	if (vma->obj) {
-		err = i915_gem_object_pin_pages(vma->obj);
-		if (err)
-			return err;
-	}
-
 	err = __wait_for_unbind(vma, flags);
 	if (err)
-		goto err_pages;
+		return err;
 
 	work = i915_vma_work();
-	if (!work) {
-		err = -ENOMEM;
-		goto err_pages;
+	if (!work)
+		return -ENOMEM;
+
+	if (vma->obj) {
+		if (dma_resv_lock_interruptible(vma->resv, NULL))
+			return -EINTR;
+
+		err = ____i915_gem_object_get_pages_async(vma->obj);
+		if (err == 0) {
+			err = i915_active_ref(&vma->obj->mm.active,
+					      vma->fence_context,
+					      &work->base.dma);
+		}
+		dma_resv_unlock(vma->resv);
+		if (err)
+			return err;
 	}
 
 	if (flags & PIN_GLOBAL)
@@ -934,9 +935,6 @@ int i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 	dma_fence_work_commit_imm(&work->base);
 	if (wakeref)
 		intel_runtime_pm_put(&vma->vm->i915->runtime_pm, wakeref);
-err_pages:
-	if (vma->obj)
-		i915_gem_object_unpin_pages(vma->obj);
 	return err;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_vma_types.h b/drivers/gpu/drm/i915/i915_vma_types.h
index 02c1640bb034..10757319c2a4 100644
--- a/drivers/gpu/drm/i915/i915_vma_types.h
+++ b/drivers/gpu/drm/i915/i915_vma_types.h
@@ -250,6 +250,7 @@ struct i915_vma {
 #define I915_VMA_GGTT_WRITE	((int)BIT(I915_VMA_GGTT_WRITE_BIT))
 
 	struct i915_active active;
+	u64 fence_context;
 
 	/**
 	 * Support different GGTT views into the same object.
-- 
2.20.1


* [Intel-gfx] [PATCH 5/7] drm/i915/gem: Convert the userptr-worker to use a fence
  2020-06-22  9:59 [Intel-gfx] [PATCH 1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class Chris Wilson
                   ` (2 preceding siblings ...)
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 4/7] drm/i915: Update vma to use async page allocations Chris Wilson
@ 2020-06-22  9:59 ` Chris Wilson
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 6/7] drm/i915/gem: Break apart the early i915_vma_pin from execbuf object lookup Chris Wilson
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 48+ messages in thread
From: Chris Wilson @ 2020-06-22  9:59 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

Now that we have fence tracking in place for object allocations, we can
remove the haphazard polling of the workqueue used for asynchronous
userptr allocations. All consumers will now wait for the fence
notification instead of having EAGAIN leaked back to userspace.
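
The consumer-side pattern is then essentially
__i915_gem_object_wait_for_pages() from the earlier patch in this series;
a minimal sketch, assuming the obj->mm.active tracking introduced there:

static int sketch_wait_for_userptr_pages(struct drm_i915_gem_object *obj)
{
	int err;

	/* Wait on the gup worker's fence instead of retrying on -EAGAIN */
	err = i915_active_wait_for_exclusive(&obj->mm.active);
	if (err)
		return err;

	/* The worker records any failure as an ERR_PTR in obj->mm.pages */
	if (IS_ERR(obj->mm.pages))
		return PTR_ERR(obj->mm.pages);

	return 0;
}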

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |   5 -
 .../gpu/drm/i915/gem/i915_gem_object_types.h  |   1 -
 drivers/gpu/drm/i915/gem/i915_gem_pages.c     |   6 +-
 drivers/gpu/drm/i915/gem/i915_gem_userptr.c   | 136 ++++++++----------
 drivers/gpu/drm/i915/i915_drv.h               |   9 +-
 drivers/gpu/drm/i915/i915_gem.c               |   4 +-
 6 files changed, 66 insertions(+), 95 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 59750edd617f..60926209b1fc 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -1456,11 +1456,6 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		if (signal_pending(current))
 			return -EINTR;
 
-		if (err == -EAGAIN) {
-			flush_workqueue(eb->i915->mm.userptr_wq);
-			continue;
-		}
-
 		/* Now safe to wait with no reservations held */
 		list_for_each_entry(ev, &eb->unbound, bind_link) {
 			struct i915_vma *vma = ev->vma;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index 668b249fd109..17a47186ba81 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -278,7 +278,6 @@ struct drm_i915_gem_object {
 
 			struct i915_mm_struct *mm;
 			struct i915_mmu_object *mmu_object;
-			struct work_struct *work;
 		} userptr;
 
 		unsigned long scratch;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index 4efd1aeedc2d..99f50c9d0ed6 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -18,8 +18,6 @@ void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
 	unsigned long supported = INTEL_INFO(i915)->page_sizes;
 	int i;
 
-	assert_object_held(obj);
-
 	if (i915_gem_object_is_volatile(obj))
 		obj->mm.madv = I915_MADV_DONTNEED;
 
@@ -34,8 +32,6 @@ void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
 	obj->mm.get_page.sg_pos = pages->sgl;
 	obj->mm.get_page.sg_idx = 0;
 
-	obj->mm.pages = pages;
-
 	if (i915_gem_object_is_tiled(obj) &&
 	    i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
 		GEM_BUG_ON(obj->mm.quirked);
@@ -61,6 +57,8 @@ void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
 	}
 	GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
 
+	WRITE_ONCE(obj->mm.pages, pages);
+
 	if (i915_gem_object_is_shrinkable(obj)) {
 		struct list_head *list;
 		unsigned long flags;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index 4cd79f425eac..a236df02bc44 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -14,6 +14,7 @@
 #include "i915_gem_ioctls.h"
 #include "i915_gem_object.h"
 #include "i915_scatterlist.h"
+#include "i915_sw_fence_work.h"
 
 struct i915_mm_struct {
 	struct mm_struct *mm;
@@ -398,7 +399,7 @@ i915_gem_userptr_release__mm_struct(struct drm_i915_gem_object *obj)
 }
 
 struct get_pages_work {
-	struct work_struct work;
+	struct dma_fence_work base;
 	struct drm_i915_gem_object *obj;
 	struct task_struct *task;
 };
@@ -446,12 +447,12 @@ __i915_gem_userptr_alloc_pages(struct drm_i915_gem_object *obj,
 	return st;
 }
 
-static void
-__i915_gem_userptr_get_pages_worker(struct work_struct *_work)
+static int gup_work(struct dma_fence_work *base)
 {
-	struct get_pages_work *work = container_of(_work, typeof(*work), work);
-	struct drm_i915_gem_object *obj = work->obj;
+	struct get_pages_work *gup = container_of(base, typeof(*gup), base);
+	struct drm_i915_gem_object *obj = gup->obj;
 	const unsigned long npages = obj->base.size >> PAGE_SHIFT;
+	struct sg_table *pages;
 	unsigned long pinned;
 	struct page **pvec;
 	int ret;
@@ -476,7 +477,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
 					locked = 1;
 				}
 				ret = pin_user_pages_remote
-					(work->task, mm,
+					(gup->task, mm,
 					 obj->userptr.ptr + pinned * PAGE_SIZE,
 					 npages - pinned,
 					 flags,
@@ -492,37 +493,41 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
 		}
 	}
 
-	i915_gem_object_lock(obj);
-	if (obj->userptr.work == &work->work) {
-		struct sg_table *pages = ERR_PTR(ret);
-
-		if (pinned == npages) {
-			pages = __i915_gem_userptr_alloc_pages(obj, pvec,
-							       npages);
-			if (!IS_ERR(pages)) {
-				pinned = 0;
-				pages = NULL;
-			}
+	if (pinned == npages) {
+		pages = __i915_gem_userptr_alloc_pages(obj, pvec, npages);
+		if (!IS_ERR(pages)) {
+			pinned = 0;
+			pages = NULL;
 		}
-
-		obj->userptr.work = ERR_CAST(pages);
-		if (IS_ERR(pages))
-			__i915_gem_userptr_set_active(obj, false);
+	} else {
+		pages = ERR_PTR(ret);
 	}
-	i915_gem_object_unlock(obj);
+	if (IS_ERR(pages))
+		__i915_gem_userptr_set_active(obj, false);
 
 	unpin_user_pages(pvec, pinned);
 	kvfree(pvec);
 
-	i915_gem_object_put(obj);
-	put_task_struct(work->task);
-	kfree(work);
+	return PTR_ERR_OR_ZERO(pages);
 }
 
-static struct sg_table *
+static void gup_release(struct dma_fence_work *base)
+{
+	struct get_pages_work *gup = container_of(base, typeof(*gup), base);
+
+	put_task_struct(gup->task);
+}
+
+static const struct dma_fence_work_ops gup_ops = {
+	.name = "userptr-getpages",
+	.work = gup_work,
+	.release = gup_release,
+};
+
+static bool
 __i915_gem_userptr_get_pages_schedule(struct drm_i915_gem_object *obj)
 {
-	struct get_pages_work *work;
+	struct get_pages_work *gup;
 
 	/* Spawn a worker so that we can acquire the
 	 * user pages without holding our mutex. Access
@@ -543,21 +548,21 @@ __i915_gem_userptr_get_pages_schedule(struct drm_i915_gem_object *obj)
 	 * that error back to this function through
 	 * obj->userptr.work = ERR_PTR.
 	 */
-	work = kmalloc(sizeof(*work), GFP_KERNEL);
-	if (work == NULL)
-		return ERR_PTR(-ENOMEM);
-
-	obj->userptr.work = &work->work;
+	gup = kmalloc(sizeof(*gup), GFP_KERNEL);
+	if (!gup)
+		return false;
 
-	work->obj = i915_gem_object_get(obj);
+	dma_fence_work_init(&gup->base, &gup_ops);
 
-	work->task = current;
-	get_task_struct(work->task);
+	gup->obj = obj;
+	i915_active_set_exclusive(&obj->mm.active, &gup->base.dma);
 
-	INIT_WORK(&work->work, __i915_gem_userptr_get_pages_worker);
-	queue_work(to_i915(obj->base.dev)->mm.userptr_wq, &work->work);
+	gup->task = current;
+	get_task_struct(gup->task);
 
-	return ERR_PTR(-EAGAIN);
+	dma_resv_add_excl_fence(obj->base.resv, &gup->base.dma);
+	dma_fence_work_commit(&gup->base);
+	return true;
 }
 
 static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
@@ -566,7 +571,6 @@ static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
 	struct mm_struct *mm = obj->userptr.mm->mm;
 	struct page **pvec;
 	struct sg_table *pages;
-	bool active;
 	int pinned;
 	unsigned int gup_flags = 0;
 
@@ -587,14 +591,6 @@ static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
 	 * egregious cases from causing harm.
 	 */
 
-	if (obj->userptr.work) {
-		/* active flag should still be held for the pending work */
-		if (IS_ERR(obj->userptr.work))
-			return PTR_ERR(obj->userptr.work);
-		else
-			return -EAGAIN;
-	}
-
 	pvec = NULL;
 	pinned = 0;
 
@@ -618,28 +614,31 @@ static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
 			pinned = pin_user_pages_fast_only(obj->userptr.ptr,
 							  num_pages, gup_flags,
 							  pvec);
+			if (pinned < 0)
+				goto out;
 		}
 	}
 
-	active = false;
-	if (pinned < 0) {
-		pages = ERR_PTR(pinned);
-		pinned = 0;
-	} else if (pinned < num_pages) {
-		pages = __i915_gem_userptr_get_pages_schedule(obj);
-		active = pages == ERR_PTR(-EAGAIN);
-	} else {
+	if (pinned == num_pages) {
 		pages = __i915_gem_userptr_alloc_pages(obj, pvec, num_pages);
-		active = !IS_ERR(pages);
+		if (IS_ERR(pages)) {
+			unpin_user_pages(pvec, pinned);
+			pinned = PTR_ERR(pages);
+			goto out;
+		}
+	} else {
+		unpin_user_pages(pvec, pinned);
+		if (!__i915_gem_userptr_get_pages_schedule(obj)) {
+			pinned = -ENOMEM;
+			goto out;
+		}
 	}
-	if (active)
-		__i915_gem_userptr_set_active(obj, true);
+	__i915_gem_userptr_set_active(obj, true);
+	pinned = 0;
 
-	if (IS_ERR(pages))
-		unpin_user_pages(pvec, pinned);
+out:
 	kvfree(pvec);
-
-	return PTR_ERR_OR_ZERO(pages);
+	return pinned;
 }
 
 static void
@@ -650,7 +649,6 @@ i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj,
 	struct page *page;
 
 	/* Cancel any inflight work and force them to restart their gup */
-	obj->userptr.work = NULL;
 	__i915_gem_userptr_set_active(obj, false);
 	if (!pages)
 		return;
@@ -853,22 +851,12 @@ i915_gem_userptr_ioctl(struct drm_device *dev,
 	return 0;
 }
 
-int i915_gem_init_userptr(struct drm_i915_private *dev_priv)
+void i915_gem_init_userptr(struct drm_i915_private *dev_priv)
 {
 	spin_lock_init(&dev_priv->mm_lock);
 	hash_init(dev_priv->mm_structs);
-
-	dev_priv->mm.userptr_wq =
-		alloc_workqueue("i915-userptr-acquire",
-				WQ_HIGHPRI | WQ_UNBOUND,
-				0);
-	if (!dev_priv->mm.userptr_wq)
-		return -ENOMEM;
-
-	return 0;
 }
 
 void i915_gem_cleanup_userptr(struct drm_i915_private *dev_priv)
 {
-	destroy_workqueue(dev_priv->mm.userptr_wq);
 }
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 072b1e19d86a..85c11912288a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -604,13 +604,6 @@ struct i915_gem_mm {
 	struct notifier_block vmap_notifier;
 	struct shrinker shrinker;
 
-	/**
-	 * Workqueue to fault in userptr pages, flushed by the execbuf
-	 * when required but otherwise left to userspace to try again
-	 * on EAGAIN.
-	 */
-	struct workqueue_struct *userptr_wq;
-
 	/* shrinker accounting, also useful for userland debugging */
 	u64 shrink_memory;
 	u32 shrink_count;
@@ -1724,7 +1717,7 @@ int i915_getparam_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 
 /* i915_gem.c */
-int i915_gem_init_userptr(struct drm_i915_private *dev_priv);
+void i915_gem_init_userptr(struct drm_i915_private *dev_priv);
 void i915_gem_cleanup_userptr(struct drm_i915_private *dev_priv);
 void i915_gem_init_early(struct drm_i915_private *dev_priv);
 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 0fbe438c4523..f886a98dc2ae 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1133,9 +1133,7 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 		mkwrite_device_info(dev_priv)->page_sizes =
 			I915_GTT_PAGE_SIZE_4K;
 
-	ret = i915_gem_init_userptr(dev_priv);
-	if (ret)
-		return ret;
+	i915_gem_init_userptr(dev_priv);
 
 	intel_uc_fetch_firmwares(&dev_priv->gt.uc);
 	intel_wopcm_init(&dev_priv->wopcm);
-- 
2.20.1


* [Intel-gfx] [PATCH 6/7] drm/i915/gem: Break apart the early i915_vma_pin from execbuf object lookup
  2020-06-22  9:59 [Intel-gfx] [PATCH 1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class Chris Wilson
                   ` (3 preceding siblings ...)
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 5/7] drm/i915/gem: Convert the userptr-worker to use a fence Chris Wilson
@ 2020-06-22  9:59 ` Chris Wilson
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class Chris Wilson
  2020-06-22 10:45 ` [Intel-gfx] ✗ Fi.CI.BUILD: failure for series starting with [1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class Patchwork
  6 siblings, 0 replies; 48+ messages in thread
From: Chris Wilson @ 2020-06-22  9:59 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

As a prelude to the next step, where we want to perform all the object
allocations together under the same lock, we first must delay the
i915_vma_pin() as that implicitly does the allocations for us, one by
one. Since it only does the allocations one by one, it is not allowed
to wait/evict, whereas by pulling all the allocations together, the
entire set can be scheduled as one.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 43 +++++++++++--------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 60926209b1fc..46fcbdf8161c 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -664,16 +664,6 @@ eb_add_vma(struct i915_execbuffer *eb,
 						    eb->lut_size)]);
 	}
 
-	if (eb_pin_vma(eb, entry, ev)) {
-		if (entry->offset != vma->node.start) {
-			entry->offset = vma->node.start | UPDATE;
-			eb->args->flags |= __EXEC_HAS_RELOC;
-		}
-	} else {
-		eb_unreserve_vma(ev);
-		list_add_tail(&ev->bind_link, &eb->unbound);
-	}
-
 	list_add_tail(&ev->lock_link, &eb->lock);
 }
 
@@ -1339,18 +1329,37 @@ static int eb_prepare_vma(struct eb_vma *ev)
 	return 0;
 }
 
-static int eb_reserve(struct i915_execbuffer *eb)
+static int eb_reserve_vm(struct i915_execbuffer *eb)
 {
 	const unsigned int count = eb->buffer_count;
 	struct i915_address_space *vm = eb->context->vm;
-	struct list_head last;
 	unsigned int i, pass;
-	struct eb_vma *ev;
 	int err = 0;
 
+	for (i = 0; i < count; i++) {
+		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct eb_vma *ev = &eb->vma[i];
+
+		if (eb_pin_vma(eb, entry, ev)) {
+			struct i915_vma *vma = ev->vma;
+
+			if (entry->offset != vma->node.start) {
+				entry->offset = vma->node.start | UPDATE;
+				eb->args->flags |= __EXEC_HAS_RELOC;
+			}
+		} else {
+			eb_unreserve_vma(ev);
+			list_add_tail(&ev->bind_link, &eb->unbound);
+		}
+	}
+	if (list_empty(&eb->unbound))
+		return 0;
+
 	pass = 0;
 	do {
 		struct eb_vm_work *work;
+		struct list_head last;
+		struct eb_vma *ev;
 
 		list_for_each_entry(ev, &eb->unbound, bind_link) {
 			err = eb_prepare_vma(ev);
@@ -2404,11 +2413,9 @@ static int eb_relocate(struct i915_execbuffer *eb)
 	if (err)
 		return err;
 
-	if (!list_empty(&eb->unbound)) {
-		err = eb_reserve(eb);
-		if (err)
-			return err;
-	}
+	err = eb_reserve_vm(eb);
+	if (err)
+		return err;
 
 	/* The objects are in their final locations, apply the relocations. */
 	if (eb->args->flags & __EXEC_HAS_RELOC) {
-- 
2.20.1


* [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-22  9:59 [Intel-gfx] [PATCH 1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class Chris Wilson
                   ` (4 preceding siblings ...)
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 6/7] drm/i915/gem: Break apart the early i915_vma_pin from execbuf object lookup Chris Wilson
@ 2020-06-22  9:59 ` Chris Wilson
  2020-06-23  9:33   ` Thomas Hellström (Intel)
  2020-06-23 11:22   ` Thomas Hellström (Intel)
  2020-06-22 10:45 ` [Intel-gfx] ✗ Fi.CI.BUILD: failure for series starting with [1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class Patchwork
  6 siblings, 2 replies; 48+ messages in thread
From: Chris Wilson @ 2020-06-22  9:59 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

In order to actually handle eviction and what not, we need to process
all the objects together under a common lock, reservation_ww_class. As
such, do a memory reservation pass after looking up the object/vma,
which then feeds into the rest of execbuf [relocation, cmdparsing,
flushing and ofc execution].
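
eb_lock_vma() is not visible in this diff; presumably it is the usual
reservation_ww_class acquire/backoff dance over eb->lock, along the lines
of the sketch below (an assumption for illustration, not taken from the
actual patch):

static int sketch_eb_lock_vma(struct i915_execbuffer *eb,
			      struct ww_acquire_ctx *acquire)
{
	struct eb_vma *ev, *contended = NULL;
	int err;

retry:
	list_for_each_entry(ev, &eb->lock, lock_link) {
		struct dma_resv *resv = ev->vma->resv;

		if (ev == contended) {
			/* Already locked by the slow path below */
			contended = NULL;
			continue;
		}

		err = dma_resv_lock_interruptible(resv, acquire);
		if (err == -EDEADLK) {
			struct eb_vma *unlock;

			/* Back off: drop all held locks, then sleep on the contended one */
			list_for_each_entry(unlock, &eb->lock, lock_link) {
				if (unlock == ev)
					break;
				dma_resv_unlock(unlock->vma->resv);
			}
			if (contended)
				dma_resv_unlock(contended->vma->resv);

			err = dma_resv_lock_slow_interruptible(resv, acquire);
			if (err)
				return err;

			contended = ev;
			goto retry;
		}
		if (err)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(ev, &eb->lock, lock_link)
		dma_resv_unlock(ev->vma->resv);
	if (contended)
		dma_resv_unlock(contended->vma->resv);
	return err;
}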

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
 1 file changed, 70 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 46fcbdf8161c..8db2e013465f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -53,10 +53,9 @@ struct eb_vma_array {
 
 #define __EXEC_OBJECT_HAS_PIN		BIT(31)
 #define __EXEC_OBJECT_HAS_FENCE		BIT(30)
-#define __EXEC_OBJECT_HAS_PAGES		BIT(29)
-#define __EXEC_OBJECT_NEEDS_MAP		BIT(28)
-#define __EXEC_OBJECT_NEEDS_BIAS	BIT(27)
-#define __EXEC_OBJECT_INTERNAL_FLAGS	(~0u << 27) /* all of the above */
+#define __EXEC_OBJECT_NEEDS_MAP		BIT(29)
+#define __EXEC_OBJECT_NEEDS_BIAS	BIT(28)
+#define __EXEC_OBJECT_INTERNAL_FLAGS	(~0u << 28) /* all of the above */
 
 #define __EXEC_HAS_RELOC	BIT(31)
 #define __EXEC_INTERNAL_FLAGS	(~0u << 31)
@@ -241,6 +240,8 @@ struct i915_execbuffer {
 	struct intel_context *context; /* logical state for the request */
 	struct i915_gem_context *gem_context; /** caller's context */
 
+	struct dma_fence *mm_fence;
+
 	struct i915_request *request; /** our request to build */
 	struct eb_vma *batch; /** identity of the batch obj/vma */
 	struct i915_vma *trampoline; /** trampoline used for chaining */
@@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct eb_vma *ev)
 	if (ev->flags & __EXEC_OBJECT_HAS_PIN)
 		__i915_vma_unpin(vma);
 
-	if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
-		i915_gem_object_unpin_pages(vma->obj);
-
-	ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
-		       __EXEC_OBJECT_HAS_FENCE |
-		       __EXEC_OBJECT_HAS_PAGES);
+	ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
 }
 
 static void eb_vma_array_destroy(struct kref *kref)
@@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
 	list_add_tail(&ev->lock_link, &eb->lock);
 }
 
+static int eb_vma_get_pages(struct i915_execbuffer *eb,
+			    struct eb_vma *ev,
+			    u64 idx)
+{
+	struct i915_vma *vma = ev->vma;
+	int err;
+
+	/* XXX also preallocate PD for vma */
+
+	err = ____i915_gem_object_get_pages_async(vma->obj);
+	if (err)
+		return err;
+
+	return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
+}
+
+static int eb_reserve_mm(struct i915_execbuffer *eb)
+{
+	const u64 idx = eb->context->timeline->fence_context;
+	struct ww_acquire_ctx acquire;
+	struct eb_vma *ev;
+	int err;
+
+	eb->mm_fence = __dma_fence_create_proxy(0, 0);
+	if (!eb->mm_fence)
+		return -ENOMEM;
+
+	ww_acquire_init(&acquire, &reservation_ww_class);
+
+	err = eb_lock_vma(eb, &acquire);
+	if (err)
+		goto out;
+
+	ww_acquire_done(&acquire);
+
+	list_for_each_entry(ev, &eb->lock, lock_link) {
+		struct i915_vma *vma = ev->vma;
+
+		if (err == 0)
+			err = eb_vma_get_pages(eb, ev, idx);
+
+		i915_vma_unlock(vma);
+	}
+
+out:
+	ww_acquire_fini(&acquire);
+	return err;
+}
+
 struct eb_vm_work {
 	struct dma_fence_work base;
 	struct list_head unbound;
@@ -1312,20 +1357,9 @@ static int eb_vm_throttle(struct eb_vm_work *work)
 
 static int eb_prepare_vma(struct eb_vma *ev)
 {
-	struct i915_vma *vma = ev->vma;
-	int err;
-
 	ev->hole.flags = 0;
 	ev->bind_flags = 0;
 
-	if (!(ev->flags &  __EXEC_OBJECT_HAS_PAGES)) {
-		err = i915_gem_object_pin_pages(vma->obj);
-		if (err)
-			return err;
-
-		ev->flags |=  __EXEC_OBJECT_HAS_PAGES;
-	}
-
 	return 0;
 }
 
@@ -2413,6 +2447,10 @@ static int eb_relocate(struct i915_execbuffer *eb)
 	if (err)
 		return err;
 
+	err = eb_reserve_mm(eb);
+	if (err)
+		return err;
+
 	err = eb_reserve_vm(eb);
 	if (err)
 		return err;
@@ -2844,6 +2882,12 @@ static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch)
 {
 	int err;
 
+	if (eb->mm_fence) {
+		dma_fence_proxy_set_real(eb->mm_fence, &eb->request->fence);
+		dma_fence_put(eb->mm_fence);
+		eb->mm_fence = NULL;
+	}
+
 	if (eb->reloc_cache.fence) {
 		err = i915_request_await_dma_fence(eb->request,
 						   &eb->reloc_cache.rq->fence);
@@ -3339,6 +3383,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	eb.batch_start_offset = args->batch_start_offset;
 	eb.batch_len = args->batch_len;
 	eb.trampoline = NULL;
+	eb.mm_fence = NULL;
 
 	eb.batch_flags = 0;
 	if (args->flags & I915_EXEC_SECURE) {
@@ -3533,6 +3578,10 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		eb_reloc_signal(&eb, eb.reloc_cache.rq);
 	if (eb.trampoline)
 		i915_vma_unpin(eb.trampoline);
+	if (eb.mm_fence) {
+		dma_fence_signal(eb.mm_fence);
+		dma_fence_put(eb.mm_fence);
+	}
 	eb_unpin_engine(&eb);
 err_context:
 	i915_gem_context_put(eb.gem_context);
-- 
2.20.1


* [Intel-gfx] ✗ Fi.CI.BUILD: failure for series starting with [1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class
  2020-06-22  9:59 [Intel-gfx] [PATCH 1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class Chris Wilson
                   ` (5 preceding siblings ...)
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class Chris Wilson
@ 2020-06-22 10:45 ` Patchwork
  6 siblings, 0 replies; 48+ messages in thread
From: Patchwork @ 2020-06-22 10:45 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class
URL   : https://patchwork.freedesktop.org/series/78688/
State : failure

== Summary ==

Applying: drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class
Applying: drm/i915: Reuse the reservation_ww_class for acquiring vma backing storage
error: sha1 information is lacking or useless (drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c).
error: could not build fake ancestor
hint: Use 'git am --show-current-patch=diff' to see the failed patch
Patch failed at 0002 drm/i915: Reuse the reservation_ww_class for acquiring vma backing storage
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".


* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class Chris Wilson
@ 2020-06-23  9:33   ` Thomas Hellström (Intel)
  2020-06-23 10:03     ` Chris Wilson
  2020-06-23 11:22   ` Thomas Hellström (Intel)
  1 sibling, 1 reply; 48+ messages in thread
From: Thomas Hellström (Intel) @ 2020-06-23  9:33 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Hi, Chris!

On 6/22/20 11:59 AM, Chris Wilson wrote:
> In order to actually handle eviction and what not, we need to process
> all the objects together under a common lock, reservation_ww_class. As
> such, do a memory reservation pass after looking up the object/vma,
> which then feeds into the rest of execbuf [relocation, cmdparsing,
> flushing and ofc execution].
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
>   1 file changed, 70 insertions(+), 21 deletions(-)
>
Which tree is this against? The series doesn't apply cleanly against 
drm-tip?

...

> +static int eb_reserve_mm(struct i915_execbuffer *eb)
> +{
> +	const u64 idx = eb->context->timeline->fence_context;
> +	struct ww_acquire_ctx acquire;
> +	struct eb_vma *ev;
> +	int err;
> +
> +	eb->mm_fence = __dma_fence_create_proxy(0, 0);
> +	if (!eb->mm_fence)
> +		return -ENOMEM;

Where are the proxy fence functions defined?

Thanks,

Thomas



* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23  9:33   ` Thomas Hellström (Intel)
@ 2020-06-23 10:03     ` Chris Wilson
  2020-06-23 15:37       ` Thomas Hellström (Intel)
  2020-06-23 21:01       ` Dave Airlie
  0 siblings, 2 replies; 48+ messages in thread
From: Chris Wilson @ 2020-06-23 10:03 UTC (permalink / raw)
  To: Thomas Hellström, intel-gfx

Quoting Thomas Hellström (Intel) (2020-06-23 10:33:20)
> Hi, Chris!
> 
> On 6/22/20 11:59 AM, Chris Wilson wrote:
> > In order to actually handle eviction and what not, we need to process
> > all the objects together under a common lock, reservation_ww_class. As
> > such, do a memory reservation pass after looking up the object/vma,
> > which then feeds into the rest of execbuf [relocation, cmdparsing,
> > flushing and ofc execution].
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> >   1 file changed, 70 insertions(+), 21 deletions(-)
> >
> Which tree is this against? The series doesn't apply cleanly against 
> drm-tip?

It's continuing on from the scheduler patches, the bug fixes and the
iris-deferred-fence work. I thought throwing all of those old patches
into the pile would have been distracting.

> ...
> 
> > +static int eb_reserve_mm(struct i915_execbuffer *eb)
> > +{
> > +     const u64 idx = eb->context->timeline->fence_context;
> > +     struct ww_acquire_ctx acquire;
> > +     struct eb_vma *ev;
> > +     int err;
> > +
> > +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
> > +     if (!eb->mm_fence)
> > +             return -ENOMEM;
> 
> Where are the proxy fence functions defined?

In dma-fence-proxy.c ;)
-Chris
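
For reference, __dma_fence_create_proxy() and dma_fence_proxy_set_real()
are not in mainline, so the proxy lifecycle in patch 7 is best read as
the following sketch (names as used in that patch, details assumed from
it):

	/* 1. Reservation: create a placeholder before the request exists */
	eb->mm_fence = __dma_fence_create_proxy(0, 0);

	/* 2. Mark each object's allocation as busy until that fence signals */
	err = i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);

	/* 3. Submission: point the proxy at the real request fence */
	dma_fence_proxy_set_real(eb->mm_fence, &eb->request->fence);
	dma_fence_put(eb->mm_fence);

	/* 4. Error path without a request: signal so that waiters are released */
	dma_fence_signal(eb->mm_fence);
	dma_fence_put(eb->mm_fence);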

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-22  9:59 ` [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class Chris Wilson
  2020-06-23  9:33   ` Thomas Hellström (Intel)
@ 2020-06-23 11:22   ` Thomas Hellström (Intel)
  2020-06-23 12:57     ` Thomas Hellström (Intel)
  2020-06-23 16:36     ` Chris Wilson
  1 sibling, 2 replies; 48+ messages in thread
From: Thomas Hellström (Intel) @ 2020-06-23 11:22 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Hi, Chris,

On 6/22/20 11:59 AM, Chris Wilson wrote:
> In order to actually handle eviction and what not, we need to process
> all the objects together under a common lock, reservation_ww_class. As
> such, do a memory reservation pass after looking up the object/vma,
> which then feeds into the rest of execbuf [relocation, cmdparsing,
> flushing and ofc execution].
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
>   1 file changed, 70 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 46fcbdf8161c..8db2e013465f 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -53,10 +53,9 @@ struct eb_vma_array {
>   
>   #define __EXEC_OBJECT_HAS_PIN		BIT(31)
>   #define __EXEC_OBJECT_HAS_FENCE		BIT(30)
> -#define __EXEC_OBJECT_HAS_PAGES		BIT(29)
> -#define __EXEC_OBJECT_NEEDS_MAP		BIT(28)
> -#define __EXEC_OBJECT_NEEDS_BIAS	BIT(27)
> -#define __EXEC_OBJECT_INTERNAL_FLAGS	(~0u << 27) /* all of the above */
> +#define __EXEC_OBJECT_NEEDS_MAP		BIT(29)
> +#define __EXEC_OBJECT_NEEDS_BIAS	BIT(28)
> +#define __EXEC_OBJECT_INTERNAL_FLAGS	(~0u << 28) /* all of the above */
>   
>   #define __EXEC_HAS_RELOC	BIT(31)
>   #define __EXEC_INTERNAL_FLAGS	(~0u << 31)
> @@ -241,6 +240,8 @@ struct i915_execbuffer {
>   	struct intel_context *context; /* logical state for the request */
>   	struct i915_gem_context *gem_context; /** caller's context */
>   
> +	struct dma_fence *mm_fence;
> +
>   	struct i915_request *request; /** our request to build */
>   	struct eb_vma *batch; /** identity of the batch obj/vma */
>   	struct i915_vma *trampoline; /** trampoline used for chaining */
> @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct eb_vma *ev)
>   	if (ev->flags & __EXEC_OBJECT_HAS_PIN)
>   		__i915_vma_unpin(vma);
>   
> -	if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
> -		i915_gem_object_unpin_pages(vma->obj);
> -
> -	ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
> -		       __EXEC_OBJECT_HAS_FENCE |
> -		       __EXEC_OBJECT_HAS_PAGES);
> +	ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
>   }
>   
>   static void eb_vma_array_destroy(struct kref *kref)
> @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
>   	list_add_tail(&ev->lock_link, &eb->lock);
>   }
>   
> +static int eb_vma_get_pages(struct i915_execbuffer *eb,
> +			    struct eb_vma *ev,
> +			    u64 idx)
> +{
> +	struct i915_vma *vma = ev->vma;
> +	int err;
> +
> +	/* XXX also preallocate PD for vma */
> +
> +	err = ____i915_gem_object_get_pages_async(vma->obj);
> +	if (err)
> +		return err;
> +
> +	return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
> +}
> +
> +static int eb_reserve_mm(struct i915_execbuffer *eb)
> +{
> +	const u64 idx = eb->context->timeline->fence_context;
> +	struct ww_acquire_ctx acquire;
> +	struct eb_vma *ev;
> +	int err;
> +
> +	eb->mm_fence = __dma_fence_create_proxy(0, 0);
> +	if (!eb->mm_fence)
> +		return -ENOMEM;

Question: eb is local to this thread, right, so eb->mm_fence is not 
considered "published" yet?

> +
> +	ww_acquire_init(&acquire, &reservation_ww_class);
> +
> +	err = eb_lock_vma(eb, &acquire);
> +	if (err)
> +		goto out;
> +
> +	ww_acquire_done(&acquire);
> +
> +	list_for_each_entry(ev, &eb->lock, lock_link) {
> +		struct i915_vma *vma = ev->vma;
> +
> +		if (err == 0)
> +			err = eb_vma_get_pages(eb, ev, idx);

I figure this is where you publish the proxy fence? If so, the fence 
signaling critical path starts with this loop, and that means any code 
we call between here and submission complete (including spawned work we 
need to wait for before submission) may not lock the 
reservation_ww_class nor (still being discussed) allocate memory. It 
looks like i915_pin_vma takes a reservation_ww_class. And all memory 
pinning seems to be in the fence critical path as well?
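
With the annotations being proposed (assuming
dma_fence_begin_signalling()/dma_fence_end_signalling() land in roughly
that shape; the reclaim/allocation part is the piece still being
discussed), the constraint reads roughly as this sketch:

static void sketch_fence_critical_section(struct drm_i915_gem_object *obj)
{
	bool cookie = dma_fence_begin_signalling();

	/*
	 * Everything up to dma_fence_end_signalling() is treated by lockdep
	 * as something a fence signal may depend on, so...
	 */

	/* ...taking a dma_resv/ww_mutex here is flagged as a deadlock risk */
	if (dma_resv_lock(obj->base.resv, NULL) == 0)
		dma_resv_unlock(obj->base.resv);

	/*
	 * ...and a GFP_KERNEL allocation would be flagged as well, if the
	 * reclaim annotation under discussion is adopted.
	 */
	kfree(kmalloc(4096, GFP_KERNEL));

	dma_fence_end_signalling(cookie);
}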

/Thomas



* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 11:22   ` Thomas Hellström (Intel)
@ 2020-06-23 12:57     ` Thomas Hellström (Intel)
  2020-06-23 14:01       ` Chris Wilson
  2020-06-23 16:36     ` Chris Wilson
  1 sibling, 1 reply; 48+ messages in thread
From: Thomas Hellström (Intel) @ 2020-06-23 12:57 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 6/23/20 1:22 PM, Thomas Hellström (Intel) wrote:
> Hi, Chris,
>
> On 6/22/20 11:59 AM, Chris Wilson wrote:
>> In order to actually handle eviction and what not, we need to process
>> all the objects together under a common lock, reservation_ww_class. As
>> such, do a memory reservation pass after looking up the object/vma,
>> which then feeds into the rest of execbuf [relocation, cmdparsing,
>> flushing and ofc execution].
>>
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> ---
>>   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
>>   1 file changed, 70 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
>> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>> index 46fcbdf8161c..8db2e013465f 100644
>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>> @@ -53,10 +53,9 @@ struct eb_vma_array {
>>     #define __EXEC_OBJECT_HAS_PIN        BIT(31)
>>   #define __EXEC_OBJECT_HAS_FENCE        BIT(30)
>> -#define __EXEC_OBJECT_HAS_PAGES        BIT(29)
>> -#define __EXEC_OBJECT_NEEDS_MAP        BIT(28)
>> -#define __EXEC_OBJECT_NEEDS_BIAS    BIT(27)
>> -#define __EXEC_OBJECT_INTERNAL_FLAGS    (~0u << 27) /* all of the 
>> above */
>> +#define __EXEC_OBJECT_NEEDS_MAP        BIT(29)
>> +#define __EXEC_OBJECT_NEEDS_BIAS    BIT(28)
>> +#define __EXEC_OBJECT_INTERNAL_FLAGS    (~0u << 28) /* all of the 
>> above */
>>     #define __EXEC_HAS_RELOC    BIT(31)
>>   #define __EXEC_INTERNAL_FLAGS    (~0u << 31)
>> @@ -241,6 +240,8 @@ struct i915_execbuffer {
>>       struct intel_context *context; /* logical state for the request */
>>       struct i915_gem_context *gem_context; /** caller's context */
>>   +    struct dma_fence *mm_fence;
>> +
>>       struct i915_request *request; /** our request to build */
>>       struct eb_vma *batch; /** identity of the batch obj/vma */
>>       struct i915_vma *trampoline; /** trampoline used for chaining */
>> @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct 
>> eb_vma *ev)
>>       if (ev->flags & __EXEC_OBJECT_HAS_PIN)
>>           __i915_vma_unpin(vma);
>>   -    if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
>> -        i915_gem_object_unpin_pages(vma->obj);
>> -
>> -    ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
>> -               __EXEC_OBJECT_HAS_FENCE |
>> -               __EXEC_OBJECT_HAS_PAGES);
>> +    ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
>>   }
>>     static void eb_vma_array_destroy(struct kref *kref)
>> @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
>>       list_add_tail(&ev->lock_link, &eb->lock);
>>   }
>>   +static int eb_vma_get_pages(struct i915_execbuffer *eb,
>> +                struct eb_vma *ev,
>> +                u64 idx)
>> +{
>> +    struct i915_vma *vma = ev->vma;
>> +    int err;
>> +
>> +    /* XXX also preallocate PD for vma */
>> +
>> +    err = ____i915_gem_object_get_pages_async(vma->obj);
>> +    if (err)
>> +        return err;
>> +
>> +    return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
>> +}
>> +
>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
>> +{
>> +    const u64 idx = eb->context->timeline->fence_context;
>> +    struct ww_acquire_ctx acquire;
>> +    struct eb_vma *ev;
>> +    int err;
>> +
>> +    eb->mm_fence = __dma_fence_create_proxy(0, 0);
>> +    if (!eb->mm_fence)
>> +        return -ENOMEM;
>
> Question: eb is local to this thread, right, so eb->mm_fence is not 
> considered "published" yet?
>
>> +
>> +    ww_acquire_init(&acquire, &reservation_ww_class);
>> +
>> +    err = eb_lock_vma(eb, &acquire);
>> +    if (err)
>> +        goto out;
>> +
>> +    ww_acquire_done(&acquire);
>> +
>> +    list_for_each_entry(ev, &eb->lock, lock_link) {
>> +        struct i915_vma *vma = ev->vma;
>> +
>> +        if (err == 0)
>> +            err = eb_vma_get_pages(eb, ev, idx);
>
> I figure this is where you publish the proxy fence? If so, the fence 
> signaling critical path starts with this loop, and that means any code 
> we call between here and submission complete (including spawned work 
> we need to wait for before submission) may not lock the 
> reservation_ww_class nor (still being discussed) allocate memory. It 
> looks like i915_pin_vma takes a reservation_ww_class. And all memory 
> pinning seems to be in the fence critical path as well?

And I think even if we at some point end up with the allocation 
annotation the other way around, allowing memory allocations in fence 
signalling critical paths, both relocations and userpointer would cause 
lockdep problems because of

mmap_sem->reservation_object->fence_wait (fault handlers, lockdep priming)
vs
fence_critical->gup/copy_from_user->mmap_sem
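
Spelled out as code, those two chains correspond roughly to the following
(purely illustrative, not actual driver paths):

/* Chain A: fault handler, mmap_sem held by the core mm around ->fault() */
static vm_fault_t sketch_fault(struct vm_fault *vmf)
{
	struct drm_i915_gem_object *obj = vmf->vma->vm_private_data;

	/* mmap_sem -> reservation_object -> fence wait */
	if (dma_resv_lock(obj->base.resv, NULL) == 0) {
		dma_resv_wait_timeout_rcu(obj->base.resv, true, true,
					  MAX_SCHEDULE_TIMEOUT);
		dma_resv_unlock(obj->base.resv);
	}

	return VM_FAULT_NOPAGE;
}

/* Chain B: work that a fence signal depends upon */
static int sketch_gup_work(void __user *ptr)
{
	char c;

	/* fence critical -> copy_from_user -> mmap_sem, closing the loop */
	return copy_from_user(&c, ptr, sizeof(c)) ? -EFAULT : 0;
}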

/Thomas


>
> /Thomas
>
>

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 12:57     ` Thomas Hellström (Intel)
@ 2020-06-23 14:01       ` Chris Wilson
  2020-06-23 15:09         ` Thomas Hellström (Intel)
  0 siblings, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-23 14:01 UTC (permalink / raw)
  To: Thomas Hellström, intel-gfx

Quoting Thomas Hellström (Intel) (2020-06-23 13:57:06)
> 
> On 6/23/20 1:22 PM, Thomas Hellström (Intel) wrote:
> > Hi, Chris,
> >
> > On 6/22/20 11:59 AM, Chris Wilson wrote:
> >> In order to actually handle eviction and what not, we need to process
> >> all the objects together under a common lock, reservation_ww_class. As
> >> such, do a memory reservation pass after looking up the object/vma,
> >> which then feeds into the rest of execbuf [relocation, cmdparsing,
> >> flushing and ofc execution].
> >>
> >> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >> ---
> >>   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> >>   1 file changed, 70 insertions(+), 21 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
> >> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >> index 46fcbdf8161c..8db2e013465f 100644
> >> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >> @@ -53,10 +53,9 @@ struct eb_vma_array {
> >>     #define __EXEC_OBJECT_HAS_PIN        BIT(31)
> >>   #define __EXEC_OBJECT_HAS_FENCE        BIT(30)
> >> -#define __EXEC_OBJECT_HAS_PAGES        BIT(29)
> >> -#define __EXEC_OBJECT_NEEDS_MAP        BIT(28)
> >> -#define __EXEC_OBJECT_NEEDS_BIAS    BIT(27)
> >> -#define __EXEC_OBJECT_INTERNAL_FLAGS    (~0u << 27) /* all of the 
> >> above */
> >> +#define __EXEC_OBJECT_NEEDS_MAP        BIT(29)
> >> +#define __EXEC_OBJECT_NEEDS_BIAS    BIT(28)
> >> +#define __EXEC_OBJECT_INTERNAL_FLAGS    (~0u << 28) /* all of the 
> >> above */
> >>     #define __EXEC_HAS_RELOC    BIT(31)
> >>   #define __EXEC_INTERNAL_FLAGS    (~0u << 31)
> >> @@ -241,6 +240,8 @@ struct i915_execbuffer {
> >>       struct intel_context *context; /* logical state for the request */
> >>       struct i915_gem_context *gem_context; /** caller's context */
> >>   +    struct dma_fence *mm_fence;
> >> +
> >>       struct i915_request *request; /** our request to build */
> >>       struct eb_vma *batch; /** identity of the batch obj/vma */
> >>       struct i915_vma *trampoline; /** trampoline used for chaining */
> >> @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct 
> >> eb_vma *ev)
> >>       if (ev->flags & __EXEC_OBJECT_HAS_PIN)
> >>           __i915_vma_unpin(vma);
> >>   -    if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
> >> -        i915_gem_object_unpin_pages(vma->obj);
> >> -
> >> -    ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
> >> -               __EXEC_OBJECT_HAS_FENCE |
> >> -               __EXEC_OBJECT_HAS_PAGES);
> >> +    ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
> >>   }
> >>     static void eb_vma_array_destroy(struct kref *kref)
> >> @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
> >>       list_add_tail(&ev->lock_link, &eb->lock);
> >>   }
> >>   +static int eb_vma_get_pages(struct i915_execbuffer *eb,
> >> +                struct eb_vma *ev,
> >> +                u64 idx)
> >> +{
> >> +    struct i915_vma *vma = ev->vma;
> >> +    int err;
> >> +
> >> +    /* XXX also preallocate PD for vma */
> >> +
> >> +    err = ____i915_gem_object_get_pages_async(vma->obj);
> >> +    if (err)
> >> +        return err;
> >> +
> >> +    return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
> >> +}
> >> +
> >> +static int eb_reserve_mm(struct i915_execbuffer *eb)
> >> +{
> >> +    const u64 idx = eb->context->timeline->fence_context;
> >> +    struct ww_acquire_ctx acquire;
> >> +    struct eb_vma *ev;
> >> +    int err;
> >> +
> >> +    eb->mm_fence = __dma_fence_create_proxy(0, 0);
> >> +    if (!eb->mm_fence)
> >> +        return -ENOMEM;
> >
> > Question: eb is local to this thread, right, so eb->mm_fence is not 
> > considered "published" yet?
> >
> >> +
> >> +    ww_acquire_init(&acquire, &reservation_ww_class);
> >> +
> >> +    err = eb_lock_vma(eb, &acquire);
> >> +    if (err)
> >> +        goto out;
> >> +
> >> +    ww_acquire_done(&acquire);
> >> +
> >> +    list_for_each_entry(ev, &eb->lock, lock_link) {
> >> +        struct i915_vma *vma = ev->vma;
> >> +
> >> +        if (err == 0)
> >> +            err = eb_vma_get_pages(eb, ev, idx);
> >
> > I figure this is where you publish the proxy fence? If so, the fence 
> > signaling critical path starts with this loop, and that means any code 
> > we call between here and submission complete (including spawned work 
> > we need to wait for before submission) may not lock the 
> > reservation_ww_class nor (still being discussed) allocate memory.

Yes, at this point we have reserved the memory for the execbuf.

> > It 
> > looks like i915_pin_vma takes a reservation_ww_class. And all memory 
> > pinning seems to be in the fence critical path as well?

Correct, it's not meant to be waiting inside i915_vma_pin(); the
intention was to pass in memory, and then we would not need to
do the acquire ourselves. As we have just reserved the memory in the
above loop, this should not be an issue. I was trying to keep the
change minimal and allow incremental conversions. It does however need
to add a reference to the object for the work it spawns -- equally
though there is an async eviction pass later in execbuf. The challenge
here is that the greedy grab of bound vma is faster than doing the
unbound eviction handling (even when eviction is not required).

> And I think even if we at some point end up with the allocation 
> annotation the other way around, allowing memory allocations in fence 
> signalling critical paths, both relocations and userpointer would cause 
> lockdep problems because of
> 
> mmap_sem->reservation_object->fence_wait (fault handlers, lockdep priming)

We don't wait inside mmap_sem. One cannot, you do not know the locking
context, so you can only try to reclaim idle space. So you end up with
the issue of a multitude of threads each trying to claim the last slice
of the aperture/backing storage, not being able to directly reclaim and
so have to hit the equivalent of kswapd.

> vs
> fence_critical->gup/copy_from_user->mmap_sem

Which exists today, even the busy wait loop is implicit linkage; you only
need userspace to be holding a resource on the gpu to create the deadlock.
I've been using the userfault handler to develop test cases where we can
arbitrarily block the userptr.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 14:01       ` Chris Wilson
@ 2020-06-23 15:09         ` Thomas Hellström (Intel)
  2020-06-23 16:00           ` Chris Wilson
  2020-06-23 16:17           ` Chris Wilson
  0 siblings, 2 replies; 48+ messages in thread
From: Thomas Hellström (Intel) @ 2020-06-23 15:09 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 6/23/20 4:01 PM, Chris Wilson wrote:
> Quoting Thomas Hellström (Intel) (2020-06-23 13:57:06)
>> On 6/23/20 1:22 PM, Thomas Hellström (Intel) wrote:
>>> Hi, Chris,
>>>
>>> On 6/22/20 11:59 AM, Chris Wilson wrote:
>>>> In order to actually handle eviction and what not, we need to process
>>>> all the objects together under a common lock, reservation_ww_class. As
>>>> such, do a memory reservation pass after looking up the object/vma,
>>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
>>>> flushing and ofc execution].
>>>>
>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>> ---
>>>>    .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
>>>>    1 file changed, 70 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>> index 46fcbdf8161c..8db2e013465f 100644
>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>> @@ -53,10 +53,9 @@ struct eb_vma_array {
>>>>      #define __EXEC_OBJECT_HAS_PIN        BIT(31)
>>>>    #define __EXEC_OBJECT_HAS_FENCE        BIT(30)
>>>> -#define __EXEC_OBJECT_HAS_PAGES        BIT(29)
>>>> -#define __EXEC_OBJECT_NEEDS_MAP        BIT(28)
>>>> -#define __EXEC_OBJECT_NEEDS_BIAS    BIT(27)
>>>> -#define __EXEC_OBJECT_INTERNAL_FLAGS    (~0u << 27) /* all of the
>>>> above */
>>>> +#define __EXEC_OBJECT_NEEDS_MAP        BIT(29)
>>>> +#define __EXEC_OBJECT_NEEDS_BIAS    BIT(28)
>>>> +#define __EXEC_OBJECT_INTERNAL_FLAGS    (~0u << 28) /* all of the
>>>> above */
>>>>      #define __EXEC_HAS_RELOC    BIT(31)
>>>>    #define __EXEC_INTERNAL_FLAGS    (~0u << 31)
>>>> @@ -241,6 +240,8 @@ struct i915_execbuffer {
>>>>        struct intel_context *context; /* logical state for the request */
>>>>        struct i915_gem_context *gem_context; /** caller's context */
>>>>    +    struct dma_fence *mm_fence;
>>>> +
>>>>        struct i915_request *request; /** our request to build */
>>>>        struct eb_vma *batch; /** identity of the batch obj/vma */
>>>>        struct i915_vma *trampoline; /** trampoline used for chaining */
>>>> @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct
>>>> eb_vma *ev)
>>>>        if (ev->flags & __EXEC_OBJECT_HAS_PIN)
>>>>            __i915_vma_unpin(vma);
>>>>    -    if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
>>>> -        i915_gem_object_unpin_pages(vma->obj);
>>>> -
>>>> -    ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
>>>> -               __EXEC_OBJECT_HAS_FENCE |
>>>> -               __EXEC_OBJECT_HAS_PAGES);
>>>> +    ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
>>>>    }
>>>>      static void eb_vma_array_destroy(struct kref *kref)
>>>> @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
>>>>        list_add_tail(&ev->lock_link, &eb->lock);
>>>>    }
>>>>    +static int eb_vma_get_pages(struct i915_execbuffer *eb,
>>>> +                struct eb_vma *ev,
>>>> +                u64 idx)
>>>> +{
>>>> +    struct i915_vma *vma = ev->vma;
>>>> +    int err;
>>>> +
>>>> +    /* XXX also preallocate PD for vma */
>>>> +
>>>> +    err = ____i915_gem_object_get_pages_async(vma->obj);
>>>> +    if (err)
>>>> +        return err;
>>>> +
>>>> +    return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
>>>> +}
>>>> +
>>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
>>>> +{
>>>> +    const u64 idx = eb->context->timeline->fence_context;
>>>> +    struct ww_acquire_ctx acquire;
>>>> +    struct eb_vma *ev;
>>>> +    int err;
>>>> +
>>>> +    eb->mm_fence = __dma_fence_create_proxy(0, 0);
>>>> +    if (!eb->mm_fence)
>>>> +        return -ENOMEM;
>>> Question: eb is local to this thread, right, so eb->mm_fence is not
>>> considered "published" yet?
>>>
>>>> +
>>>> +    ww_acquire_init(&acquire, &reservation_ww_class);
>>>> +
>>>> +    err = eb_lock_vma(eb, &acquire);
>>>> +    if (err)
>>>> +        goto out;
>>>> +
>>>> +    ww_acquire_done(&acquire);
>>>> +
>>>> +    list_for_each_entry(ev, &eb->lock, lock_link) {
>>>> +        struct i915_vma *vma = ev->vma;
>>>> +
>>>> +        if (err == 0)
>>>> +            err = eb_vma_get_pages(eb, ev, idx);
>>> I figure this is where you publish the proxy fence? If so, the fence
>>> signaling critical path starts with this loop, and that means any code
>>> we call between here and submission complete (including spawned work
>>> we need to wait for before submission) may not lock the
>>> reservation_ww_class nor (still being discussed) allocate memory.
> Yes, at this point we have reserved the memory for the execbuf.
>
>>> It
>>> looks like i915_pin_vma takes a reservation_ww_class. And all memory
>>> pinning seems to be in the fence critical path as well?
> Correct, it's not meant to be waiting inside i915_vma_pin(); the
> intention was to pass in memory, and then we would not need to
> do the acquire ourselves. As we have just reserved the memory in the
> above loop, this should not be an issue. I was trying to keep the
> change minimal and allow incremental conversions. It does however need
> to add a reference to the object for the work it spawns -- equally
> though there is an async eviction pass later in execbuf. The challenge
> here is that the greedy grab of bound vma is faster than doing the
> unbound eviction handling (even when eviction is not required).

So for i915_vma_pin, the chain looks like

fence_critical_start(eb_reserve_mm) ->
dma_resv_lock_interruptible(i915_vma_pin) -> lockdep issue.

You can't take the dma_resv_lock inside a fence critical section.

And for the memory allocation: with the proposed dma_fence annotations, it 
looks like the fence is published in the first loop iteration, starting the 
critical section, meaning that any memory allocation that follows will cause 
a lockdep issue. That includes worker threads.
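
For reference, a minimal sketch of what those proposed annotations would 
flag, assuming the dma_fence_begin_signalling()/dma_fence_end_signalling() 
helpers from the dri-devel series; the function below is purely 
illustrative, not the execbuf code:

#include <linux/dma-fence.h>
#include <linux/dma-resv.h>
#include <linux/slab.h>

/* Illustrative only: the span between begin/end is what lockdep treats
 * as the fence signalling critical section. */
static int signalling_section_sketch(struct dma_resv *resv)
{
        bool cookie;
        void *ptr;
        int err;

        cookie = dma_fence_begin_signalling();

        /* Both of these would now produce lockdep splats: taking a
         * dma_resv lock, and a reclaim-capable allocation. */
        err = dma_resv_lock_interruptible(resv, NULL);
        if (!err)
                dma_resv_unlock(resv);

        ptr = kmalloc(64, GFP_KERNEL);
        kfree(ptr);

        dma_fence_end_signalling(cookie);
        return err;
}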

>
>> And I think even if we at some point end up with the allocation
>> annotation the other way around, allowing memory allocations in fence
>> signalling critical paths, both relocations and userpointer would cause
>> lockdep problems because of
>>
>> mmap_sem->reservation_object->fence_wait (fault handlers, lockdep priming)
> We don't wait inside mmap_sem. One cannot, you do not know the locking
> context, so you can only try to reclaim idle space. So you end up with
> the issue of a multitude of threads each trying to claim the last slice
> of the aperture/backing storage, not being able to directly reclaim and
> so have to hit the equivalent of kswapd.

I don't think I follow you here. There are a number of drivers that wait 
for dma_fences inside the fault handlers with mmap_sem held for data to 
be migrated before the pte is set up.

>
>> vs
>> fence_critical->gup/copy_from_user->mmap_sem
> Which exists today, even the busy wait loop is implicit linkage; you only
> need userspace to be holding a resource on the gpu to create the deadlock.
> I've been using the userfault handler to develop test cases where we can
> arbitrarily block the userptr.

Yes, but in a case where we don't publish the fence early, the above 
would be reduced to the well-known reservation_ww_class vs mmap_sem 
lockdep issue, which other drivers seem to have solved and we could copy 
what they've done.
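
The usual shape of that solution, sketched from memory (roughly what 
TTM's ttm_bo_vm_reserve() does; the helper name and the bare dma_resv 
parameter are made up for illustration, and mmap_sem follows the naming 
in the kernels this series targets):

#include <linux/mm.h>
#include <linux/dma-resv.h>

/* Sketch only: take the reservation in the fault handler without ever
 * blocking on it while mmap_sem is held. */
static vm_fault_t fault_reserve_sketch(struct vm_fault *vmf,
                                       struct dma_resv *resv)
{
        if (dma_resv_trylock(resv))
                return 0; /* locked; the caller does the fault work and unlocks */

        if (vmf->flags & FAULT_FLAG_ALLOW_RETRY) {
                if (!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
                        /* A reference on the object must be held across
                         * dropping mmap_sem; take the lock outside it,
                         * then let the core retry the fault. */
                        up_read(&vmf->vma->vm_mm->mmap_sem);
                        if (!dma_resv_lock_interruptible(resv, NULL))
                                dma_resv_unlock(resv);
                }
                return VM_FAULT_RETRY;
        }

        return VM_FAULT_NOPAGE;
}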

/Thomas




> -Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 10:03     ` Chris Wilson
@ 2020-06-23 15:37       ` Thomas Hellström (Intel)
  2020-06-23 16:37         ` Chris Wilson
  2020-06-23 21:01       ` Dave Airlie
  1 sibling, 1 reply; 48+ messages in thread
From: Thomas Hellström (Intel) @ 2020-06-23 15:37 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 6/23/20 12:03 PM, Chris Wilson wrote:
> Quoting Thomas Hellström (Intel) (2020-06-23 10:33:20)
>> Hi, Chris!
>>
>> On 6/22/20 11:59 AM, Chris Wilson wrote:
>>> In order to actually handle eviction and what not, we need to process
>>> all the objects together under a common lock, reservation_ww_class. As
>>> such, do a memory reservation pass after looking up the object/vma,
>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
>>> flushing and ofc execution].
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>    .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
>>>    1 file changed, 70 insertions(+), 21 deletions(-)
>>>
>> Which tree is this against? The series doesn't apply cleanly against
>> drm-tip?
> It's continuing on from the scheduler patches, the bug fixes and the
> iris-deferred-fence work. I thought throwing all of those old patches
> into the pile would have been distracting.

Is there somewhere you could push a branch for reviewer consumption?

Thanks,

/Thomas


* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 15:09         ` Thomas Hellström (Intel)
@ 2020-06-23 16:00           ` Chris Wilson
  2020-06-23 16:17           ` Chris Wilson
  1 sibling, 0 replies; 48+ messages in thread
From: Chris Wilson @ 2020-06-23 16:00 UTC (permalink / raw)
  To: Thomas Hellström, intel-gfx

Quoting Thomas Hellström (Intel) (2020-06-23 16:09:08)
> 
> On 6/23/20 4:01 PM, Chris Wilson wrote:
> > Quoting Thomas Hellström (Intel) (2020-06-23 13:57:06)
> >> On 6/23/20 1:22 PM, Thomas Hellström (Intel) wrote:
> >>> Hi, Chris,
> >>>
> >>> On 6/22/20 11:59 AM, Chris Wilson wrote:
> >>>> In order to actually handle eviction and what not, we need to process
> >>>> all the objects together under a common lock, reservation_ww_class. As
> >>>> such, do a memory reservation pass after looking up the object/vma,
> >>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
> >>>> flushing and ofc execution].
> >>>>
> >>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>> ---
> >>>>    .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> >>>>    1 file changed, 70 insertions(+), 21 deletions(-)
> >>>>
> >>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>>> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>>> index 46fcbdf8161c..8db2e013465f 100644
> >>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>>> @@ -53,10 +53,9 @@ struct eb_vma_array {
> >>>>      #define __EXEC_OBJECT_HAS_PIN        BIT(31)
> >>>>    #define __EXEC_OBJECT_HAS_FENCE        BIT(30)
> >>>> -#define __EXEC_OBJECT_HAS_PAGES        BIT(29)
> >>>> -#define __EXEC_OBJECT_NEEDS_MAP        BIT(28)
> >>>> -#define __EXEC_OBJECT_NEEDS_BIAS    BIT(27)
> >>>> -#define __EXEC_OBJECT_INTERNAL_FLAGS    (~0u << 27) /* all of the above */
> >>>> +#define __EXEC_OBJECT_NEEDS_MAP        BIT(29)
> >>>> +#define __EXEC_OBJECT_NEEDS_BIAS    BIT(28)
> >>>> +#define __EXEC_OBJECT_INTERNAL_FLAGS    (~0u << 28) /* all of the above */
> >>>>      #define __EXEC_HAS_RELOC    BIT(31)
> >>>>    #define __EXEC_INTERNAL_FLAGS    (~0u << 31)
> >>>> @@ -241,6 +240,8 @@ struct i915_execbuffer {
> >>>>        struct intel_context *context; /* logical state for the request */
> >>>>        struct i915_gem_context *gem_context; /** caller's context */
> >>>>    +    struct dma_fence *mm_fence;
> >>>> +
> >>>>        struct i915_request *request; /** our request to build */
> >>>>        struct eb_vma *batch; /** identity of the batch obj/vma */
> >>>>        struct i915_vma *trampoline; /** trampoline used for chaining */
> >>>> @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct eb_vma *ev)
> >>>>        if (ev->flags & __EXEC_OBJECT_HAS_PIN)
> >>>>            __i915_vma_unpin(vma);
> >>>>    -    if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
> >>>> -        i915_gem_object_unpin_pages(vma->obj);
> >>>> -
> >>>> -    ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
> >>>> -               __EXEC_OBJECT_HAS_FENCE |
> >>>> -               __EXEC_OBJECT_HAS_PAGES);
> >>>> +    ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
> >>>>    }
> >>>>      static void eb_vma_array_destroy(struct kref *kref)
> >>>> @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
> >>>>        list_add_tail(&ev->lock_link, &eb->lock);
> >>>>    }
> >>>>    +static int eb_vma_get_pages(struct i915_execbuffer *eb,
> >>>> +                struct eb_vma *ev,
> >>>> +                u64 idx)
> >>>> +{
> >>>> +    struct i915_vma *vma = ev->vma;
> >>>> +    int err;
> >>>> +
> >>>> +    /* XXX also preallocate PD for vma */
> >>>> +
> >>>> +    err = ____i915_gem_object_get_pages_async(vma->obj);
> >>>> +    if (err)
> >>>> +        return err;
> >>>> +
> >>>> +    return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
> >>>> +}
> >>>> +
> >>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
> >>>> +{
> >>>> +    const u64 idx = eb->context->timeline->fence_context;
> >>>> +    struct ww_acquire_ctx acquire;
> >>>> +    struct eb_vma *ev;
> >>>> +    int err;
> >>>> +
> >>>> +    eb->mm_fence = __dma_fence_create_proxy(0, 0);
> >>>> +    if (!eb->mm_fence)
> >>>> +        return -ENOMEM;
> >>> Question: eb is local to this thread, right, so eb->mm_fence is not
> >>> considered "published" yet?
> >>>
> >>>> +
> >>>> +    ww_acquire_init(&acquire, &reservation_ww_class);
> >>>> +
> >>>> +    err = eb_lock_vma(eb, &acquire);
> >>>> +    if (err)
> >>>> +        goto out;
> >>>> +
> >>>> +    ww_acquire_done(&acquire);
> >>>> +
> >>>> +    list_for_each_entry(ev, &eb->lock, lock_link) {
> >>>> +        struct i915_vma *vma = ev->vma;
> >>>> +
> >>>> +        if (err == 0)
> >>>> +            err = eb_vma_get_pages(eb, ev, idx);
> >>> I figure this is where you publish the proxy fence? If so, the fence
> >>> signaling critical path starts with this loop, and that means any code
> >>> we call between here and submission complete (including spawned work
> >>> we need to wait for before submission) may not lock the
> >>> reservation_ww_class nor (still being discussed) allocate memory.
> > Yes, at this point we have reserved the memory for the execbuf.
> >
> >>> It
> >>> looks like i915_pin_vma takes a reservation_ww_class. And all memory
> >>> pinning seems to be in the fence critical path as well?
> > Correct, it's not meant to be waiting inside i915_vma_pin(); the
> > intention was to pass in memory, and then we would not need to
> > do the acquire ourselves. As we have just reserved the memory in the
> > above loop, this should not be an issue. I was trying to keep the
> > change minimal and allow incremental conversions. It does however need
> > to add a reference to the object for the work it spawns -- equally
> > though there is an async eviction pass later in execbuf. The challenge
> > here is that the greedy grab of bound vma is faster than doing the
> > unbound eviction handling (even when eviction is not required).
> 
> So for the i915_vma_pin, it looks like
> 
> fence_critical_start(eb_reserve_mm) -> 
> dma_resv_lock_interruptible(i915_vma_pin) -> lockdep issue.
> 
> You can't take the dma_resv_lock inside a fence critical section.

Aye, and that is trivially liftable since the allocation is provided by
the caller. But we still want one-off access, hence the preference for
keeping the convenience of i915_vma_pin until all callers have
transitioned.

> And for the memory allocation, it looks like the fence is published in 
> the first loop iteration, starting the critical section, meaning that 
> any memory allocation that follows will cause a lockdep issue. That 
> includes worker threads. (with the proposed dma_fence annotations).

I fail to be convinced that proposal is a good solution.

> >> And I think even if we at some point end up with the allocation
> >> annotation the other way around, allowing memory allocations in fence
> >> signalling critical paths, both relocations and userpointer would cause
> >> lockdep problems because of
> >>
> >> mmap_sem->reservation_object->fence_wait (fault handlers, lockdep priming)
> > We don't wait inside mmap_sem. One cannot, you do not know the locking
> > context, so you can only try to reclaim idle space. So you end up with
> > the issue of a multitude of threads each trying to claim the last slice
> > of the aperture/backing storage, not being able to directly reclaim and
> > so have to hit the equivalent of kswapd.
> 
> I don't think I follow you here. There are a number of drivers that wait 
> for dma_fences inside the fault handlers with mmap_sem held for data to 
> be migrated before the pte is set up.

I mean that userspace is at liberty to prevent the migration
arbitrarily, forming a resource lock. Waiting for userspace while holding
any mutex is an easy deadlock. Hence any such wait for the eviction of another
must be interruptible (either by signal or timeout), ensure forced completion,
or never actually wait. A wait for itself should at least be killable.

> >> vs
> >> fence_critical->gup/copy_from_user->mmap_sem
> > Which exists today, even the busy wait loop is implicit linkage; you only
> > need userspace to be holding a resource on the gpu to create the deadlock.
> > I've been using the userfault handler to develop test cases where we can
> > arbitrarily block the userptr.
> 
> Yes but in a case where we don't publish the fence early, the above 
> would be reduced to the well known reservation_ww_class vs mmap_sem 
> lockdep issue, which other drivers seem to have solved and we could copy 
> what they've done.

I don't see the difference. We have gup, malloc, copy_user inside a fence
as it stands. What I would want to address, with, say, ttm_bo_vm_reserve, is
that faulting should only care about the migration chain, and we should
be careful when selecting eviction candidates (if only because we need
to respect memory prioritisation).
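
As a sketch of that direction (assuming a per-object migration fence 
along the lines of bo->moving; the function name and the timeout bound 
are hypothetical):

#include <linux/dma-fence.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

/* Sketch: the fault path waits only on the migration fence for this
 * object, interruptibly and with a bound, rather than on everything
 * attached to the reservation object. */
static int wait_for_migration_sketch(struct dma_fence *moving)
{
        long ret;

        if (!moving || dma_fence_is_signaled(moving))
                return 0;

        ret = dma_fence_wait_timeout(moving, true, msecs_to_jiffies(100));
        if (ret == 0)
                return -ETIMEDOUT;

        return ret < 0 ? ret : 0;
}
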
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 15:09         ` Thomas Hellström (Intel)
  2020-06-23 16:00           ` Chris Wilson
@ 2020-06-23 16:17           ` Chris Wilson
  2020-06-23 16:29             ` Thomas Hellström (Intel)
  1 sibling, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-23 16:17 UTC (permalink / raw)
  To: Thomas Hellström, intel-gfx

Quoting Thomas Hellström (Intel) (2020-06-23 16:09:08)
> You can't take the dma_resv_lock inside a fence critical section.

I much prefer the alternative interpretation: you can't wait inside a
dma_resv_lock.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 16:17           ` Chris Wilson
@ 2020-06-23 16:29             ` Thomas Hellström (Intel)
  2020-06-23 16:46               ` Chris Wilson
  0 siblings, 1 reply; 48+ messages in thread
From: Thomas Hellström (Intel) @ 2020-06-23 16:29 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 6/23/20 6:17 PM, Chris Wilson wrote:
> Quoting Thomas Hellström (Intel) (2020-06-23 16:09:08)
>> You can't take the dma_resv_lock inside a fence critical section.
> I much prefer the alternative interpretation, you can't wait inside a
> dma_resv_lock.
> -Chris

I respect your point of view, although I think we need to focus 
on what we have to do in the i915 driver.

/Thomas



* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 11:22   ` Thomas Hellström (Intel)
  2020-06-23 12:57     ` Thomas Hellström (Intel)
@ 2020-06-23 16:36     ` Chris Wilson
  2020-06-23 18:21       ` Thomas Hellström (Intel)
  1 sibling, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-23 16:36 UTC (permalink / raw)
  To: Thomas Hellström, intel-gfx

Quoting Thomas Hellström (Intel) (2020-06-23 12:22:11)
> Hi, Chris,
> 
> On 6/22/20 11:59 AM, Chris Wilson wrote:
> > In order to actually handle eviction and what not, we need to process
> > all the objects together under a common lock, reservation_ww_class. As
> > such, do a memory reservation pass after looking up the object/vma,
> > which then feeds into the rest of execbuf [relocation, cmdparsing,
> > flushing and ofc execution].
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> >   1 file changed, 70 insertions(+), 21 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> > index 46fcbdf8161c..8db2e013465f 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> > @@ -53,10 +53,9 @@ struct eb_vma_array {
> >   
> >   #define __EXEC_OBJECT_HAS_PIN               BIT(31)
> >   #define __EXEC_OBJECT_HAS_FENCE             BIT(30)
> > -#define __EXEC_OBJECT_HAS_PAGES              BIT(29)
> > -#define __EXEC_OBJECT_NEEDS_MAP              BIT(28)
> > -#define __EXEC_OBJECT_NEEDS_BIAS     BIT(27)
> > -#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 27) /* all of the above */
> > +#define __EXEC_OBJECT_NEEDS_MAP              BIT(29)
> > +#define __EXEC_OBJECT_NEEDS_BIAS     BIT(28)
> > +#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 28) /* all of the above */
> >   
> >   #define __EXEC_HAS_RELOC    BIT(31)
> >   #define __EXEC_INTERNAL_FLAGS       (~0u << 31)
> > @@ -241,6 +240,8 @@ struct i915_execbuffer {
> >       struct intel_context *context; /* logical state for the request */
> >       struct i915_gem_context *gem_context; /** caller's context */
> >   
> > +     struct dma_fence *mm_fence;
> > +
> >       struct i915_request *request; /** our request to build */
> >       struct eb_vma *batch; /** identity of the batch obj/vma */
> >       struct i915_vma *trampoline; /** trampoline used for chaining */
> > @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct eb_vma *ev)
> >       if (ev->flags & __EXEC_OBJECT_HAS_PIN)
> >               __i915_vma_unpin(vma);
> >   
> > -     if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
> > -             i915_gem_object_unpin_pages(vma->obj);
> > -
> > -     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
> > -                    __EXEC_OBJECT_HAS_FENCE |
> > -                    __EXEC_OBJECT_HAS_PAGES);
> > +     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
> >   }
> >   
> >   static void eb_vma_array_destroy(struct kref *kref)
> > @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
> >       list_add_tail(&ev->lock_link, &eb->lock);
> >   }
> >   
> > +static int eb_vma_get_pages(struct i915_execbuffer *eb,
> > +                         struct eb_vma *ev,
> > +                         u64 idx)
> > +{
> > +     struct i915_vma *vma = ev->vma;
> > +     int err;
> > +
> > +     /* XXX also preallocate PD for vma */
> > +
> > +     err = ____i915_gem_object_get_pages_async(vma->obj);
> > +     if (err)
> > +             return err;
> > +
> > +     return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
> > +}
> > +
> > +static int eb_reserve_mm(struct i915_execbuffer *eb)
> > +{
> > +     const u64 idx = eb->context->timeline->fence_context;
> > +     struct ww_acquire_ctx acquire;
> > +     struct eb_vma *ev;
> > +     int err;
> > +
> > +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
> > +     if (!eb->mm_fence)
> > +             return -ENOMEM;
> 
> Question: eb is local to this thread, right, so eb->mm_fence is not 
> considered "published" yet?
> 
> > +
> > +     ww_acquire_init(&acquire, &reservation_ww_class);
> > +
> > +     err = eb_lock_vma(eb, &acquire);
> > +     if (err)
> > +             goto out;
> > +
> > +     ww_acquire_done(&acquire);
> > +
> > +     list_for_each_entry(ev, &eb->lock, lock_link) {
> > +             struct i915_vma *vma = ev->vma;
> > +
> > +             if (err == 0)
> > +                     err = eb_vma_get_pages(eb, ev, idx);
> 
> I figure this is where you publish the proxy fence? If so, the fence 
> signaling critical path starts with this loop,

Hmm, actually at this moment the fence is still very much internal,
being used only as a reference token, and the async fence for the pages
is still only in the internal migration slot [alongside the reference
tokens].

Those fences will not be attached to the dma_resv until the chains are
completed in move-to-gpu.

That might be enough of a difference to consider.
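
For readers following along, the eventual attach step is the usual 
dma_resv publication, sketched here with made-up names and the dma_resv 
API of the time (this is not the move-to-gpu code itself):

#include <linux/dma-fence.h>
#include <linux/dma-resv.h>

/* Sketch: only at this point does the fence become visible to other
 * threads and drivers through the reservation object. */
static int publish_fence_sketch(struct dma_resv *resv,
                                struct dma_fence *fence,
                                bool write)
{
        int err = 0;

        /* The caller holds dma_resv_lock(resv). */
        if (write) {
                dma_resv_add_excl_fence(resv, fence);
        } else {
                err = dma_resv_reserve_shared(resv, 1);
                if (!err)
                        dma_resv_add_shared_fence(resv, fence);
        }

        return err;
}
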
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 15:37       ` Thomas Hellström (Intel)
@ 2020-06-23 16:37         ` Chris Wilson
  0 siblings, 0 replies; 48+ messages in thread
From: Chris Wilson @ 2020-06-23 16:37 UTC (permalink / raw)
  To: Thomas Hellström, intel-gfx

Quoting Thomas Hellström (Intel) (2020-06-23 16:37:30)
> 
> On 6/23/20 12:03 PM, Chris Wilson wrote:
> > Quoting Thomas Hellström (Intel) (2020-06-23 10:33:20)
> >> Hi, Chris!
> >>
> >> On 6/22/20 11:59 AM, Chris Wilson wrote:
> >>> In order to actually handle eviction and what not, we need to process
> >>> all the objects together under a common lock, reservation_ww_class. As
> >>> such, do a memory reservation pass after looking up the object/vma,
> >>> which then feeds into the rest of execbuf [relocation, cmdparsing,
> >>> flushing and ofc execution].
> >>>
> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>> ---
> >>>    .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> >>>    1 file changed, 70 insertions(+), 21 deletions(-)
> >>>
> >> Which tree is this against? The series doesn't apply cleanly against
> >> drm-tip?
> > It's continuing on from the scheduler patches, the bug fixes and the
> > iris-deferred-fence work. I thought throwing all of those old patches
> > into the pile would have been distracting.
> 
> Is there somewhere you could push a branch for reviewer consumption?

I added some patches to remove some locked waits and have been
regretting it all day. Coming soon, schedulers for all.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 16:29             ` Thomas Hellström (Intel)
@ 2020-06-23 16:46               ` Chris Wilson
  0 siblings, 0 replies; 48+ messages in thread
From: Chris Wilson @ 2020-06-23 16:46 UTC (permalink / raw)
  To: Thomas Hellström, intel-gfx

Quoting Thomas Hellström (Intel) (2020-06-23 17:29:46)
> 
> On 6/23/20 6:17 PM, Chris Wilson wrote:
> > Quoting Thomas Hellström (Intel) (2020-06-23 16:09:08)
> >> You can't take the dma_resv_lock inside a fence critical section.
> > I much prefer the alternative interpretation, you can't wait inside a
> > dma_resv_lock.
> > -Chris
> 
> I respect your point of view, athough I need to think we need to focus 
> on what we have to do in the i915 driver.

While aiming for small steps, each improving upon the last.

At the end of the day, whether it's a ww_mutex exclusive lock or a
fence shared lock, it's all steps in the pipeline.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 16:36     ` Chris Wilson
@ 2020-06-23 18:21       ` Thomas Hellström (Intel)
  2020-06-23 18:41         ` Chris Wilson
  0 siblings, 1 reply; 48+ messages in thread
From: Thomas Hellström (Intel) @ 2020-06-23 18:21 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 6/23/20 6:36 PM, Chris Wilson wrote:
> Quoting Thomas Hellström (Intel) (2020-06-23 12:22:11)
>> Hi, Chris,
>>
>> On 6/22/20 11:59 AM, Chris Wilson wrote:
>>> In order to actually handle eviction and what not, we need to process
>>> all the objects together under a common lock, reservation_ww_class. As
>>> such, do a memory reservation pass after looking up the object/vma,
>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
>>> flushing and ofc execution].
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>    .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
>>>    1 file changed, 70 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>> index 46fcbdf8161c..8db2e013465f 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>> @@ -53,10 +53,9 @@ struct eb_vma_array {
>>>    
>>>    #define __EXEC_OBJECT_HAS_PIN               BIT(31)
>>>    #define __EXEC_OBJECT_HAS_FENCE             BIT(30)
>>> -#define __EXEC_OBJECT_HAS_PAGES              BIT(29)
>>> -#define __EXEC_OBJECT_NEEDS_MAP              BIT(28)
>>> -#define __EXEC_OBJECT_NEEDS_BIAS     BIT(27)
>>> -#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 27) /* all of the above */
>>> +#define __EXEC_OBJECT_NEEDS_MAP              BIT(29)
>>> +#define __EXEC_OBJECT_NEEDS_BIAS     BIT(28)
>>> +#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 28) /* all of the above */
>>>    
>>>    #define __EXEC_HAS_RELOC    BIT(31)
>>>    #define __EXEC_INTERNAL_FLAGS       (~0u << 31)
>>> @@ -241,6 +240,8 @@ struct i915_execbuffer {
>>>        struct intel_context *context; /* logical state for the request */
>>>        struct i915_gem_context *gem_context; /** caller's context */
>>>    
>>> +     struct dma_fence *mm_fence;
>>> +
>>>        struct i915_request *request; /** our request to build */
>>>        struct eb_vma *batch; /** identity of the batch obj/vma */
>>>        struct i915_vma *trampoline; /** trampoline used for chaining */
>>> @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct eb_vma *ev)
>>>        if (ev->flags & __EXEC_OBJECT_HAS_PIN)
>>>                __i915_vma_unpin(vma);
>>>    
>>> -     if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
>>> -             i915_gem_object_unpin_pages(vma->obj);
>>> -
>>> -     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
>>> -                    __EXEC_OBJECT_HAS_FENCE |
>>> -                    __EXEC_OBJECT_HAS_PAGES);
>>> +     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
>>>    }
>>>    
>>>    static void eb_vma_array_destroy(struct kref *kref)
>>> @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
>>>        list_add_tail(&ev->lock_link, &eb->lock);
>>>    }
>>>    
>>> +static int eb_vma_get_pages(struct i915_execbuffer *eb,
>>> +                         struct eb_vma *ev,
>>> +                         u64 idx)
>>> +{
>>> +     struct i915_vma *vma = ev->vma;
>>> +     int err;
>>> +
>>> +     /* XXX also preallocate PD for vma */
>>> +
>>> +     err = ____i915_gem_object_get_pages_async(vma->obj);
>>> +     if (err)
>>> +             return err;
>>> +
>>> +     return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
>>> +}
>>> +
>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
>>> +{
>>> +     const u64 idx = eb->context->timeline->fence_context;
>>> +     struct ww_acquire_ctx acquire;
>>> +     struct eb_vma *ev;
>>> +     int err;
>>> +
>>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
>>> +     if (!eb->mm_fence)
>>> +             return -ENOMEM;
>> Question: eb is local to this thread, right, so eb->mm_fence is not
>> considered "published" yet?
>>
>>> +
>>> +     ww_acquire_init(&acquire, &reservation_ww_class);
>>> +
>>> +     err = eb_lock_vma(eb, &acquire);
>>> +     if (err)
>>> +             goto out;
>>> +
>>> +     ww_acquire_done(&acquire);
>>> +
>>> +     list_for_each_entry(ev, &eb->lock, lock_link) {
>>> +             struct i915_vma *vma = ev->vma;
>>> +
>>> +             if (err == 0)
>>> +                     err = eb_vma_get_pages(eb, ev, idx);
>> I figure this is where you publish the proxy fence? If so, the fence
>> signaling critical path starts with this loop,
> Hmm, actually at this moment, the fence is still very much internal
> being only used as a reference token,
I think as long as another thread, running in this driver or another gpu 
driver, can theoretically reference the fence pointer from the 
reservation object and wait for the fence, it's considered published.

Also, the ww_mutexes in this context are really all about grabbing a 
random set of resources and associating them with a point in a timeline; 
as the ww_mutexes are released, the fence pointer(s) need to point to 
published fence(s).
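
Condensed into code, the model I mean is roughly the documented 
dma_resv/ww_mutex usage pattern (all names below are made up, and the 
-EDEADLK slowpath is elided for brevity):

#include <linux/dma-fence.h>
#include <linux/dma-resv.h>
#include <linux/ww_mutex.h>

/* Sketch: grab the whole set under one acquire context, publish the
 * fence for this point in the timeline, then drop the locks. */
static int lock_all_and_publish_sketch(struct dma_resv **objs, int n,
                                       struct dma_fence *fence)
{
        struct ww_acquire_ctx ctx;
        int i, err;

        ww_acquire_init(&ctx, &reservation_ww_class);

        for (i = 0; i < n; i++) {
                err = dma_resv_lock_interruptible(objs[i], &ctx);
                if (err) {
                        /* A full version would handle -EDEADLK with
                         * dma_resv_lock_slow() on the contended object
                         * and retry; here we simply back off. */
                        while (i--)
                                dma_resv_unlock(objs[i]);
                        ww_acquire_fini(&ctx);
                        return err;
                }
        }
        ww_acquire_done(&ctx);

        /* By the time these locks are dropped, the fence for this
         * submission is already published in each reservation object. */
        for (i = 0; i < n; i++) {
                dma_resv_add_excl_fence(objs[i], fence);
                dma_resv_unlock(objs[i]);
        }

        ww_acquire_fini(&ctx);
        return 0;
}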

/Thomas



* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 18:21       ` Thomas Hellström (Intel)
@ 2020-06-23 18:41         ` Chris Wilson
  2020-06-23 20:31           ` Thomas Hellström (Intel)
  0 siblings, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-23 18:41 UTC (permalink / raw)
  To: Thomas Hellström, intel-gfx

Quoting Thomas Hellström (Intel) (2020-06-23 19:21:28)
> 
> On 6/23/20 6:36 PM, Chris Wilson wrote:
> > Quoting Thomas Hellström (Intel) (2020-06-23 12:22:11)
> >> Hi, Chris,
> >>
> >> On 6/22/20 11:59 AM, Chris Wilson wrote:
> >>> In order to actually handle eviction and what not, we need to process
> >>> all the objects together under a common lock, reservation_ww_class. As
> >>> such, do a memory reservation pass after looking up the object/vma,
> >>> which then feeds into the rest of execbuf [relocation, cmdparsing,
> >>> flushing and ofc execution].
> >>>
> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>> ---
> >>>    .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> >>>    1 file changed, 70 insertions(+), 21 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>> index 46fcbdf8161c..8db2e013465f 100644
> >>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>> @@ -53,10 +53,9 @@ struct eb_vma_array {
> >>>    
> >>>    #define __EXEC_OBJECT_HAS_PIN               BIT(31)
> >>>    #define __EXEC_OBJECT_HAS_FENCE             BIT(30)
> >>> -#define __EXEC_OBJECT_HAS_PAGES              BIT(29)
> >>> -#define __EXEC_OBJECT_NEEDS_MAP              BIT(28)
> >>> -#define __EXEC_OBJECT_NEEDS_BIAS     BIT(27)
> >>> -#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 27) /* all of the above */
> >>> +#define __EXEC_OBJECT_NEEDS_MAP              BIT(29)
> >>> +#define __EXEC_OBJECT_NEEDS_BIAS     BIT(28)
> >>> +#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 28) /* all of the above */
> >>>    
> >>>    #define __EXEC_HAS_RELOC    BIT(31)
> >>>    #define __EXEC_INTERNAL_FLAGS       (~0u << 31)
> >>> @@ -241,6 +240,8 @@ struct i915_execbuffer {
> >>>        struct intel_context *context; /* logical state for the request */
> >>>        struct i915_gem_context *gem_context; /** caller's context */
> >>>    
> >>> +     struct dma_fence *mm_fence;
> >>> +
> >>>        struct i915_request *request; /** our request to build */
> >>>        struct eb_vma *batch; /** identity of the batch obj/vma */
> >>>        struct i915_vma *trampoline; /** trampoline used for chaining */
> >>> @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct eb_vma *ev)
> >>>        if (ev->flags & __EXEC_OBJECT_HAS_PIN)
> >>>                __i915_vma_unpin(vma);
> >>>    
> >>> -     if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
> >>> -             i915_gem_object_unpin_pages(vma->obj);
> >>> -
> >>> -     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
> >>> -                    __EXEC_OBJECT_HAS_FENCE |
> >>> -                    __EXEC_OBJECT_HAS_PAGES);
> >>> +     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
> >>>    }
> >>>    
> >>>    static void eb_vma_array_destroy(struct kref *kref)
> >>> @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
> >>>        list_add_tail(&ev->lock_link, &eb->lock);
> >>>    }
> >>>    
> >>> +static int eb_vma_get_pages(struct i915_execbuffer *eb,
> >>> +                         struct eb_vma *ev,
> >>> +                         u64 idx)
> >>> +{
> >>> +     struct i915_vma *vma = ev->vma;
> >>> +     int err;
> >>> +
> >>> +     /* XXX also preallocate PD for vma */
> >>> +
> >>> +     err = ____i915_gem_object_get_pages_async(vma->obj);
> >>> +     if (err)
> >>> +             return err;
> >>> +
> >>> +     return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
> >>> +}
> >>> +
> >>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
> >>> +{
> >>> +     const u64 idx = eb->context->timeline->fence_context;
> >>> +     struct ww_acquire_ctx acquire;
> >>> +     struct eb_vma *ev;
> >>> +     int err;
> >>> +
> >>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
> >>> +     if (!eb->mm_fence)
> >>> +             return -ENOMEM;
> >> Question: eb is local to this thread, right, so eb->mm_fence is not
> >> considered "published" yet?
> >>
> >>> +
> >>> +     ww_acquire_init(&acquire, &reservation_ww_class);
> >>> +
> >>> +     err = eb_lock_vma(eb, &acquire);
> >>> +     if (err)
> >>> +             goto out;
> >>> +
> >>> +     ww_acquire_done(&acquire);
> >>> +
> >>> +     list_for_each_entry(ev, &eb->lock, lock_link) {
> >>> +             struct i915_vma *vma = ev->vma;
> >>> +
> >>> +             if (err == 0)
> >>> +                     err = eb_vma_get_pages(eb, ev, idx);
> >> I figure this is where you publish the proxy fence? If so, the fence
> >> signaling critical path starts with this loop,
> > Hmm, actually at this moment, the fence is still very much internal
> > being only used as a reference token,
> I think as long as another thread, running in this driver or another gpu 
> driver can theoretically reference the fence pointer from the 
> reservation object and wait for the fence it's considered published.

It's not in the reservation object.
 
> Also the ww_mutexes in this context are really all about grabbing a 
> random set of resources and associate them with a point in a timeline, 
> as the ww_mutexes are released, the fence pointer(s) need to point to 
> published fence(s).

That's not the purpose of these fences, though. They exist to provide
reference counting on the backing store, alongside the migration fence.
It's extra detail tacked onto the equivalent of bo->moving.

That is not to say that one could build up an async migration chain which
forms a graph back to these; that chain could only be formed once the
operation itself has been published in the dma_resv, though.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 18:41         ` Chris Wilson
@ 2020-06-23 20:31           ` Thomas Hellström (Intel)
  2020-06-23 21:15             ` Chris Wilson
  0 siblings, 1 reply; 48+ messages in thread
From: Thomas Hellström (Intel) @ 2020-06-23 20:31 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 6/23/20 8:41 PM, Chris Wilson wrote:
> Quoting Thomas Hellström (Intel) (2020-06-23 19:21:28)
>> On 6/23/20 6:36 PM, Chris Wilson wrote:
>>> Quoting Thomas Hellström (Intel) (2020-06-23 12:22:11)
>>>> Hi, Chris,
>>>>
>>>> On 6/22/20 11:59 AM, Chris Wilson wrote:
>>>>> In order to actually handle eviction and what not, we need to process
>>>>> all the objects together under a common lock, reservation_ww_class. As
>>>>> such, do a memory reservation pass after looking up the object/vma,
>>>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
>>>>> flushing and ofc execution].
>>>>>
>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> ---
>>>>>     .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
>>>>>     1 file changed, 70 insertions(+), 21 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>> index 46fcbdf8161c..8db2e013465f 100644
>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>> @@ -53,10 +53,9 @@ struct eb_vma_array {
>>>>>     
>>>>>     #define __EXEC_OBJECT_HAS_PIN               BIT(31)
>>>>>     #define __EXEC_OBJECT_HAS_FENCE             BIT(30)
>>>>> -#define __EXEC_OBJECT_HAS_PAGES              BIT(29)
>>>>> -#define __EXEC_OBJECT_NEEDS_MAP              BIT(28)
>>>>> -#define __EXEC_OBJECT_NEEDS_BIAS     BIT(27)
>>>>> -#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 27) /* all of the above */
>>>>> +#define __EXEC_OBJECT_NEEDS_MAP              BIT(29)
>>>>> +#define __EXEC_OBJECT_NEEDS_BIAS     BIT(28)
>>>>> +#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 28) /* all of the above */
>>>>>     
>>>>>     #define __EXEC_HAS_RELOC    BIT(31)
>>>>>     #define __EXEC_INTERNAL_FLAGS       (~0u << 31)
>>>>> @@ -241,6 +240,8 @@ struct i915_execbuffer {
>>>>>         struct intel_context *context; /* logical state for the request */
>>>>>         struct i915_gem_context *gem_context; /** caller's context */
>>>>>     
>>>>> +     struct dma_fence *mm_fence;
>>>>> +
>>>>>         struct i915_request *request; /** our request to build */
>>>>>         struct eb_vma *batch; /** identity of the batch obj/vma */
>>>>>         struct i915_vma *trampoline; /** trampoline used for chaining */
>>>>> @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct eb_vma *ev)
>>>>>         if (ev->flags & __EXEC_OBJECT_HAS_PIN)
>>>>>                 __i915_vma_unpin(vma);
>>>>>     
>>>>> -     if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
>>>>> -             i915_gem_object_unpin_pages(vma->obj);
>>>>> -
>>>>> -     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
>>>>> -                    __EXEC_OBJECT_HAS_FENCE |
>>>>> -                    __EXEC_OBJECT_HAS_PAGES);
>>>>> +     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
>>>>>     }
>>>>>     
>>>>>     static void eb_vma_array_destroy(struct kref *kref)
>>>>> @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
>>>>>         list_add_tail(&ev->lock_link, &eb->lock);
>>>>>     }
>>>>>     
>>>>> +static int eb_vma_get_pages(struct i915_execbuffer *eb,
>>>>> +                         struct eb_vma *ev,
>>>>> +                         u64 idx)
>>>>> +{
>>>>> +     struct i915_vma *vma = ev->vma;
>>>>> +     int err;
>>>>> +
>>>>> +     /* XXX also preallocate PD for vma */
>>>>> +
>>>>> +     err = ____i915_gem_object_get_pages_async(vma->obj);
>>>>> +     if (err)
>>>>> +             return err;
>>>>> +
>>>>> +     return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
>>>>> +}
>>>>> +
>>>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
>>>>> +{
>>>>> +     const u64 idx = eb->context->timeline->fence_context;
>>>>> +     struct ww_acquire_ctx acquire;
>>>>> +     struct eb_vma *ev;
>>>>> +     int err;
>>>>> +
>>>>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
>>>>> +     if (!eb->mm_fence)
>>>>> +             return -ENOMEM;
>>>> Question: eb is local to this thread, right, so eb->mm_fence is not
>>>> considered "published" yet?
>>>>
>>>>> +
>>>>> +     ww_acquire_init(&acquire, &reservation_ww_class);
>>>>> +
>>>>> +     err = eb_lock_vma(eb, &acquire);
>>>>> +     if (err)
>>>>> +             goto out;
>>>>> +
>>>>> +     ww_acquire_done(&acquire);
>>>>> +
>>>>> +     list_for_each_entry(ev, &eb->lock, lock_link) {
>>>>> +             struct i915_vma *vma = ev->vma;
>>>>> +
>>>>> +             if (err == 0)
>>>>> +                     err = eb_vma_get_pages(eb, ev, idx);
>>>> I figure this is where you publish the proxy fence? If so, the fence
>>>> signaling critical path starts with this loop,
>>> Hmm, actually at this moment, the fence is still very much internal
>>> being only used as a reference token,
>> I think as long as another thread, running in this driver or another gpu
>> driver can theoretically reference the fence pointer from the
>> reservation object and wait for the fence it's considered published.
> It's not in the reservation object.
>   
>> Also the ww_mutexes in this context are really all about grabbing a
>> random set of resources and associate them with a point in a timeline,
>> as the ww_mutexes are released, the fence pointer(s) need to point to
>> published fence(s).
> That's not the purpose of these fences, though. They exist to provide
> reference counting on the backing store, along side the migration fence.
> It's extra detail tacked on the equivalent of bo->moving.
>
> That is not to say that one could build up an async migration chain which
> form a graph back to these, that chain could only be formed once the
> operation itself has been published in the dma_resv though.

Hmm. So let's say another thread grabs one of the just-released 
ww_mutexes and wants to schedule a blit from one of the buffers in the 
current operation with high priority. How would that thread know how to 
order that blit operation w.r.t. the current operation?

/Thomas


> -Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 10:03     ` Chris Wilson
  2020-06-23 15:37       ` Thomas Hellström (Intel)
@ 2020-06-23 21:01       ` Dave Airlie
  2020-06-23 21:19         ` Chris Wilson
  1 sibling, 1 reply; 48+ messages in thread
From: Dave Airlie @ 2020-06-23 21:01 UTC (permalink / raw)
  To: Chris Wilson, Christian König; +Cc: Intel Graphics Development

On Tue, 23 Jun 2020 at 20:03, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>
> Quoting Thomas Hellström (Intel) (2020-06-23 10:33:20)
> > Hi, Chris!
> >
> > On 6/22/20 11:59 AM, Chris Wilson wrote:
> > > In order to actually handle eviction and what not, we need to process
> > > all the objects together under a common lock, reservation_ww_class. As
> > > such, do a memory reservation pass after looking up the object/vma,
> > > which then feeds into the rest of execbuf [relocation, cmdparsing,
> > > flushing and ofc execution].
> > >
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > ---
> > >   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> > >   1 file changed, 70 insertions(+), 21 deletions(-)
> > >
> > Which tree is this against? The series doesn't apply cleanly against
> > drm-tip?
>
> It's continuing on from the scheduler patches, the bug fixes and the
> iris-deferred-fence work. I thought throwing all of those old patches
> into the pile would have been distracting.
>
> > ...
> >
> > > +static int eb_reserve_mm(struct i915_execbuffer *eb)
> > > +{
> > > +     const u64 idx = eb->context->timeline->fence_context;
> > > +     struct ww_acquire_ctx acquire;
> > > +     struct eb_vma *ev;
> > > +     int err;
> > > +
> > > +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
> > > +     if (!eb->mm_fence)
> > > +             return -ENOMEM;
> >
> > Where are the proxy fence functions defined?
>
> In dma-fence-proxy.c ;)

The dma-fence-proxy that Christian NAKed before?

Dave.

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 20:31           ` Thomas Hellström (Intel)
@ 2020-06-23 21:15             ` Chris Wilson
  2020-06-24  5:42               ` Thomas Hellström (Intel)
  0 siblings, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-23 21:15 UTC (permalink / raw)
  To: Thomas Hellström, intel-gfx

Quoting Thomas Hellström (Intel) (2020-06-23 21:31:38)
> 
> On 6/23/20 8:41 PM, Chris Wilson wrote:
> > Quoting Thomas Hellström (Intel) (2020-06-23 19:21:28)
> >> On 6/23/20 6:36 PM, Chris Wilson wrote:
> >>> Quoting Thomas Hellström (Intel) (2020-06-23 12:22:11)
> >>>> Hi, Chris,
> >>>>
> >>>> On 6/22/20 11:59 AM, Chris Wilson wrote:
> >>>>> In order to actually handle eviction and what not, we need to process
> >>>>> all the objects together under a common lock, reservation_ww_class. As
> >>>>> such, do a memory reservation pass after looking up the object/vma,
> >>>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
> >>>>> flushing and ofc execution].
> >>>>>
> >>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>> ---
> >>>>>     .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> >>>>>     1 file changed, 70 insertions(+), 21 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>>>> index 46fcbdf8161c..8db2e013465f 100644
> >>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>>>> @@ -53,10 +53,9 @@ struct eb_vma_array {
> >>>>>     
> >>>>>     #define __EXEC_OBJECT_HAS_PIN               BIT(31)
> >>>>>     #define __EXEC_OBJECT_HAS_FENCE             BIT(30)
> >>>>> -#define __EXEC_OBJECT_HAS_PAGES              BIT(29)
> >>>>> -#define __EXEC_OBJECT_NEEDS_MAP              BIT(28)
> >>>>> -#define __EXEC_OBJECT_NEEDS_BIAS     BIT(27)
> >>>>> -#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 27) /* all of the above */
> >>>>> +#define __EXEC_OBJECT_NEEDS_MAP              BIT(29)
> >>>>> +#define __EXEC_OBJECT_NEEDS_BIAS     BIT(28)
> >>>>> +#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 28) /* all of the above */
> >>>>>     
> >>>>>     #define __EXEC_HAS_RELOC    BIT(31)
> >>>>>     #define __EXEC_INTERNAL_FLAGS       (~0u << 31)
> >>>>> @@ -241,6 +240,8 @@ struct i915_execbuffer {
> >>>>>         struct intel_context *context; /* logical state for the request */
> >>>>>         struct i915_gem_context *gem_context; /** caller's context */
> >>>>>     
> >>>>> +     struct dma_fence *mm_fence;
> >>>>> +
> >>>>>         struct i915_request *request; /** our request to build */
> >>>>>         struct eb_vma *batch; /** identity of the batch obj/vma */
> >>>>>         struct i915_vma *trampoline; /** trampoline used for chaining */
> >>>>> @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct eb_vma *ev)
> >>>>>         if (ev->flags & __EXEC_OBJECT_HAS_PIN)
> >>>>>                 __i915_vma_unpin(vma);
> >>>>>     
> >>>>> -     if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
> >>>>> -             i915_gem_object_unpin_pages(vma->obj);
> >>>>> -
> >>>>> -     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
> >>>>> -                    __EXEC_OBJECT_HAS_FENCE |
> >>>>> -                    __EXEC_OBJECT_HAS_PAGES);
> >>>>> +     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
> >>>>>     }
> >>>>>     
> >>>>>     static void eb_vma_array_destroy(struct kref *kref)
> >>>>> @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
> >>>>>         list_add_tail(&ev->lock_link, &eb->lock);
> >>>>>     }
> >>>>>     
> >>>>> +static int eb_vma_get_pages(struct i915_execbuffer *eb,
> >>>>> +                         struct eb_vma *ev,
> >>>>> +                         u64 idx)
> >>>>> +{
> >>>>> +     struct i915_vma *vma = ev->vma;
> >>>>> +     int err;
> >>>>> +
> >>>>> +     /* XXX also preallocate PD for vma */
> >>>>> +
> >>>>> +     err = ____i915_gem_object_get_pages_async(vma->obj);
> >>>>> +     if (err)
> >>>>> +             return err;
> >>>>> +
> >>>>> +     return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
> >>>>> +}
> >>>>> +
> >>>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
> >>>>> +{
> >>>>> +     const u64 idx = eb->context->timeline->fence_context;
> >>>>> +     struct ww_acquire_ctx acquire;
> >>>>> +     struct eb_vma *ev;
> >>>>> +     int err;
> >>>>> +
> >>>>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
> >>>>> +     if (!eb->mm_fence)
> >>>>> +             return -ENOMEM;
> >>>> Question: eb is local to this thread, right, so eb->mm_fence is not
> >>>> considered "published" yet?
> >>>>
> >>>>> +
> >>>>> +     ww_acquire_init(&acquire, &reservation_ww_class);
> >>>>> +
> >>>>> +     err = eb_lock_vma(eb, &acquire);
> >>>>> +     if (err)
> >>>>> +             goto out;
> >>>>> +
> >>>>> +     ww_acquire_done(&acquire);
> >>>>> +
> >>>>> +     list_for_each_entry(ev, &eb->lock, lock_link) {
> >>>>> +             struct i915_vma *vma = ev->vma;
> >>>>> +
> >>>>> +             if (err == 0)
> >>>>> +                     err = eb_vma_get_pages(eb, ev, idx);
> >>>> I figure this is where you publish the proxy fence? If so, the fence
> >>>> signaling critical path starts with this loop,
> >>> Hmm, actually at this moment, the fence is still very much internal
> >>> being only used as a reference token,
> >> I think as long as another thread, running in this driver or another gpu
> >> driver can theoretically reference the fence pointer from the
> >> reservation object and wait for the fence it's considered published.
> > It's not in the reservation object.
> >   
> >> Also the ww_mutexes in this context are really all about grabbing a
> >> random set of resources and associate them with a point in a timeline,
> >> as the ww_mutexes are released, the fence pointer(s) need to point to
> >> published fence(s).
> > That's not the purpose of these fences, though. They exist to provide
> > reference counting on the backing store, along side the migration fence.
> > It's extra detail tacked on the equivalent of bo->moving.
> >
> > That is not to say that one could build up an async migration chain which
> > form a graph back to these, that chain could only be formed once the
> > operation itself has been published in the dma_resv though.
> 
> Hmm. So let's say another thread grabs one of the just released 
> ww_mutexes and wants to schedule a blit from one of the buffers in the 
> current operation with high priority. How would that thread know how to 
> order that blit operation w r t the current operation?

Why would it order?

At this moment in time all that has been reserved is the backing store.
Both threads will issue an await on the same fence. Being the high priority
thread, it will be scheduled first (provided no other ordering is
imposed). Neither thread will block the other inside execbuf, and their
active reference fence is never published directly (indirect coupling
would come after eviction, and if both threads are holding a reference to
the current backing store they are before that eviction), so I do not see
how even someone else can do a locked wait on their unpublished fences.

After that, it is a race as to which thread hits the implicit fencing
ww_mutex lock. (That is, if they are using implicit write fencing on a
shared buffer; a pair of reads to a common buffer are not ordered.) In
that case GEM rules apply: the first to install its write fence is ahead
in the queue. So if the high priority request arrives after the low
priority one, the low priority thread receives the priority boost to high.
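
Sketched loosely, with the dma_resv API names of the time (the function 
name and the await callback are hypothetical; in practice the await 
would be queued on the request being built):

#include <linux/dma-fence.h>
#include <linux/dma-resv.h>

/* Sketch: GEM-style implicit write fencing on a shared buffer; whoever
 * takes the lock first installs their write fence first, and the later
 * submission orders itself behind it. */
static int implicit_write_fence_sketch(struct dma_resv *resv,
                                       struct dma_fence *my_fence,
                                       int (*await)(struct dma_fence *))
{
        struct dma_fence *prev;
        int err;

        err = dma_resv_lock_interruptible(resv, NULL);
        if (err)
                return err;

        /* Order after whoever installed a write fence before us... */
        prev = dma_resv_get_excl(resv); /* valid while resv is held */
        err = prev ? await(prev) : 0;

        /* ...and make later submissions order against our write. */
        if (!err)
                dma_resv_add_excl_fence(resv, my_fence);

        dma_resv_unlock(resv);
        return err;
}
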
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 21:01       ` Dave Airlie
@ 2020-06-23 21:19         ` Chris Wilson
  2020-06-24 19:04           ` Dave Airlie
  0 siblings, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-23 21:19 UTC (permalink / raw)
  To: Christian König, Dave Airlie; +Cc: Intel Graphics Development

Quoting Dave Airlie (2020-06-23 22:01:24)
> On Tue, 23 Jun 2020 at 20:03, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >
> > Quoting Thomas Hellström (Intel) (2020-06-23 10:33:20)
> > > Hi, Chris!
> > >
> > > On 6/22/20 11:59 AM, Chris Wilson wrote:
> > > > In order to actually handle eviction and what not, we need to process
> > > > all the objects together under a common lock, reservation_ww_class. As
> > > > such, do a memory reservation pass after looking up the object/vma,
> > > > which then feeds into the rest of execbuf [relocation, cmdparsing,
> > > > flushing and ofc execution].
> > > >
> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > ---
> > > >   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> > > >   1 file changed, 70 insertions(+), 21 deletions(-)
> > > >
> > > Which tree is this against? The series doesn't apply cleanly against
> > > drm-tip?
> >
> > It's continuing on from the scheduler patches, the bug fixes and the
> > iris-deferred-fence work. I thought throwing all of those old patches
> > into the pile would have been distracting.
> >
> > > ...
> > >
> > > > +static int eb_reserve_mm(struct i915_execbuffer *eb)
> > > > +{
> > > > +     const u64 idx = eb->context->timeline->fence_context;
> > > > +     struct ww_acquire_ctx acquire;
> > > > +     struct eb_vma *ev;
> > > > +     int err;
> > > > +
> > > > +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
> > > > +     if (!eb->mm_fence)
> > > > +             return -ENOMEM;
> > >
> > > Where are the proxy fence functions defined?
> >
> > In dma-fence-proxy.c ;)
> 
> The dma-fence-proxy that Christian NAKed before?

I do not have an email from Christian about dma-fence-proxy in the last
3 years it has been on the list.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 21:15             ` Chris Wilson
@ 2020-06-24  5:42               ` Thomas Hellström (Intel)
  2020-06-24  8:08                 ` Chris Wilson
  0 siblings, 1 reply; 48+ messages in thread
From: Thomas Hellström (Intel) @ 2020-06-24  5:42 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 6/23/20 11:15 PM, Chris Wilson wrote:
> Quoting Thomas Hellström (Intel) (2020-06-23 21:31:38)
>> On 6/23/20 8:41 PM, Chris Wilson wrote:
>>> Quoting Thomas Hellström (Intel) (2020-06-23 19:21:28)
>>>> On 6/23/20 6:36 PM, Chris Wilson wrote:
>>>>> Quoting Thomas Hellström (Intel) (2020-06-23 12:22:11)
>>>>>> Hi, Chris,
>>>>>>
>>>>>> On 6/22/20 11:59 AM, Chris Wilson wrote:
>>>>>>> In order to actually handle eviction and what not, we need to process
>>>>>>> all the objects together under a common lock, reservation_ww_class. As
>>>>>>> such, do a memory reservation pass after looking up the object/vma,
>>>>>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
>>>>>>> flushing and ofc execution].
>>>>>>>
>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>>> ---
>>>>>>>      .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
>>>>>>>      1 file changed, 70 insertions(+), 21 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>>>> index 46fcbdf8161c..8db2e013465f 100644
>>>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>>>> @@ -53,10 +53,9 @@ struct eb_vma_array {
>>>>>>>      
>>>>>>>      #define __EXEC_OBJECT_HAS_PIN               BIT(31)
>>>>>>>      #define __EXEC_OBJECT_HAS_FENCE             BIT(30)
>>>>>>> -#define __EXEC_OBJECT_HAS_PAGES              BIT(29)
>>>>>>> -#define __EXEC_OBJECT_NEEDS_MAP              BIT(28)
>>>>>>> -#define __EXEC_OBJECT_NEEDS_BIAS     BIT(27)
>>>>>>> -#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 27) /* all of the above */
>>>>>>> +#define __EXEC_OBJECT_NEEDS_MAP              BIT(29)
>>>>>>> +#define __EXEC_OBJECT_NEEDS_BIAS     BIT(28)
>>>>>>> +#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 28) /* all of the above */
>>>>>>>      
>>>>>>>      #define __EXEC_HAS_RELOC    BIT(31)
>>>>>>>      #define __EXEC_INTERNAL_FLAGS       (~0u << 31)
>>>>>>> @@ -241,6 +240,8 @@ struct i915_execbuffer {
>>>>>>>          struct intel_context *context; /* logical state for the request */
>>>>>>>          struct i915_gem_context *gem_context; /** caller's context */
>>>>>>>      
>>>>>>> +     struct dma_fence *mm_fence;
>>>>>>> +
>>>>>>>          struct i915_request *request; /** our request to build */
>>>>>>>          struct eb_vma *batch; /** identity of the batch obj/vma */
>>>>>>>          struct i915_vma *trampoline; /** trampoline used for chaining */
>>>>>>> @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct eb_vma *ev)
>>>>>>>          if (ev->flags & __EXEC_OBJECT_HAS_PIN)
>>>>>>>                  __i915_vma_unpin(vma);
>>>>>>>      
>>>>>>> -     if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
>>>>>>> -             i915_gem_object_unpin_pages(vma->obj);
>>>>>>> -
>>>>>>> -     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
>>>>>>> -                    __EXEC_OBJECT_HAS_FENCE |
>>>>>>> -                    __EXEC_OBJECT_HAS_PAGES);
>>>>>>> +     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
>>>>>>>      }
>>>>>>>      
>>>>>>>      static void eb_vma_array_destroy(struct kref *kref)
>>>>>>> @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
>>>>>>>          list_add_tail(&ev->lock_link, &eb->lock);
>>>>>>>      }
>>>>>>>      
>>>>>>> +static int eb_vma_get_pages(struct i915_execbuffer *eb,
>>>>>>> +                         struct eb_vma *ev,
>>>>>>> +                         u64 idx)
>>>>>>> +{
>>>>>>> +     struct i915_vma *vma = ev->vma;
>>>>>>> +     int err;
>>>>>>> +
>>>>>>> +     /* XXX also preallocate PD for vma */
>>>>>>> +
>>>>>>> +     err = ____i915_gem_object_get_pages_async(vma->obj);
>>>>>>> +     if (err)
>>>>>>> +             return err;
>>>>>>> +
>>>>>>> +     return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
>>>>>>> +{
>>>>>>> +     const u64 idx = eb->context->timeline->fence_context;
>>>>>>> +     struct ww_acquire_ctx acquire;
>>>>>>> +     struct eb_vma *ev;
>>>>>>> +     int err;
>>>>>>> +
>>>>>>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
>>>>>>> +     if (!eb->mm_fence)
>>>>>>> +             return -ENOMEM;
>>>>>> Question: eb is local to this thread, right, so eb->mm_fence is not
>>>>>> considered "published" yet?
>>>>>>
>>>>>>> +
>>>>>>> +     ww_acquire_init(&acquire, &reservation_ww_class);
>>>>>>> +
>>>>>>> +     err = eb_lock_vma(eb, &acquire);
>>>>>>> +     if (err)
>>>>>>> +             goto out;
>>>>>>> +
>>>>>>> +     ww_acquire_done(&acquire);
>>>>>>> +
>>>>>>> +     list_for_each_entry(ev, &eb->lock, lock_link) {
>>>>>>> +             struct i915_vma *vma = ev->vma;
>>>>>>> +
>>>>>>> +             if (err == 0)
>>>>>>> +                     err = eb_vma_get_pages(eb, ev, idx);
>>>>>> I figure this is where you publish the proxy fence? If so, the fence
>>>>>> signaling critical path starts with this loop,
>>>>> Hmm, actually at this moment, the fence is still very much internal
>>>>> being only used as a reference token,
>>>> I think as long as another thread, running in this driver or another gpu
>>>> driver can theoretically reference the fence pointer from the
>>>> reservation object and wait for the fence it's considered published.
>>> It's not in the reservation object.
>>>    
>>>> Also the ww_mutexes in this context are really all about grabbing a
>>>> random set of resources and associate them with a point in a timeline,
>>>> as the ww_mutexes are released, the fence pointer(s) need to point to
>>>> published fence(s).
>>> That's not the purpose of these fences, though. They exist to provide
>>> reference counting on the backing store, along side the migration fence.
>>> It's extra detail tacked on the equivalent of bo->moving.
>>>
>>> That is not to say that one could build up an async migration chain which
>>> form a graph back to these, that chain could only be formed once the
>>> operation itself has been published in the dma_resv though.
>> Hmm. So let's say another thread grabs one of the just released
>> ww_mutexes and wants to schedule a blit from one of the buffers in the
>> current operation with high priority. How would that thread know how to
>> order that blit operation w r t the current operation?
> Why would it order?
So let's say it's an eviction blit, needing to incorporate the data from
the current operation. Or, for that matter, a TTM-style CPU copy eviction:

ww_mutex_lock
wait_for_idle
copy
ww_mutex_unlock
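
Spelled out as a minimal sketch, with a made-up bo and copy helper (only
the ww_mutex/dma_resv calls below are the real APIs):

#include <linux/dma-resv.h>
#include <linux/sched.h>
#include <linux/ww_mutex.h>

/* Illustrative only -- not the real TTM structures or helpers. */
struct illustrative_bo {
        struct dma_resv *resv;
        /* backing storage, placement, ... (elided) */
};

static void cpu_copy_backing_store(struct illustrative_bo *bo)
{
        /* memcpy the pages over to the new placement (elided) */
}

static int evict_by_cpu_copy(struct illustrative_bo *bo,
                             struct ww_acquire_ctx *ticket)
{
        long ret;

        /* ww_mutex_lock */
        ret = ww_mutex_lock_interruptible(&bo->resv->lock, ticket);
        if (ret)
                return ret; /* -EDEADLK handled by the caller's ww backoff */

        /* wait_for_idle: wait upon every fence in the reservation object */
        ret = dma_resv_wait_timeout_rcu(bo->resv, true, true,
                                        MAX_SCHEDULE_TIMEOUT);
        if (ret < 0)
                goto unlock;

        /* copy */
        cpu_copy_backing_store(bo);
        ret = 0;

unlock:
        /* ww_mutex_unlock */
        ww_mutex_unlock(&bo->resv->lock);
        return ret;
}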

/Thomas



* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-24  5:42               ` Thomas Hellström (Intel)
@ 2020-06-24  8:08                 ` Chris Wilson
  2020-06-24  9:50                   ` Thomas Hellström (Intel)
  0 siblings, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-24  8:08 UTC (permalink / raw)
  To: Thomas Hellström, intel-gfx

Quoting Thomas Hellström (Intel) (2020-06-24 06:42:33)
> 
> On 6/23/20 11:15 PM, Chris Wilson wrote:
> > Quoting Thomas Hellström (Intel) (2020-06-23 21:31:38)
> >> On 6/23/20 8:41 PM, Chris Wilson wrote:
> >>> Quoting Thomas Hellström (Intel) (2020-06-23 19:21:28)
> >>>> On 6/23/20 6:36 PM, Chris Wilson wrote:
> >>>>> Quoting Thomas Hellström (Intel) (2020-06-23 12:22:11)
> >>>>>> Hi, Chris,
> >>>>>>
> >>>>>> On 6/22/20 11:59 AM, Chris Wilson wrote:
> >>>>>>> In order to actually handle eviction and what not, we need to process
> >>>>>>> all the objects together under a common lock, reservation_ww_class. As
> >>>>>>> such, do a memory reservation pass after looking up the object/vma,
> >>>>>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
> >>>>>>> flushing and ofc execution].
> >>>>>>>
> >>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>>>> ---
> >>>>>>>      .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> >>>>>>>      1 file changed, 70 insertions(+), 21 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>>>>>> index 46fcbdf8161c..8db2e013465f 100644
> >>>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>>>>>> @@ -53,10 +53,9 @@ struct eb_vma_array {
> >>>>>>>      
> >>>>>>>      #define __EXEC_OBJECT_HAS_PIN               BIT(31)
> >>>>>>>      #define __EXEC_OBJECT_HAS_FENCE             BIT(30)
> >>>>>>> -#define __EXEC_OBJECT_HAS_PAGES              BIT(29)
> >>>>>>> -#define __EXEC_OBJECT_NEEDS_MAP              BIT(28)
> >>>>>>> -#define __EXEC_OBJECT_NEEDS_BIAS     BIT(27)
> >>>>>>> -#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 27) /* all of the above */
> >>>>>>> +#define __EXEC_OBJECT_NEEDS_MAP              BIT(29)
> >>>>>>> +#define __EXEC_OBJECT_NEEDS_BIAS     BIT(28)
> >>>>>>> +#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 28) /* all of the above */
> >>>>>>>      
> >>>>>>>      #define __EXEC_HAS_RELOC    BIT(31)
> >>>>>>>      #define __EXEC_INTERNAL_FLAGS       (~0u << 31)
> >>>>>>> @@ -241,6 +240,8 @@ struct i915_execbuffer {
> >>>>>>>          struct intel_context *context; /* logical state for the request */
> >>>>>>>          struct i915_gem_context *gem_context; /** caller's context */
> >>>>>>>      
> >>>>>>> +     struct dma_fence *mm_fence;
> >>>>>>> +
> >>>>>>>          struct i915_request *request; /** our request to build */
> >>>>>>>          struct eb_vma *batch; /** identity of the batch obj/vma */
> >>>>>>>          struct i915_vma *trampoline; /** trampoline used for chaining */
> >>>>>>> @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct eb_vma *ev)
> >>>>>>>          if (ev->flags & __EXEC_OBJECT_HAS_PIN)
> >>>>>>>                  __i915_vma_unpin(vma);
> >>>>>>>      
> >>>>>>> -     if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
> >>>>>>> -             i915_gem_object_unpin_pages(vma->obj);
> >>>>>>> -
> >>>>>>> -     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
> >>>>>>> -                    __EXEC_OBJECT_HAS_FENCE |
> >>>>>>> -                    __EXEC_OBJECT_HAS_PAGES);
> >>>>>>> +     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
> >>>>>>>      }
> >>>>>>>      
> >>>>>>>      static void eb_vma_array_destroy(struct kref *kref)
> >>>>>>> @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
> >>>>>>>          list_add_tail(&ev->lock_link, &eb->lock);
> >>>>>>>      }
> >>>>>>>      
> >>>>>>> +static int eb_vma_get_pages(struct i915_execbuffer *eb,
> >>>>>>> +                         struct eb_vma *ev,
> >>>>>>> +                         u64 idx)
> >>>>>>> +{
> >>>>>>> +     struct i915_vma *vma = ev->vma;
> >>>>>>> +     int err;
> >>>>>>> +
> >>>>>>> +     /* XXX also preallocate PD for vma */
> >>>>>>> +
> >>>>>>> +     err = ____i915_gem_object_get_pages_async(vma->obj);
> >>>>>>> +     if (err)
> >>>>>>> +             return err;
> >>>>>>> +
> >>>>>>> +     return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
> >>>>>>> +{
> >>>>>>> +     const u64 idx = eb->context->timeline->fence_context;
> >>>>>>> +     struct ww_acquire_ctx acquire;
> >>>>>>> +     struct eb_vma *ev;
> >>>>>>> +     int err;
> >>>>>>> +
> >>>>>>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
> >>>>>>> +     if (!eb->mm_fence)
> >>>>>>> +             return -ENOMEM;
> >>>>>> Question: eb is local to this thread, right, so eb->mm_fence is not
> >>>>>> considered "published" yet?
> >>>>>>
> >>>>>>> +
> >>>>>>> +     ww_acquire_init(&acquire, &reservation_ww_class);
> >>>>>>> +
> >>>>>>> +     err = eb_lock_vma(eb, &acquire);
> >>>>>>> +     if (err)
> >>>>>>> +             goto out;
> >>>>>>> +
> >>>>>>> +     ww_acquire_done(&acquire);
> >>>>>>> +
> >>>>>>> +     list_for_each_entry(ev, &eb->lock, lock_link) {
> >>>>>>> +             struct i915_vma *vma = ev->vma;
> >>>>>>> +
> >>>>>>> +             if (err == 0)
> >>>>>>> +                     err = eb_vma_get_pages(eb, ev, idx);
> >>>>>> I figure this is where you publish the proxy fence? If so, the fence
> >>>>>> signaling critical path starts with this loop,
> >>>>> Hmm, actually at this moment, the fence is still very much internal
> >>>>> being only used as a reference token,
> >>>> I think as long as another thread, running in this driver or another gpu
> >>>> driver can theoretically reference the fence pointer from the
> >>>> reservation object and wait for the fence it's considered published.
> >>> It's not in the reservation object.
> >>>    
> >>>> Also the ww_mutexes in this context are really all about grabbing a
> >>>> random set of resources and associate them with a point in a timeline,
> >>>> as the ww_mutexes are released, the fence pointer(s) need to point to
> >>>> published fence(s).
> >>> That's not the purpose of these fences, though. They exist to provide
> >>> reference counting on the backing store, along side the migration fence.
> >>> It's extra detail tacked on the equivalent of bo->moving.
> >>>
> >>> That is not to say that one could build up an async migration chain which
> >>> form a graph back to these, that chain could only be formed once the
> >>> operation itself has been published in the dma_resv though.
> >> Hmm. So let's say another thread grabs one of the just released
> >> ww_mutexes and wants to schedule a blit from one of the buffers in the
> >> current operation with high priority. How would that thread know how to
> >> order that blit operation w r t the current operation?
> > Why would it order?
> So let's say it's an eviction blit, needing to incorporate the data from 
> the current operation. Or, for that matter a ttm-style cpu copy eviction:
> 
> ww_mutex_lock
> wait_for_idle
> copy
> ww_mutex_unlock

We have a scheduler. Eviction does not block. Submission never blocks.

lock
swap allocation blocks
unlock
copy.
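
As a rough sketch of that shape (every type and helper below is
hypothetical, definitions elided; only the ww_mutex/dma_resv/dma_fence
calls are the real APIs):

#include <linux/dma-fence.h>
#include <linux/dma-resv.h>
#include <linux/err.h>
#include <linux/ww_mutex.h>

/* Hypothetical bo and helpers, purely to illustrate the control flow. */
struct illustrative_bo {
        struct dma_resv *resv;
        void *storage;
};

void *allocate_new_storage(struct illustrative_bo *bo);
struct dma_fence *queue_async_copy(struct illustrative_bo *bo,
                                   void *old, void *new);

static int evict_without_blocking(struct illustrative_bo *bo,
                                  struct ww_acquire_ctx *ticket)
{
        struct dma_fence *copy;
        void *old, *new;
        int err;

        /* lock */
        err = ww_mutex_lock_interruptible(&bo->resv->lock, ticket);
        if (err)
                return err;

        /* swap allocation blocks */
        new = allocate_new_storage(bo);
        if (IS_ERR(new)) {
                err = PTR_ERR(new);
                goto unlock;
        }
        old = bo->storage;
        bo->storage = new;

        /*
         * copy: queued on the GPU/scheduler behind the fences already in
         * the dma_resv; publish its fence so that later users wait on it
         * rather than on us.
         */
        copy = queue_async_copy(bo, old, new);
        dma_resv_add_excl_fence(bo->resv, copy);
        dma_fence_put(copy);

unlock:
        /* unlock -- the copy itself runs asynchronously afterwards */
        ww_mutex_unlock(&bo->resv->lock);
        return err;
}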
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-24  8:08                 ` Chris Wilson
@ 2020-06-24  9:50                   ` Thomas Hellström (Intel)
  2020-06-24 10:48                     ` Chris Wilson
  0 siblings, 1 reply; 48+ messages in thread
From: Thomas Hellström (Intel) @ 2020-06-24  9:50 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 6/24/20 10:08 AM, Chris Wilson wrote:
> Quoting Thomas Hellström (Intel) (2020-06-24 06:42:33)
>> On 6/23/20 11:15 PM, Chris Wilson wrote:
>>> Quoting Thomas Hellström (Intel) (2020-06-23 21:31:38)
>>>> On 6/23/20 8:41 PM, Chris Wilson wrote:
>>>>> Quoting Thomas Hellström (Intel) (2020-06-23 19:21:28)
>>>>>> On 6/23/20 6:36 PM, Chris Wilson wrote:
>>>>>>> Quoting Thomas Hellström (Intel) (2020-06-23 12:22:11)
>>>>>>>> Hi, Chris,
>>>>>>>>
>>>>>>>> On 6/22/20 11:59 AM, Chris Wilson wrote:
>>>>>>>>> In order to actually handle eviction and what not, we need to process
>>>>>>>>> all the objects together under a common lock, reservation_ww_class. As
>>>>>>>>> such, do a memory reservation pass after looking up the object/vma,
>>>>>>>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
>>>>>>>>> flushing and ofc execution].
>>>>>>>>>
>>>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>>>>> ---
>>>>>>>>>       .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
>>>>>>>>>       1 file changed, 70 insertions(+), 21 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>>>>>> index 46fcbdf8161c..8db2e013465f 100644
>>>>>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>>>>>> @@ -53,10 +53,9 @@ struct eb_vma_array {
>>>>>>>>>       
>>>>>>>>>       #define __EXEC_OBJECT_HAS_PIN               BIT(31)
>>>>>>>>>       #define __EXEC_OBJECT_HAS_FENCE             BIT(30)
>>>>>>>>> -#define __EXEC_OBJECT_HAS_PAGES              BIT(29)
>>>>>>>>> -#define __EXEC_OBJECT_NEEDS_MAP              BIT(28)
>>>>>>>>> -#define __EXEC_OBJECT_NEEDS_BIAS     BIT(27)
>>>>>>>>> -#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 27) /* all of the above */
>>>>>>>>> +#define __EXEC_OBJECT_NEEDS_MAP              BIT(29)
>>>>>>>>> +#define __EXEC_OBJECT_NEEDS_BIAS     BIT(28)
>>>>>>>>> +#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 28) /* all of the above */
>>>>>>>>>       
>>>>>>>>>       #define __EXEC_HAS_RELOC    BIT(31)
>>>>>>>>>       #define __EXEC_INTERNAL_FLAGS       (~0u << 31)
>>>>>>>>> @@ -241,6 +240,8 @@ struct i915_execbuffer {
>>>>>>>>>           struct intel_context *context; /* logical state for the request */
>>>>>>>>>           struct i915_gem_context *gem_context; /** caller's context */
>>>>>>>>>       
>>>>>>>>> +     struct dma_fence *mm_fence;
>>>>>>>>> +
>>>>>>>>>           struct i915_request *request; /** our request to build */
>>>>>>>>>           struct eb_vma *batch; /** identity of the batch obj/vma */
>>>>>>>>>           struct i915_vma *trampoline; /** trampoline used for chaining */
>>>>>>>>> @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct eb_vma *ev)
>>>>>>>>>           if (ev->flags & __EXEC_OBJECT_HAS_PIN)
>>>>>>>>>                   __i915_vma_unpin(vma);
>>>>>>>>>       
>>>>>>>>> -     if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
>>>>>>>>> -             i915_gem_object_unpin_pages(vma->obj);
>>>>>>>>> -
>>>>>>>>> -     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
>>>>>>>>> -                    __EXEC_OBJECT_HAS_FENCE |
>>>>>>>>> -                    __EXEC_OBJECT_HAS_PAGES);
>>>>>>>>> +     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
>>>>>>>>>       }
>>>>>>>>>       
>>>>>>>>>       static void eb_vma_array_destroy(struct kref *kref)
>>>>>>>>> @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
>>>>>>>>>           list_add_tail(&ev->lock_link, &eb->lock);
>>>>>>>>>       }
>>>>>>>>>       
>>>>>>>>> +static int eb_vma_get_pages(struct i915_execbuffer *eb,
>>>>>>>>> +                         struct eb_vma *ev,
>>>>>>>>> +                         u64 idx)
>>>>>>>>> +{
>>>>>>>>> +     struct i915_vma *vma = ev->vma;
>>>>>>>>> +     int err;
>>>>>>>>> +
>>>>>>>>> +     /* XXX also preallocate PD for vma */
>>>>>>>>> +
>>>>>>>>> +     err = ____i915_gem_object_get_pages_async(vma->obj);
>>>>>>>>> +     if (err)
>>>>>>>>> +             return err;
>>>>>>>>> +
>>>>>>>>> +     return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
>>>>>>>>> +{
>>>>>>>>> +     const u64 idx = eb->context->timeline->fence_context;
>>>>>>>>> +     struct ww_acquire_ctx acquire;
>>>>>>>>> +     struct eb_vma *ev;
>>>>>>>>> +     int err;
>>>>>>>>> +
>>>>>>>>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
>>>>>>>>> +     if (!eb->mm_fence)
>>>>>>>>> +             return -ENOMEM;
>>>>>>>> Question: eb is local to this thread, right, so eb->mm_fence is not
>>>>>>>> considered "published" yet?
>>>>>>>>
>>>>>>>>> +
>>>>>>>>> +     ww_acquire_init(&acquire, &reservation_ww_class);
>>>>>>>>> +
>>>>>>>>> +     err = eb_lock_vma(eb, &acquire);
>>>>>>>>> +     if (err)
>>>>>>>>> +             goto out;
>>>>>>>>> +
>>>>>>>>> +     ww_acquire_done(&acquire);
>>>>>>>>> +
>>>>>>>>> +     list_for_each_entry(ev, &eb->lock, lock_link) {
>>>>>>>>> +             struct i915_vma *vma = ev->vma;
>>>>>>>>> +
>>>>>>>>> +             if (err == 0)
>>>>>>>>> +                     err = eb_vma_get_pages(eb, ev, idx);
>>>>>>>> I figure this is where you publish the proxy fence? If so, the fence
>>>>>>>> signaling critical path starts with this loop,
>>>>>>> Hmm, actually at this moment, the fence is still very much internal
>>>>>>> being only used as a reference token,
>>>>>> I think as long as another thread, running in this driver or another gpu
>>>>>> driver can theoretically reference the fence pointer from the
>>>>>> reservation object and wait for the fence it's considered published.
>>>>> It's not in the reservation object.
>>>>>     
>>>>>> Also the ww_mutexes in this context are really all about grabbing a
>>>>>> random set of resources and associate them with a point in a timeline,
>>>>>> as the ww_mutexes are released, the fence pointer(s) need to point to
>>>>>> published fence(s).
>>>>> That's not the purpose of these fences, though. They exist to provide
>>>>> reference counting on the backing store, along side the migration fence.
>>>>> It's extra detail tacked on the equivalent of bo->moving.
>>>>>
>>>>> That is not to say that one could build up an async migration chain which
>>>>> form a graph back to these, that chain could only be formed once the
>>>>> operation itself has been published in the dma_resv though.
>>>> Hmm. So let's say another thread grabs one of the just released
>>>> ww_mutexes and wants to schedule a blit from one of the buffers in the
>>>> current operation with high priority. How would that thread know how to
>>>> order that blit operation w r t the current operation?
>>> Why would it order?
>> So let's say it's an eviction blit, needing to incorporate the data from
>> the current operation. Or, for that matter a ttm-style cpu copy eviction:
>>
>> ww_mutex_lock
>> wait_for_idle
>> copy
>> ww_mutex_unlock
> We have a scheduler. Eviction does not block. Submission never blocks.
So regardless of whether we block or not, how does the scheduler know how
to order the eviction blit after the current operation? Wouldn't it need
to look at the proxy fence to determine that? Basically I'm trying to get
an understanding of where the fence signaling critical section starts.

/Thomas



* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-24  9:50                   ` Thomas Hellström (Intel)
@ 2020-06-24 10:48                     ` Chris Wilson
  0 siblings, 0 replies; 48+ messages in thread
From: Chris Wilson @ 2020-06-24 10:48 UTC (permalink / raw)
  To: Thomas Hellström, intel-gfx

Quoting Thomas Hellström (Intel) (2020-06-24 10:50:08)
> 
> On 6/24/20 10:08 AM, Chris Wilson wrote:
> > Quoting Thomas Hellström (Intel) (2020-06-24 06:42:33)
> >> On 6/23/20 11:15 PM, Chris Wilson wrote:
> >>> Quoting Thomas Hellström (Intel) (2020-06-23 21:31:38)
> >>>> On 6/23/20 8:41 PM, Chris Wilson wrote:
> >>>>> Quoting Thomas Hellström (Intel) (2020-06-23 19:21:28)
> >>>>>> On 6/23/20 6:36 PM, Chris Wilson wrote:
> >>>>>>> Quoting Thomas Hellström (Intel) (2020-06-23 12:22:11)
> >>>>>>>> Hi, Chris,
> >>>>>>>>
> >>>>>>>> On 6/22/20 11:59 AM, Chris Wilson wrote:
> >>>>>>>>> In order to actually handle eviction and what not, we need to process
> >>>>>>>>> all the objects together under a common lock, reservation_ww_class. As
> >>>>>>>>> such, do a memory reservation pass after looking up the object/vma,
> >>>>>>>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
> >>>>>>>>> flushing and ofc execution].
> >>>>>>>>>
> >>>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>>>>>> ---
> >>>>>>>>>       .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> >>>>>>>>>       1 file changed, 70 insertions(+), 21 deletions(-)
> >>>>>>>>>
> >>>>>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>>>>>>>> index 46fcbdf8161c..8db2e013465f 100644
> >>>>>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>>>>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> >>>>>>>>> @@ -53,10 +53,9 @@ struct eb_vma_array {
> >>>>>>>>>       
> >>>>>>>>>       #define __EXEC_OBJECT_HAS_PIN               BIT(31)
> >>>>>>>>>       #define __EXEC_OBJECT_HAS_FENCE             BIT(30)
> >>>>>>>>> -#define __EXEC_OBJECT_HAS_PAGES              BIT(29)
> >>>>>>>>> -#define __EXEC_OBJECT_NEEDS_MAP              BIT(28)
> >>>>>>>>> -#define __EXEC_OBJECT_NEEDS_BIAS     BIT(27)
> >>>>>>>>> -#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 27) /* all of the above */
> >>>>>>>>> +#define __EXEC_OBJECT_NEEDS_MAP              BIT(29)
> >>>>>>>>> +#define __EXEC_OBJECT_NEEDS_BIAS     BIT(28)
> >>>>>>>>> +#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 28) /* all of the above */
> >>>>>>>>>       
> >>>>>>>>>       #define __EXEC_HAS_RELOC    BIT(31)
> >>>>>>>>>       #define __EXEC_INTERNAL_FLAGS       (~0u << 31)
> >>>>>>>>> @@ -241,6 +240,8 @@ struct i915_execbuffer {
> >>>>>>>>>           struct intel_context *context; /* logical state for the request */
> >>>>>>>>>           struct i915_gem_context *gem_context; /** caller's context */
> >>>>>>>>>       
> >>>>>>>>> +     struct dma_fence *mm_fence;
> >>>>>>>>> +
> >>>>>>>>>           struct i915_request *request; /** our request to build */
> >>>>>>>>>           struct eb_vma *batch; /** identity of the batch obj/vma */
> >>>>>>>>>           struct i915_vma *trampoline; /** trampoline used for chaining */
> >>>>>>>>> @@ -331,12 +332,7 @@ static inline void eb_unreserve_vma(struct eb_vma *ev)
> >>>>>>>>>           if (ev->flags & __EXEC_OBJECT_HAS_PIN)
> >>>>>>>>>                   __i915_vma_unpin(vma);
> >>>>>>>>>       
> >>>>>>>>> -     if (ev->flags & __EXEC_OBJECT_HAS_PAGES)
> >>>>>>>>> -             i915_gem_object_unpin_pages(vma->obj);
> >>>>>>>>> -
> >>>>>>>>> -     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
> >>>>>>>>> -                    __EXEC_OBJECT_HAS_FENCE |
> >>>>>>>>> -                    __EXEC_OBJECT_HAS_PAGES);
> >>>>>>>>> +     ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE);
> >>>>>>>>>       }
> >>>>>>>>>       
> >>>>>>>>>       static void eb_vma_array_destroy(struct kref *kref)
> >>>>>>>>> @@ -667,6 +663,55 @@ eb_add_vma(struct i915_execbuffer *eb,
> >>>>>>>>>           list_add_tail(&ev->lock_link, &eb->lock);
> >>>>>>>>>       }
> >>>>>>>>>       
> >>>>>>>>> +static int eb_vma_get_pages(struct i915_execbuffer *eb,
> >>>>>>>>> +                         struct eb_vma *ev,
> >>>>>>>>> +                         u64 idx)
> >>>>>>>>> +{
> >>>>>>>>> +     struct i915_vma *vma = ev->vma;
> >>>>>>>>> +     int err;
> >>>>>>>>> +
> >>>>>>>>> +     /* XXX also preallocate PD for vma */
> >>>>>>>>> +
> >>>>>>>>> +     err = ____i915_gem_object_get_pages_async(vma->obj);
> >>>>>>>>> +     if (err)
> >>>>>>>>> +             return err;
> >>>>>>>>> +
> >>>>>>>>> +     return i915_active_ref(&vma->obj->mm.active, idx, eb->mm_fence);
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
> >>>>>>>>> +{
> >>>>>>>>> +     const u64 idx = eb->context->timeline->fence_context;
> >>>>>>>>> +     struct ww_acquire_ctx acquire;
> >>>>>>>>> +     struct eb_vma *ev;
> >>>>>>>>> +     int err;
> >>>>>>>>> +
> >>>>>>>>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
> >>>>>>>>> +     if (!eb->mm_fence)
> >>>>>>>>> +             return -ENOMEM;
> >>>>>>>> Question: eb is local to this thread, right, so eb->mm_fence is not
> >>>>>>>> considered "published" yet?
> >>>>>>>>
> >>>>>>>>> +
> >>>>>>>>> +     ww_acquire_init(&acquire, &reservation_ww_class);
> >>>>>>>>> +
> >>>>>>>>> +     err = eb_lock_vma(eb, &acquire);
> >>>>>>>>> +     if (err)
> >>>>>>>>> +             goto out;
> >>>>>>>>> +
> >>>>>>>>> +     ww_acquire_done(&acquire);
> >>>>>>>>> +
> >>>>>>>>> +     list_for_each_entry(ev, &eb->lock, lock_link) {
> >>>>>>>>> +             struct i915_vma *vma = ev->vma;
> >>>>>>>>> +
> >>>>>>>>> +             if (err == 0)
> >>>>>>>>> +                     err = eb_vma_get_pages(eb, ev, idx);
> >>>>>>>> I figure this is where you publish the proxy fence? If so, the fence
> >>>>>>>> signaling critical path starts with this loop,
> >>>>>>> Hmm, actually at this moment, the fence is still very much internal
> >>>>>>> being only used as a reference token,
> >>>>>> I think as long as another thread, running in this driver or another gpu
> >>>>>> driver can theoretically reference the fence pointer from the
> >>>>>> reservation object and wait for the fence it's considered published.
> >>>>> It's not in the reservation object.
> >>>>>     
> >>>>>> Also the ww_mutexes in this context are really all about grabbing a
> >>>>>> random set of resources and associate them with a point in a timeline,
> >>>>>> as the ww_mutexes are released, the fence pointer(s) need to point to
> >>>>>> published fence(s).
> >>>>> That's not the purpose of these fences, though. They exist to provide
> >>>>> reference counting on the backing store, along side the migration fence.
> >>>>> It's extra detail tacked on the equivalent of bo->moving.
> >>>>>
> >>>>> That is not to say that one could build up an async migration chain which
> >>>>> form a graph back to these, that chain could only be formed once the
> >>>>> operation itself has been published in the dma_resv though.
> >>>> Hmm. So let's say another thread grabs one of the just released
> >>>> ww_mutexes and wants to schedule a blit from one of the buffers in the
> >>>> current operation with high priority. How would that thread know how to
> >>>> order that blit operation w r t the current operation?
> >>> Why would it order?
> >> So let's say it's an eviction blit, needing to incorporate the data from
> >> the current operation. Or, for that matter a ttm-style cpu copy eviction:
> >>
> >> ww_mutex_lock
> >> wait_for_idle
> >> copy
> >> ww_mutex_unlock
> > We have a scheduler. Eviction does not block. Submission never blocks.
> So regardless if we block or not, how does the scheduler know how to 
> order the eviction blit after the current operation? Wouldn't it need to 
> look at the proxy fence to determine that? Basically I'm trying to get 
> an understanding where the fence signaling critical section starts.

Yes, via the eviction logic, but that is only applicable to evictions
within the critical section, and the easiest way to circumvent that is
not to allow evictions within that region; that is, evictions can only be
scheduled en masse and not piecemeal. [All the same rules as pinning
apply, since it is the same...]

There is no disagreement that ideally all reservations must be performed
upfront. The issue is quite simply that we do not know all the
reservations we will need up front -- there are many sequences which we
can offload to the GPU but which require arbitrary allocations to do so.
For that, what I was expecting was to try to create requests without
eviction (akin to GFP_ATOMIC); on running out of space, commit what has
been completed so far, then roll back to reacquire all objects plus a
reserved mempool and continue on, as many times as required to complete
handing the payload to the GPU. (Experience might say that we start off
with a reservation for a mempool in addition to the user and PD payload.)
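
In other words, something of this shape (all the types and helpers are
hypothetical, definitions elided; only the control flow is the point):

#include <linux/err.h>
#include <linux/errno.h>

struct illustrative_execbuf;
struct illustrative_mempool;

int acquire_all_objects(struct illustrative_execbuf *eb,
                        struct illustrative_mempool *reserve);
void release_all_objects(struct illustrative_execbuf *eb);
int build_requests_atomic(struct illustrative_execbuf *eb);
void commit_completed_requests(struct illustrative_execbuf *eb);
struct illustrative_mempool *
grow_mempool(struct illustrative_mempool *reserve);
void free_mempool(struct illustrative_mempool *reserve);

static int submit_payload(struct illustrative_execbuf *eb)
{
        struct illustrative_mempool *reserve = NULL;
        int err;

        for (;;) {
                /* ww_mutex pass over all objects; may evict, may use reserve */
                err = acquire_all_objects(eb, reserve);
                if (err)
                        break;

                /* build requests without further eviction (GFP_ATOMIC-ish) */
                err = build_requests_atomic(eb);
                if (err != -ENOSPC) {
                        release_all_objects(eb);
                        break;
                }

                /*
                 * Out of space: commit what has completed so far, roll
                 * back, grab a bigger reserve and go around again.
                 */
                commit_completed_requests(eb);
                release_all_objects(eb);

                reserve = grow_mempool(reserve);
                if (IS_ERR(reserve)) {
                        err = PTR_ERR(reserve);
                        reserve = NULL;
                        break;
                }
        }

        free_mempool(reserve);
        return err;
}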

The other issue is that some objects are not trivially evictable: those
that are in active use by the HW and have an implicit write after the
fence. [And when that write occurs is unknown, as we are expected to
treat it as part of a black box, so we only know for certain after the
fact, once another context's request is signaled.] They also have a
placeholder for a fence that is inserted by what is essentially a GC
sweep. [That fence can be inserted on demand, hence the special
perma-pinned kernel context to ensure that we can always force a context
switch, also used for power management of the engines.] It's certainly a
lot simpler if we can avoid including those as part of the eviction pass,
only removing them when idle and never exposing them to a wide lock. It
certainly needs to be treated with care to avoid regressing the driver.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-23 21:19         ` Chris Wilson
@ 2020-06-24 19:04           ` Dave Airlie
  2020-06-24 20:18             ` Chris Wilson
  0 siblings, 1 reply; 48+ messages in thread
From: Dave Airlie @ 2020-06-24 19:04 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Christian König, Intel Graphics Development

On Wed, 24 Jun 2020 at 07:19, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>
> Quoting Dave Airlie (2020-06-23 22:01:24)
> > On Tue, 23 Jun 2020 at 20:03, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > >
> > > Quoting Thomas Hellström (Intel) (2020-06-23 10:33:20)
> > > > Hi, Chris!
> > > >
> > > > On 6/22/20 11:59 AM, Chris Wilson wrote:
> > > > > In order to actually handle eviction and what not, we need to process
> > > > > all the objects together under a common lock, reservation_ww_class. As
> > > > > such, do a memory reservation pass after looking up the object/vma,
> > > > > which then feeds into the rest of execbuf [relocation, cmdparsing,
> > > > > flushing and ofc execution].
> > > > >
> > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > > ---
> > > > >   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> > > > >   1 file changed, 70 insertions(+), 21 deletions(-)
> > > > >
> > > > Which tree is this against? The series doesn't apply cleanly against
> > > > drm-tip?
> > >
> > > It's continuing on from the scheduler patches, the bug fixes and the
> > > iris-deferred-fence work. I thought throwing all of those old patches
> > > into the pile would have been distracting.
> > >
> > > > ...
> > > >
> > > > > +static int eb_reserve_mm(struct i915_execbuffer *eb)
> > > > > +{
> > > > > +     const u64 idx = eb->context->timeline->fence_context;
> > > > > +     struct ww_acquire_ctx acquire;
> > > > > +     struct eb_vma *ev;
> > > > > +     int err;
> > > > > +
> > > > > +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
> > > > > +     if (!eb->mm_fence)
> > > > > +             return -ENOMEM;
> > > >
> > > > Where are the proxy fence functions defined?
> > >
> > > In dma-fence-proxy.c ;)
> >
> > The dma-fence-proxy that Christian NAKed before?
>
> I do not have an email from Christian about dma-fence-proxy in the last
> 3 years it has been on the list.

https://lore.kernel.org/dri-devel/aeb0373d-0583-d922-3b73-93668c27d177@amd.com/

I'm assuming this was about patch 8 there, which to me looks like proxy
fences, but maybe my threading is off reading that.

https://lore.kernel.org/dri-devel/1502491174-10913-9-git-send-email-jason.ekstrand@intel.com/

Dave.

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-24 19:04           ` Dave Airlie
@ 2020-06-24 20:18             ` Chris Wilson
  2020-06-25  8:11               ` Christian König
  0 siblings, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-24 20:18 UTC (permalink / raw)
  To: Dave Airlie; +Cc: Christian König, Intel Graphics Development

Quoting Dave Airlie (2020-06-24 20:04:02)
> On Wed, 24 Jun 2020 at 07:19, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >
> > Quoting Dave Airlie (2020-06-23 22:01:24)
> > > On Tue, 23 Jun 2020 at 20:03, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > >
> > > > Quoting Thomas Hellström (Intel) (2020-06-23 10:33:20)
> > > > > Hi, Chris!
> > > > >
> > > > > On 6/22/20 11:59 AM, Chris Wilson wrote:
> > > > > > In order to actually handle eviction and what not, we need to process
> > > > > > all the objects together under a common lock, reservation_ww_class. As
> > > > > > such, do a memory reservation pass after looking up the object/vma,
> > > > > > which then feeds into the rest of execbuf [relocation, cmdparsing,
> > > > > > flushing and ofc execution].
> > > > > >
> > > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > > > ---
> > > > > >   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> > > > > >   1 file changed, 70 insertions(+), 21 deletions(-)
> > > > > >
> > > > > Which tree is this against? The series doesn't apply cleanly against
> > > > > drm-tip?
> > > >
> > > > It's continuing on from the scheduler patches, the bug fixes and the
> > > > iris-deferred-fence work. I thought throwing all of those old patches
> > > > into the pile would have been distracting.
> > > >
> > > > > ...
> > > > >
> > > > > > +static int eb_reserve_mm(struct i915_execbuffer *eb)
> > > > > > +{
> > > > > > +     const u64 idx = eb->context->timeline->fence_context;
> > > > > > +     struct ww_acquire_ctx acquire;
> > > > > > +     struct eb_vma *ev;
> > > > > > +     int err;
> > > > > > +
> > > > > > +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
> > > > > > +     if (!eb->mm_fence)
> > > > > > +             return -ENOMEM;
> > > > >
> > > > > Where are the proxy fence functions defined?
> > > >
> > > > In dma-fence-proxy.c ;)
> > >
> > > The dma-fence-proxy that Christian NAKed before?
> >
> > I do not have an email from Christian about dma-fence-proxy in the last
> > 3 years it has been on the list.
> 
> https://lore.kernel.org/dri-devel/aeb0373d-0583-d922-3b73-93668c27d177@amd.com/

Darn, I skimmed the thread title and thought it was just about the
timelines.

> I'm assuming this was about patch 8 there which to me looks like proxy
> fences but maybe by threading is off reading that.

The deadlocks are easy to resolve. Either the fence is signaled normally
by userspace; or they create a deadlock that is rejected by checking the
DAG, and the fence is signaled with an error (the work cancelled and the
error propagated back to userspace if they kept the output fence around);
or userspace forgets entirely about the fence it was waiting on, in which
case it is signaled by closing the syncobjs on process termination [sadly
not in error though; I'm hoping to report EPIPE].
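
For the middle case, the rejection is just cycle detection over the
dependency DAG before the wait is recorded; a sketch (the dependency
tracking structs here are stand-ins, dma_fence_set_error() and
dma_fence_signal() are the real calls):

#include <linux/dma-fence.h>
#include <linux/errno.h>
#include <linux/list.h>

/* Stand-in dependency tracking, just to illustrate the check. */
struct dep_node {
        struct dma_fence *fence;
        struct list_head deps;  /* edges to the fences this one waits on */
};

struct dep_edge {
        struct list_head link;
        struct dep_node *signaler;
};

/* Does @node already (transitively) wait upon @target? */
static bool depends_on(struct dep_node *node, struct dep_node *target)
{
        struct dep_edge *e;

        if (node == target)
                return true;

        /* A real implementation would bound/memoise this walk. */
        list_for_each_entry(e, &node->deps, link)
                if (depends_on(e->signaler, target))
                        return true;

        return false;
}

/* Record "@proxy waits on @fence", rejecting anything closing a cycle. */
static int add_wait(struct dep_node *proxy, struct dep_node *fence,
                    struct dep_edge *edge)
{
        if (depends_on(fence, proxy)) {
                /* Deadlock: cancel the work, complete the fence in error. */
                dma_fence_set_error(proxy->fence, -EDEADLK);
                dma_fence_signal(proxy->fence);
                return -EDEADLK;
        }

        edge->signaler = fence;
        list_add_tail(&edge->link, &proxy->deps);
        return 0;
}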

https://patchwork.freedesktop.org/patch/372759/?series=78762&rev=1
We can always attach the DAG resolver such that we resolve the deadlock
for any importer and so only ever present a normal monotonic fence.
That would make it illegal to wait on an external fence imported into
that syncobj (as that would be outside of our DAG). An option would be
whether or not to force a timeout on slow userspace. But the simplicity
of reusing the existing functionality to move intrabatch scheduling into
iris is compelling. [In contrast, no one has yet finished the timeline
patches to the point where they stopped throwing errors in igt, and we
still then have to write patches for nonblocking wait-for-submit :[

The use here is trivial, chiefly a convenience to flesh out this
argument to see if we can reduce the lock duration within submission
[from the entirety of submission to ideally just reservation] by holding
a fence for the submission process itself. And that boils down to at what
point someone else can start to wait on that fence, and whether or not we
can avoid any direct/indirect waits ourselves after that point and before
completing submission. [Usual rules about not being allowed to wait on a
resource while holding contendable resources, but with the nuance of
what/when exactly that resource becomes contendable.] The lock contention
is quite real, as at the moment it is devolving into a global lock, with
the amusing side effect that it then turns out to be quicker to wrap the
entire thing in struct_mutex.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-24 20:18             ` Chris Wilson
@ 2020-06-25  8:11               ` Christian König
  2020-06-25 12:48                 ` Chris Wilson
  0 siblings, 1 reply; 48+ messages in thread
From: Christian König @ 2020-06-25  8:11 UTC (permalink / raw)
  To: Chris Wilson, Dave Airlie; +Cc: Intel Graphics Development

Am 24.06.20 um 22:18 schrieb Chris Wilson:
> Quoting Dave Airlie (2020-06-24 20:04:02)
>> On Wed, 24 Jun 2020 at 07:19, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>>> Quoting Dave Airlie (2020-06-23 22:01:24)
>>>> On Tue, 23 Jun 2020 at 20:03, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>>>>> Quoting Thomas Hellström (Intel) (2020-06-23 10:33:20)
>>>>>> Hi, Chris!
>>>>>>
>>>>>> On 6/22/20 11:59 AM, Chris Wilson wrote:
>>>>>>> In order to actually handle eviction and what not, we need to process
>>>>>>> all the objects together under a common lock, reservation_ww_class. As
>>>>>>> such, do a memory reservation pass after looking up the object/vma,
>>>>>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
>>>>>>> flushing and ofc execution].
>>>>>>>
>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>>> ---
>>>>>>>    .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
>>>>>>>    1 file changed, 70 insertions(+), 21 deletions(-)
>>>>>>>
>>>>>> Which tree is this against? The series doesn't apply cleanly against
>>>>>> drm-tip?
>>>>> It's continuing on from the scheduler patches, the bug fixes and the
>>>>> iris-deferred-fence work. I thought throwing all of those old patches
>>>>> into the pile would have been distracting.
>>>>>
>>>>>> ...
>>>>>>
>>>>>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
>>>>>>> +{
>>>>>>> +     const u64 idx = eb->context->timeline->fence_context;
>>>>>>> +     struct ww_acquire_ctx acquire;
>>>>>>> +     struct eb_vma *ev;
>>>>>>> +     int err;
>>>>>>> +
>>>>>>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
>>>>>>> +     if (!eb->mm_fence)
>>>>>>> +             return -ENOMEM;
>>>>>> Where are the proxy fence functions defined?
>>>>> In dma-fence-proxy.c ;)
>>>> The dma-fence-proxy that Christian NAKed before?
>>> I do not have an email from Christian about dma-fence-proxy in the last
>>> 3 years it has been on the list.
>> https://lore.kernel.org/dri-devel/aeb0373d-0583-d922-3b73-93668c27d177@amd.com/
> Darn, I skimmed the thread title and thought it was just about the
> timelines.
>
>> I'm assuming this was about patch 8 there which to me looks like proxy
>> fences but maybe by threading is off reading that.
> The deadlocks are easy to resolve. The fence is either signaled normally
> by userspace, they create a deadlock that is rejected by checking the dag
> and the fence signaled with an error (and work cancelled, error
> propagated back to userspace if they kept the output fence around), or
> userspace forgets entirely about the fence they were waiting on in which
> case it is signaled by closing the syncobjs [sadly not in error though,
> I hoping to report EPIPE] on process termination.

And exactly that concept is still a big NAK.

The kernel memory management depends on dma_fences signaling as soon as
they exist.

Just imagine what Daniel's dependency patches would splat out when you 
do something like this and correctly annotate the signaling code path.
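
(For reference, a sketch of what such an annotation looks like, assuming
the dma_fence_begin_signalling()/dma_fence_end_signalling() helpers from
that series: any lock taken or allocation made inside the annotated
section that can also be reached from reclaim produces a lockdep splat.)

#include <linux/dma-fence.h>

/* Sketch: annotate a path that eventually signals a fence. */
static void complete_job(struct dma_fence *fence)
{
        bool cookie;

        cookie = dma_fence_begin_signalling();

        /*
         * Fence signalling critical section: no allocations that may
         * recurse into reclaim, no locks that reclaim might also take.
         */
        dma_fence_signal(fence);

        dma_fence_end_signalling(cookie);
}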

Proxy fences, especially when they depend on userspace for signaling,
are an absolute NO-GO.

Regards,
Christian.

>
> https://patchwork.freedesktop.org/patch/372759/?series=78762&rev=1
> We can always attach the dag resolver such that we resolve the deadlock
> for any importer and so only ever present a normal monotonic fence.
> That would make it illegal to wait on an external fence imported into
> that syncobj (as that would be outside of our dag). An option would
> be whether or not to force timeout slow userspace. But the simplicity of
> reusing the existing functionality to move intrabatch scheduling into
> iris is compelling. [In contrast, no one has yet finished the timeline
> patches to the point where they stopped throwing errors in igt, and we
> still then have to write patches for nonblocking wait-for-submit :[
>
> The use here is trivial, chiefly used as a convenience to flesh out this
> argument to see if we can reduce the lock duration within submission
> [from the entirety of submission to ideally just reservation] by holding
> a fence for the submission process itself. And that boils down to at what
> point can someone else start to wait on that fence, and whether or not we
> can avoid any direct/indirect waits ourselves after point and before
> completing submission. [Usual rules about not being allowed to wait on a
> resource while holding contendable resources, but with the nuance of
> what/when exactly that resource becomes contendable.] The lock contention
> is quite real, as at the moment it is devolving into a global lock. With
> the amusing side effect that it then turns out to be quicker to wrap the
> entire thing in struct_mutex.
> -Chris


* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-25  8:11               ` Christian König
@ 2020-06-25 12:48                 ` Chris Wilson
  2020-06-25 12:59                   ` Christian König
  0 siblings, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-25 12:48 UTC (permalink / raw)
  To: Christian König, Dave Airlie, christian.koenig
  Cc: Intel Graphics Development

Quoting Christian König (2020-06-25 09:11:35)
> Am 24.06.20 um 22:18 schrieb Chris Wilson:
> > Quoting Dave Airlie (2020-06-24 20:04:02)
> >> On Wed, 24 Jun 2020 at 07:19, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >>> Quoting Dave Airlie (2020-06-23 22:01:24)
> >>>> On Tue, 23 Jun 2020 at 20:03, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >>>>> Quoting Thomas Hellström (Intel) (2020-06-23 10:33:20)
> >>>>>> Hi, Chris!
> >>>>>>
> >>>>>> On 6/22/20 11:59 AM, Chris Wilson wrote:
> >>>>>>> In order to actually handle eviction and what not, we need to process
> >>>>>>> all the objects together under a common lock, reservation_ww_class. As
> >>>>>>> such, do a memory reservation pass after looking up the object/vma,
> >>>>>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
> >>>>>>> flushing and ofc execution].
> >>>>>>>
> >>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>>>> ---
> >>>>>>>    .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> >>>>>>>    1 file changed, 70 insertions(+), 21 deletions(-)
> >>>>>>>
> >>>>>> Which tree is this against? The series doesn't apply cleanly against
> >>>>>> drm-tip?
> >>>>> It's continuing on from the scheduler patches, the bug fixes and the
> >>>>> iris-deferred-fence work. I thought throwing all of those old patches
> >>>>> into the pile would have been distracting.
> >>>>>
> >>>>>> ...
> >>>>>>
> >>>>>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
> >>>>>>> +{
> >>>>>>> +     const u64 idx = eb->context->timeline->fence_context;
> >>>>>>> +     struct ww_acquire_ctx acquire;
> >>>>>>> +     struct eb_vma *ev;
> >>>>>>> +     int err;
> >>>>>>> +
> >>>>>>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
> >>>>>>> +     if (!eb->mm_fence)
> >>>>>>> +             return -ENOMEM;
> >>>>>> Where are the proxy fence functions defined?
> >>>>> In dma-fence-proxy.c ;)
> >>>> The dma-fence-proxy that Christian NAKed before?
> >>> I do not have an email from Christian about dma-fence-proxy in the last
> >>> 3 years it has been on the list.
> >> https://lore.kernel.org/dri-devel/aeb0373d-0583-d922-3b73-93668c27d177@amd.com/
> > Darn, I skimmed the thread title and thought it was just about the
> > timelines.
> >
> >> I'm assuming this was about patch 8 there which to me looks like proxy
> >> fences but maybe by threading is off reading that.
> > The deadlocks are easy to resolve. The fence is either signaled normally
> > by userspace, they create a deadlock that is rejected by checking the dag
> > and the fence signaled with an error (and work cancelled, error
> > propagated back to userspace if they kept the output fence around), or
> > userspace forgets entirely about the fence they were waiting on in which
> > case it is signaled by closing the syncobjs [sadly not in error though,
> > I hoping to report EPIPE] on process termination.
> 
> And exactly that concept is still a big NAK.
> 
> The kernel memory management depends on dma_fences to be signaling as 
> soon as they are existing.
> 
> Just imagine what Daniel's dependency patches would splat out when you 
> do something like this and correctly annotate the signaling code path.

Nothing at all. Forward progress of the waiter does not solely depend on
the signaler, just as in bc9c80fe01a2570a2fd78abbc492b377b5fda068.
 
> Proxy fences, especially when they depend on userspace for signaling are 
> an absolutely NO-GO.

We are in full control of the signaling and are able to cancel the
pending userspace operation, move it off to one side and shut down the
HW, whatever. We can, and do, perform dependency analysis of the fence
contexts to avoid deadlocks, just as easily as detecting recursion.

To claim that userspace is not already able to control signaling is a
false dichotomy. Userspace is fully able to lock the HW resources
indefinitely (even if you cap every job, one can always build a chain of
jobs to circumvent any imposed timeout; a couple of seconds of timeout
becomes several months of jobs before the GPU runs out of memory and is
unable to accept any more jobs). Any ioctl that blocks while holding a HW
resource renders itself liable to a user-controllable livelock; you know
this, because it is blocking the signaling of those earlier jobs.
Worrying about things that are entirely within our control, and hence
avoidable, misses the point.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-25 12:48                 ` Chris Wilson
@ 2020-06-25 12:59                   ` Christian König
  2020-06-25 13:23                     ` Chris Wilson
  0 siblings, 1 reply; 48+ messages in thread
From: Christian König @ 2020-06-25 12:59 UTC (permalink / raw)
  To: Chris Wilson, Christian König, Dave Airlie
  Cc: Intel Graphics Development

Am 25.06.20 um 14:48 schrieb Chris Wilson:
> Quoting Christian König (2020-06-25 09:11:35)
>> Am 24.06.20 um 22:18 schrieb Chris Wilson:
>>> Quoting Dave Airlie (2020-06-24 20:04:02)
>>>> On Wed, 24 Jun 2020 at 07:19, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>>>>> Quoting Dave Airlie (2020-06-23 22:01:24)
>>>>>> On Tue, 23 Jun 2020 at 20:03, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>>>>>>> Quoting Thomas Hellström (Intel) (2020-06-23 10:33:20)
>>>>>>>> Hi, Chris!
>>>>>>>>
>>>>>>>> On 6/22/20 11:59 AM, Chris Wilson wrote:
>>>>>>>>> In order to actually handle eviction and what not, we need to process
>>>>>>>>> all the objects together under a common lock, reservation_ww_class. As
>>>>>>>>> such, do a memory reservation pass after looking up the object/vma,
>>>>>>>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
>>>>>>>>> flushing and ofc execution].
>>>>>>>>>
>>>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>>>>> ---
>>>>>>>>>     .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
>>>>>>>>>     1 file changed, 70 insertions(+), 21 deletions(-)
>>>>>>>>>
>>>>>>>> Which tree is this against? The series doesn't apply cleanly against
>>>>>>>> drm-tip?
>>>>>>> It's continuing on from the scheduler patches, the bug fixes and the
>>>>>>> iris-deferred-fence work. I thought throwing all of those old patches
>>>>>>> into the pile would have been distracting.
>>>>>>>
>>>>>>>> ...
>>>>>>>>
>>>>>>>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
>>>>>>>>> +{
>>>>>>>>> +     const u64 idx = eb->context->timeline->fence_context;
>>>>>>>>> +     struct ww_acquire_ctx acquire;
>>>>>>>>> +     struct eb_vma *ev;
>>>>>>>>> +     int err;
>>>>>>>>> +
>>>>>>>>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
>>>>>>>>> +     if (!eb->mm_fence)
>>>>>>>>> +             return -ENOMEM;
>>>>>>>> Where are the proxy fence functions defined?
>>>>>>> In dma-fence-proxy.c ;)
>>>>>> The dma-fence-proxy that Christian NAKed before?
>>>>> I do not have an email from Christian about dma-fence-proxy in the last
>>>>> 3 years it has been on the list.
>>>> https://lore.kernel.org/dri-devel/aeb0373d-0583-d922-3b73-93668c27d177@amd.com/
>>> Darn, I skimmed the thread title and thought it was just about the
>>> timelines.
>>>
>>>> I'm assuming this was about patch 8 there which to me looks like proxy
>>>> fences but maybe by threading is off reading that.
>>> The deadlocks are easy to resolve. The fence is either signaled normally
>>> by userspace, they create a deadlock that is rejected by checking the dag
>>> and the fence signaled with an error (and work cancelled, error
>>> propagated back to userspace if they kept the output fence around), or
>>> userspace forgets entirely about the fence they were waiting on in which
>>> case it is signaled by closing the syncobjs [sadly not in error though,
>>> I hoping to report EPIPE] on process termination.
>> And exactly that concept is still a big NAK.
>>
>> The kernel memory management depends on dma_fences to be signaling as
>> soon as they are existing.
>>
>> Just imagine what Daniel's dependency patches would splat out when you
>> do something like this and correctly annotate the signaling code path.
> Nothing at all. Forward progress of the waiter does not solely depend on
> the signaler, just as in bc9c80fe01a2570a2fd78abbc492b377b5fda068.
>   
>> Proxy fences, especially when they depend on userspace for signaling are
>> an absolutely NO-GO.
> We are in full control of the signaling and are able to cancel the pending
> userspace operation, move it off to one side and shutdown the HW,
> whatever. We can and do do dependency analysis of the fence contexts to
> avoid deadlocks, just as easily as detecting recursion.
>
> To claim that userspace is not already able to control signaling, is a
> false dichotomy. Userspace is fully able to lock the HW resources
> indefinitely (even if you cap every job, one can always build a chain of
> jobs to circumvent any imposed timeout, a couple of seconds timeout
> becomes several months of jobs before the GPU runs out of memory and is
> unable to accept any more jobs). Any ioctl that blocks while holding a HW
> resource renders itself liable to a user controllable livelock, you know
> this, because it is blocking the signaling of those earlier jobs.
> Worrying about things that are entirely within our control and hence
> avoidable, misses the point.

You are completely missing the problem here.

As you correctly pointed out, a userspace thread blocking on something
is perfectly acceptable. And that's how
bc9c80fe01a2570a2fd78abbc492b377b5fda068 works as well.

And bc9c80fe01a2570a2fd78abbc492b377b5fda068 only implements waiting, so
that during the CS or WAIT IOCTL we can block for the fence to appear.

What happens in your approach is that the kernel starts to wait for
userspace in its memory reclaim path. That is exactly the kind of
problem Daniel's patches now point out immediately.

So while the hardware can obviously get stuck in an endless loop and
needs to be recovered, this here has the potential for a system-wide
kernel deadlock which is not recoverable.
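
Reduced to its essence, the objectionable pattern is a reclaim path that
ends up doing something like this (the shrinker callback and helpers are
hypothetical with definitions elided; dma_fence_wait() is the real API):

#include <linux/dma-fence.h>
#include <linux/shrinker.h>

struct dma_fence *fence_blocking_this_reclaim(struct shrink_control *sc);
unsigned long free_now_idle_pages(struct shrink_control *sc);

static unsigned long illustrative_shrink_scan(struct shrinker *shrinker,
                                              struct shrink_control *sc)
{
        struct dma_fence *fence = fence_blocking_this_reclaim(sc);

        /*
         * Reclaim now depends on this fence signaling.  If it is a proxy
         * that only userspace can resolve, and that userspace is itself
         * stuck in an allocation waiting for this reclaim to make
         * progress, nothing ever moves again: a system-wide deadlock.
         */
        if (fence) {
                dma_fence_wait(fence, false);
                dma_fence_put(fence);
        }

        return free_now_idle_pages(sc);
}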

The whole approach is an absolutely clear NAK!

Regards,
Christian.

> -Chris


* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-25 12:59                   ` Christian König
@ 2020-06-25 13:23                     ` Chris Wilson
  2020-06-25 14:02                       ` Christian König
  0 siblings, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-25 13:23 UTC (permalink / raw)
  To: Christian König, Christian König, Dave Airlie
  Cc: Intel Graphics Development

Quoting Christian König (2020-06-25 13:59:16)
> Am 25.06.20 um 14:48 schrieb Chris Wilson:
> > Quoting Christian König (2020-06-25 09:11:35)
> >> Am 24.06.20 um 22:18 schrieb Chris Wilson:
> >>> Quoting Dave Airlie (2020-06-24 20:04:02)
> >>>> On Wed, 24 Jun 2020 at 07:19, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >>>>> Quoting Dave Airlie (2020-06-23 22:01:24)
> >>>>>> On Tue, 23 Jun 2020 at 20:03, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >>>>>>> Quoting Thomas Hellström (Intel) (2020-06-23 10:33:20)
> >>>>>>>> Hi, Chris!
> >>>>>>>>
> >>>>>>>> On 6/22/20 11:59 AM, Chris Wilson wrote:
> >>>>>>>>> In order to actually handle eviction and what not, we need to process
> >>>>>>>>> all the objects together under a common lock, reservation_ww_class. As
> >>>>>>>>> such, do a memory reservation pass after looking up the object/vma,
> >>>>>>>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
> >>>>>>>>> flushing and ofc execution].
> >>>>>>>>>
> >>>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>>>>>> ---
> >>>>>>>>>     .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> >>>>>>>>>     1 file changed, 70 insertions(+), 21 deletions(-)
> >>>>>>>>>
> >>>>>>>> Which tree is this against? The series doesn't apply cleanly against
> >>>>>>>> drm-tip?
> >>>>>>> It's continuing on from the scheduler patches, the bug fixes and the
> >>>>>>> iris-deferred-fence work. I thought throwing all of those old patches
> >>>>>>> into the pile would have been distracting.
> >>>>>>>
> >>>>>>>> ...
> >>>>>>>>
> >>>>>>>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
> >>>>>>>>> +{
> >>>>>>>>> +     const u64 idx = eb->context->timeline->fence_context;
> >>>>>>>>> +     struct ww_acquire_ctx acquire;
> >>>>>>>>> +     struct eb_vma *ev;
> >>>>>>>>> +     int err;
> >>>>>>>>> +
> >>>>>>>>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
> >>>>>>>>> +     if (!eb->mm_fence)
> >>>>>>>>> +             return -ENOMEM;
> >>>>>>>> Where are the proxy fence functions defined?
> >>>>>>> In dma-fence-proxy.c ;)
> >>>>>> The dma-fence-proxy that Christian NAKed before?
> >>>>> I do not have an email from Christian about dma-fence-proxy in the last
> >>>>> 3 years it has been on the list.
> >>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fdri-devel%2Faeb0373d-0583-d922-3b73-93668c27d177%40amd.com%2F&amp;data=02%7C01%7Cchristian.koenig%40amd.com%7Ccb060e358d844784815708d819061868%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637286861292346372&amp;sdata=DlHistmqPi%2BtwdcT%2FycrtRpoLGZ6xcBD%2FkPvVZcQ2YQ%3D&amp;reserved=0
> >>> Darn, I skimmed the thread title and thought it was just about the
> >>> timelines.
> >>>
> >>>> I'm assuming this was about patch 8 there which to me looks like proxy
> >>>> fences but maybe by threading is off reading that.
> >>> The deadlocks are easy to resolve. The fence is either signaled normally
> >>> by userspace, they create a deadlock that is rejected by checking the dag
> >>> and the fence signaled with an error (and work cancelled, error
> >>> propagated back to userspace if they kept the output fence around), or
> >>> userspace forgets entirely about the fence they were waiting on in which
> >>> case it is signaled by closing the syncobjs [sadly not in error though,
> >>> I hoping to report EPIPE] on process termination.
> >> And exactly that concept is still a big NAK.
> >>
> >> The kernel memory management depends on dma_fences to be signaling as
> >> soon as they are existing.
> >>
> >> Just imagine what Daniel's dependency patches would splat out when you
> >> do something like this and correctly annotate the signaling code path.
> > Nothing at all. Forward progress of the waiter does not solely depend on
> > the signaler, just as in bc9c80fe01a2570a2fd78abbc492b377b5fda068.
> >   
> >> Proxy fences, especially when they depend on userspace for signaling are
> >> an absolutely NO-GO.
> > We are in full control of the signaling and are able to cancel the pending
> > userspace operation, move it off to one side and shutdown the HW,
> > whatever. We can and do do dependency analysis of the fence contexts to
> > avoid deadlocks, just as easily as detecting recursion.
> >
> > To claim that userspace is not already able to control signaling, is a
> > false dichotomy. Userspace is fully able to lock the HW resources
> > indefinitely (even if you cap every job, one can always build a chain of
> > jobs to circumvent any imposed timeout, a couple of seconds timeout
> > becomes several months of jobs before the GPU runs out of memory and is
> > unable to accept any more jobs). Any ioctl that blocks while holding a HW
> > resource renders itself liable to a user controllable livelock, you know
> > this, because it is blocking the signaling of those earlier jobs.
> > Worrying about things that are entirely within our control and hence
> > avoidable, misses the point.
> 
> You are completely missing the problem here.
> 
> As you correctly pointed out that an userspace thread blocks on 
> something is perfectly acceptable. And that's how 
> bc9c80fe01a2570a2fd78abbc492b377b5fda068 works as well.
> 
> And bc9c80fe01a2570a2fd78abbc492b377b5fda068 only implements waiting so 
> that during CS or WAIT IOCTL we can block for the fence to appear.
> 
> 
> What happens in your approach is that the kernel starts to wait for 
> userspace in its memory reclaim path. That is exactly the kind of 
> problem Daniels patches now point out immediately.

No we don't.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-25 13:23                     ` Chris Wilson
@ 2020-06-25 14:02                       ` Christian König
  2020-06-25 15:10                         ` Chris Wilson
  0 siblings, 1 reply; 48+ messages in thread
From: Christian König @ 2020-06-25 14:02 UTC (permalink / raw)
  To: Chris Wilson, Christian König, Dave Airlie
  Cc: Intel Graphics Development

Am 25.06.20 um 15:23 schrieb Chris Wilson:
> Quoting Christian König (2020-06-25 13:59:16)
>> Am 25.06.20 um 14:48 schrieb Chris Wilson:
>>> Quoting Christian König (2020-06-25 09:11:35)
>>>> Am 24.06.20 um 22:18 schrieb Chris Wilson:
>>>>> Quoting Dave Airlie (2020-06-24 20:04:02)
>>>>>> On Wed, 24 Jun 2020 at 07:19, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>>>>>>> Quoting Dave Airlie (2020-06-23 22:01:24)
>>>>>>>> On Tue, 23 Jun 2020 at 20:03, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>>>>>>>>> Quoting Thomas Hellström (Intel) (2020-06-23 10:33:20)
>>>>>>>>>> Hi, Chris!
>>>>>>>>>>
>>>>>>>>>> On 6/22/20 11:59 AM, Chris Wilson wrote:
>>>>>>>>>>> In order to actually handle eviction and what not, we need to process
>>>>>>>>>>> all the objects together under a common lock, reservation_ww_class. As
>>>>>>>>>>> such, do a memory reservation pass after looking up the object/vma,
>>>>>>>>>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
>>>>>>>>>>> flushing and ofc execution].
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>>>>>>> ---
>>>>>>>>>>>      .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
>>>>>>>>>>>      1 file changed, 70 insertions(+), 21 deletions(-)
>>>>>>>>>>>
>>>>>>>>>> Which tree is this against? The series doesn't apply cleanly against
>>>>>>>>>> drm-tip?
>>>>>>>>> It's continuing on from the scheduler patches, the bug fixes and the
>>>>>>>>> iris-deferred-fence work. I thought throwing all of those old patches
>>>>>>>>> into the pile would have been distracting.
>>>>>>>>>
>>>>>>>>>> ...
>>>>>>>>>>
>>>>>>>>>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
>>>>>>>>>>> +{
>>>>>>>>>>> +     const u64 idx = eb->context->timeline->fence_context;
>>>>>>>>>>> +     struct ww_acquire_ctx acquire;
>>>>>>>>>>> +     struct eb_vma *ev;
>>>>>>>>>>> +     int err;
>>>>>>>>>>> +
>>>>>>>>>>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
>>>>>>>>>>> +     if (!eb->mm_fence)
>>>>>>>>>>> +             return -ENOMEM;
>>>>>>>>>> Where are the proxy fence functions defined?
>>>>>>>>> In dma-fence-proxy.c ;)
>>>>>>>> The dma-fence-proxy that Christian NAKed before?
>>>>>>> I do not have an email from Christian about dma-fence-proxy in the last
>>>>>>> 3 years it has been on the list.
>>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fdri-devel%2Faeb0373d-0583-d922-3b73-93668c27d177%40amd.com%2F&amp;data=02%7C01%7Cchristian.koenig%40amd.com%7Ccb060e358d844784815708d819061868%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637286861292346372&amp;sdata=DlHistmqPi%2BtwdcT%2FycrtRpoLGZ6xcBD%2FkPvVZcQ2YQ%3D&amp;reserved=0
>>>>> Darn, I skimmed the thread title and thought it was just about the
>>>>> timelines.
>>>>>
>>>>>> I'm assuming this was about patch 8 there which to me looks like proxy
>>>>>> fences but maybe by threading is off reading that.
>>>>> The deadlocks are easy to resolve. The fence is either signaled normally
>>>>> by userspace, they create a deadlock that is rejected by checking the dag
>>>>> and the fence signaled with an error (and work cancelled, error
>>>>> propagated back to userspace if they kept the output fence around), or
>>>>> userspace forgets entirely about the fence they were waiting on in which
>>>>> case it is signaled by closing the syncobjs [sadly not in error though,
>>>>> I hoping to report EPIPE] on process termination.
>>>> And exactly that concept is still a big NAK.
>>>>
>>>> The kernel memory management depends on dma_fences to be signaling as
>>>> soon as they are existing.
>>>>
>>>> Just imagine what Daniel's dependency patches would splat out when you
>>>> do something like this and correctly annotate the signaling code path.
>>> Nothing at all. Forward progress of the waiter does not solely depend on
>>> the signaler, just as in bc9c80fe01a2570a2fd78abbc492b377b5fda068.
>>>    
>>>> Proxy fences, especially when they depend on userspace for signaling are
>>>> an absolutely NO-GO.
>>> We are in full control of the signaling and are able to cancel the pending
>>> userspace operation, move it off to one side and shutdown the HW,
>>> whatever. We can and do do dependency analysis of the fence contexts to
>>> avoid deadlocks, just as easily as detecting recursion.
>>>
>>> To claim that userspace is not already able to control signaling, is a
>>> false dichotomy. Userspace is fully able to lock the HW resources
>>> indefinitely (even if you cap every job, one can always build a chain of
>>> jobs to circumvent any imposed timeout, a couple of seconds timeout
>>> becomes several months of jobs before the GPU runs out of memory and is
>>> unable to accept any more jobs). Any ioctl that blocks while holding a HW
>>> resource renders itself liable to a user controllable livelock, you know
>>> this, because it is blocking the signaling of those earlier jobs.
>>> Worrying about things that are entirely within our control and hence
>>> avoidable, misses the point.
>> You are completely missing the problem here.
>>
>> As you correctly pointed out that an userspace thread blocks on
>> something is perfectly acceptable. And that's how
>> bc9c80fe01a2570a2fd78abbc492b377b5fda068 works as well.
>>
>> And bc9c80fe01a2570a2fd78abbc492b377b5fda068 only implements waiting so
>> that during CS or WAIT IOCTL we can block for the fence to appear.
>>
>>
>> What happens in your approach is that the kernel starts to wait for
>> userspace in its memory reclaim path. That is exactly the kind of
>> problem Daniels patches now point out immediately.
> No we don't.

Well then Daniel's patches are still missing that case :)

See, when signaling a fence depends on userspace doing something, we 
obviously insert circular dependencies between whatever userspace might do 
in a kernel system call and the kernel reclaim path.

That this can't work correctly is actually completely obvious if you look 
at it from this side.

Regards,
Christian.

> -Chris


* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-25 14:02                       ` Christian König
@ 2020-06-25 15:10                         ` Chris Wilson
  2020-06-25 15:47                           ` Christian König
  0 siblings, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-25 15:10 UTC (permalink / raw)
  To: Christian König, Christian König, Dave Airlie
  Cc: Intel Graphics Development

Quoting Christian König (2020-06-25 15:02:41)
> Am 25.06.20 um 15:23 schrieb Chris Wilson:
> > Quoting Christian König (2020-06-25 13:59:16)
> >> Am 25.06.20 um 14:48 schrieb Chris Wilson:
> >>> Quoting Christian König (2020-06-25 09:11:35)
> >>>> Am 24.06.20 um 22:18 schrieb Chris Wilson:
> >>>>> Quoting Dave Airlie (2020-06-24 20:04:02)
> >>>>>> On Wed, 24 Jun 2020 at 07:19, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >>>>>>> Quoting Dave Airlie (2020-06-23 22:01:24)
> >>>>>>>> On Tue, 23 Jun 2020 at 20:03, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >>>>>>>>> Quoting Thomas Hellström (Intel) (2020-06-23 10:33:20)
> >>>>>>>>>> Hi, Chris!
> >>>>>>>>>>
> >>>>>>>>>> On 6/22/20 11:59 AM, Chris Wilson wrote:
> >>>>>>>>>>> In order to actually handle eviction and what not, we need to process
> >>>>>>>>>>> all the objects together under a common lock, reservation_ww_class. As
> >>>>>>>>>>> such, do a memory reservation pass after looking up the object/vma,
> >>>>>>>>>>> which then feeds into the rest of execbuf [relocation, cmdparsing,
> >>>>>>>>>>> flushing and ofc execution].
> >>>>>>>>>>>
> >>>>>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>>>>>>>> ---
> >>>>>>>>>>>      .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 91 ++++++++++++++-----
> >>>>>>>>>>>      1 file changed, 70 insertions(+), 21 deletions(-)
> >>>>>>>>>>>
> >>>>>>>>>> Which tree is this against? The series doesn't apply cleanly against
> >>>>>>>>>> drm-tip?
> >>>>>>>>> It's continuing on from the scheduler patches, the bug fixes and the
> >>>>>>>>> iris-deferred-fence work. I thought throwing all of those old patches
> >>>>>>>>> into the pile would have been distracting.
> >>>>>>>>>
> >>>>>>>>>> ...
> >>>>>>>>>>
> >>>>>>>>>>> +static int eb_reserve_mm(struct i915_execbuffer *eb)
> >>>>>>>>>>> +{
> >>>>>>>>>>> +     const u64 idx = eb->context->timeline->fence_context;
> >>>>>>>>>>> +     struct ww_acquire_ctx acquire;
> >>>>>>>>>>> +     struct eb_vma *ev;
> >>>>>>>>>>> +     int err;
> >>>>>>>>>>> +
> >>>>>>>>>>> +     eb->mm_fence = __dma_fence_create_proxy(0, 0);
> >>>>>>>>>>> +     if (!eb->mm_fence)
> >>>>>>>>>>> +             return -ENOMEM;
> >>>>>>>>>> Where are the proxy fence functions defined?
> >>>>>>>>> In dma-fence-proxy.c ;)
> >>>>>>>> The dma-fence-proxy that Christian NAKed before?
> >>>>>>> I do not have an email from Christian about dma-fence-proxy in the last
> >>>>>>> 3 years it has been on the list.
> >>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fdri-devel%2Faeb0373d-0583-d922-3b73-93668c27d177%40amd.com%2F&amp;data=02%7C01%7Cchristian.koenig%40amd.com%7Ccb060e358d844784815708d819061868%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637286861292346372&amp;sdata=DlHistmqPi%2BtwdcT%2FycrtRpoLGZ6xcBD%2FkPvVZcQ2YQ%3D&amp;reserved=0
> >>>>> Darn, I skimmed the thread title and thought it was just about the
> >>>>> timelines.
> >>>>>
> >>>>>> I'm assuming this was about patch 8 there which to me looks like proxy
> >>>>>> fences but maybe by threading is off reading that.
> >>>>> The deadlocks are easy to resolve. The fence is either signaled normally
> >>>>> by userspace, they create a deadlock that is rejected by checking the dag
> >>>>> and the fence signaled with an error (and work cancelled, error
> >>>>> propagated back to userspace if they kept the output fence around), or
> >>>>> userspace forgets entirely about the fence they were waiting on in which
> >>>>> case it is signaled by closing the syncobjs [sadly not in error though,
> >>>>> I hoping to report EPIPE] on process termination.
> >>>> And exactly that concept is still a big NAK.
> >>>>
> >>>> The kernel memory management depends on dma_fences to be signaling as
> >>>> soon as they are existing.
> >>>>
> >>>> Just imagine what Daniel's dependency patches would splat out when you
> >>>> do something like this and correctly annotate the signaling code path.
> >>> Nothing at all. Forward progress of the waiter does not solely depend on
> >>> the signaler, just as in bc9c80fe01a2570a2fd78abbc492b377b5fda068.
> >>>    
> >>>> Proxy fences, especially when they depend on userspace for signaling are
> >>>> an absolutely NO-GO.
> >>> We are in full control of the signaling and are able to cancel the pending
> >>> userspace operation, move it off to one side and shutdown the HW,
> >>> whatever. We can and do do dependency analysis of the fence contexts to
> >>> avoid deadlocks, just as easily as detecting recursion.
> >>>
> >>> To claim that userspace is not already able to control signaling, is a
> >>> false dichotomy. Userspace is fully able to lock the HW resources
> >>> indefinitely (even if you cap every job, one can always build a chain of
> >>> jobs to circumvent any imposed timeout, a couple of seconds timeout
> >>> becomes several months of jobs before the GPU runs out of memory and is
> >>> unable to accept any more jobs). Any ioctl that blocks while holding a HW
> >>> resource renders itself liable to a user controllable livelock, you know
> >>> this, because it is blocking the signaling of those earlier jobs.
> >>> Worrying about things that are entirely within our control and hence
> >>> avoidable, misses the point.
> >> You are completely missing the problem here.
> >>
> >> As you correctly pointed out that an userspace thread blocks on
> >> something is perfectly acceptable. And that's how
> >> bc9c80fe01a2570a2fd78abbc492b377b5fda068 works as well.
> >>
> >> And bc9c80fe01a2570a2fd78abbc492b377b5fda068 only implements waiting so
> >> that during CS or WAIT IOCTL we can block for the fence to appear.
> >>
> >>
> >> What happens in your approach is that the kernel starts to wait for
> >> userspace in its memory reclaim path. That is exactly the kind of
> >> problem Daniels patches now point out immediately.
> > No we don't.

To be clear, adding a wait to direct reclaim incurs latency across the
whole system, and attracts the ire of users and core developers alike.

Having fielded the bug reports for that, we try to avoid any case where
we would wait inside direct reclaim. We still do cause kswapd to wait if
there's nothing else left to clean up. We also try to apply backpressure
to client memory allocators directly; I would like to improve that path
to have memory prioritisation.

So I still consider direct reclaim latency to be a serious enough issue
that a blanket recommendation should be: don't wait.
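
As a rough sketch of that recommendation (this is not the actual i915
shrinker; my_obj, my_obj_list and my_obj_discard_pages() are made up for
the illustration), a scan callback that refuses to sleep looks roughly
like this:

#include <linux/dma-resv.h>
#include <linux/list.h>
#include <linux/shrinker.h>

struct my_obj {
	struct list_head link;
	struct dma_resv *resv;
	unsigned long nr_pages;
};

static LIST_HEAD(my_obj_list);	/* hypothetical; list locking omitted */

/* Made-up helper standing in for actually releasing the backing pages. */
static unsigned long my_obj_discard_pages(struct my_obj *obj)
{
	return obj->nr_pages;
}

static unsigned long
my_shrink_scan(struct shrinker *shrinker, struct shrink_control *sc)
{
	unsigned long freed = 0;
	struct my_obj *obj, *next;

	list_for_each_entry_safe(obj, next, &my_obj_list, link) {
		/*
		 * Never block in reclaim: if someone else holds the
		 * reservation, skip the object rather than wait for it.
		 */
		if (!dma_resv_trylock(obj->resv))
			continue;

		freed += my_obj_discard_pages(obj);
		dma_resv_unlock(obj->resv);

		if (freed >= sc->nr_to_scan)
			break;
	}

	return freed;
}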

> Well then Daniels patches are still missing that case :)

We have the DAG of fences; we can use that information to avoid adding
an implicit coupling between execution contexts. Borrowing lockdep, with
its heavily aliased chains, seems shallow when we have the fine-grained
information available to the scheduler to solve what is essentially a
scheduling issue.

> See when signaling a fence depends userspace doing something, we 
> obviously insert circle dependencies between whatever userspace might do 
> in a kernel system call and the kernel reclaim path.
> 
> That this can't work correctly is actually completely obvious if you see 
> it from this side.

The waits are unbounded, whether indefinite or just a matter of milliseconds;
ergo you are not allowed to wait inside direct reclaim. Userspace
dictates the forward progress of signaling chains, and worse, one client can
indirectly manipulate another's progress; the entire kernel is inside
that execution context. Totally agree on that. [But inside the kernel, we
do have the information to track even implicit execution coupling inside 
the drivers, and where we don't have that information we have to assume
it is outside of our control.]
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-25 15:10                         ` Chris Wilson
@ 2020-06-25 15:47                           ` Christian König
  2020-06-25 17:42                             ` Chris Wilson
  0 siblings, 1 reply; 48+ messages in thread
From: Christian König @ 2020-06-25 15:47 UTC (permalink / raw)
  To: Chris Wilson, Christian König, Dave Airlie
  Cc: Intel Graphics Development

Am 25.06.20 um 17:10 schrieb Chris Wilson:
> We have the DAG of fences, we can use that information to avoid adding
> an implicit coupling between execution contexts.

No, we can't. And it sounds like you still have not understood the 
underlying problem.

See, this has nothing to do with the fences themselves or their DAG.

When you depend on userspace to do another submission so your fence can 
start processing, you end up depending on whatever userspace does.

This in turn means that when userspace makes a system call (or takes a 
page fault) it is possible that this ends up in the reclaim code path.


And while we want to avoid it, Daniel and I have already discussed this 
multiple times, and we agree that being able to do fence waits in the 
reclaim code path is still a must-have.

So what happens is that you have a dependency between fence submission 
-> userspace -> reclaim path -> fence submission. And that is a circular 
dependency, no matter what your DAG looks like.


In other words this whole approach does not work, is a clear NAK, and I 
can only advise Dave to *not* merge it.

Regards,
Christian.

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-25 15:47                           ` Christian König
@ 2020-06-25 17:42                             ` Chris Wilson
  2020-06-26  8:10                               ` Chris Wilson
  0 siblings, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-25 17:42 UTC (permalink / raw)
  To: Christian König, Christian König, Dave Airlie
  Cc: Intel Graphics Development

Quoting Christian König (2020-06-25 16:47:09)
> Am 25.06.20 um 17:10 schrieb Chris Wilson:
> > We have the DAG of fences, we can use that information to avoid adding
> > an implicit coupling between execution contexts.
> 
> No, we can't. And it sounds like you still have not understood the 
> underlying problem.
> 
> See this has nothing to do with the fences itself or their DAG.
> 
> When you depend on userspace to do another submission so your fence can 
> start processing you end up depending on whatever userspace does.

HW dependency on userspace is explicit in the ABI and client APIs, and
in the direct control userspace has over the HW.

> This in turn means when userspace calls a system call (or does page 
> fault) it is possible that this ends up in the reclaim code path.

We have both said the very same thing.
 
> And while we want to avoid it both Daniel and I already discussed this 
> multiple times and we agree it is still a must have to be able to do 
> fence waits in the reclaim code path.

But we came to the opposite conclusion, for doing that wait harms the
unrelated caller and the reclaim is opportunistic. There is no need for
that caller to reclaim that particular page when it can have any other. Why
did you even choose that page to reclaim? Inducing latency in the caller is
a bug, has been reported previously as a bug, and is still considered a bug.
[But at the end of the day, if the system is out of memory, then you have
to pick a victim.]

> So what happens is that you have a dependency between fence submission 
> -> userspace -> reclaim path -> fence submission. And that is a circle 
> dependency, no matter what your DAG looks like.

Sigh. We have both said the very same thing.

> In other words this whole approach does not work, is a clear NAK and I 
> can only advise Dave to *not* merge it.

If you are talking about the proxy, then it looks like this [if you
insist on having that wait in the reclaim]:
1. userspace submits a request, waiting for the future fence
2. another thread that is due to signal it enters the kernel, hits direct
reclaim, and waits for the future fence [because you insist on this when
it is not necessary and is an unbounded latency issue for general cases],
1. times out

vs

1. userspace submits wait-for-submit; blocks
2. other thread enters kernel and waits for reclaim on another arbitrary
fence, or anything, could even be waiting for a signal from 1.
1. times out


Userspace directly controls fence signaling. Any wait whatsoever could
be a deadlock on a resource that is outside of our [immediate] control.
Further, if that wait is underneath a mutex or other semaphore that
another client can contend on, it is now able to inject its deadlock
into an unwitting partner.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-25 17:42                             ` Chris Wilson
@ 2020-06-26  8:10                               ` Chris Wilson
  2020-06-26  8:54                                 ` Christian König
  0 siblings, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-26  8:10 UTC (permalink / raw)
  To: Christian König, Christian König, Dave Airlie
  Cc: Intel Graphics Development

Quoting Chris Wilson (2020-06-25 18:42:41)
> Quoting Christian König (2020-06-25 16:47:09)
> > Am 25.06.20 um 17:10 schrieb Chris Wilson:
> > > We have the DAG of fences, we can use that information to avoid adding
> > > an implicit coupling between execution contexts.
> > 
> > No, we can't. And it sounds like you still have not understood the 
> > underlying problem.
> > 
> > See this has nothing to do with the fences itself or their DAG.
> > 
> > When you depend on userspace to do another submission so your fence can 
> > start processing you end up depending on whatever userspace does.
> 
> HW dependency on userspace is explicit in the ABI and client APIs, and
> the direct control userspace has over the HW.
> 
> > This in turn means when userspace calls a system call (or does page 
> > fault) it is possible that this ends up in the reclaim code path.
> 
> We have both said the very same thing.
>  
> > And while we want to avoid it both Daniel and I already discussed this 
> > multiple times and we agree it is still a must have to be able to do 
> > fence waits in the reclaim code path.
> 
> But came to the opposite conclusion. For doing that wait harms the
> unrelated caller and the reclaim is opportunistic. There is no need for
> that caller to reclaim that page, when it can have any other. Why did you
> even choose that page to reclaim? Inducing latency in the caller is a bug,
> has been reported previously as a bug, and still considered a bug. [But at
> the end of the day, if the system is out of memory, then you have to pick
> a victim.]

An example

Thread A				Thread B

	submit(VkCmdWaitEvents)
	recvfrom(ThreadB)	...	sendto(ThreadB)
					\- alloc_page
					 \- direct reclaim
					  \- dma_fence_wait(A)
	VkSetEvent()

Regardless of that actual deadlock, waiting on an arbitrary fence incurs
an unbounded latency which is unacceptable for direct reclaim.
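
As a side note, a minimal userspace sketch of the wait/signal pair in
Thread A above (assuming cmd, device and event have been created
elsewhere and cmd is in the recording state):

#include <vulkan/vulkan.h>

/* GPU-side wait that only completes once the host sets the event, i.e.
 * signaling of any fence behind this submission stays under userspace
 * control until vkSetEvent() is called. */
static void record_wait(VkCommandBuffer cmd, VkEvent event)
{
	vkCmdWaitEvents(cmd, 1, &event,
			VK_PIPELINE_STAGE_HOST_BIT,
			VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
			0, NULL, 0, NULL, 0, NULL);
}

/* Later, from the CPU, after the recvfrom()/sendto() exchange: */
static void set_event(VkDevice device, VkEvent event)
{
	vkSetEvent(device, event);
}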

Online debugging can indefinitely suspend fence signaling, and the only
guarantee we make of forward progress, in some cases, is process
termination.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-26  8:10                               ` Chris Wilson
@ 2020-06-26  8:54                                 ` Christian König
  2020-06-26  8:56                                   ` Christian König
  2020-06-26 11:10                                   ` Chris Wilson
  0 siblings, 2 replies; 48+ messages in thread
From: Christian König @ 2020-06-26  8:54 UTC (permalink / raw)
  To: Chris Wilson, Christian König, Dave Airlie
  Cc: Intel Graphics Development

Am 26.06.20 um 10:10 schrieb Chris Wilson:
> Quoting Chris Wilson (2020-06-25 18:42:41)
>> Quoting Christian König (2020-06-25 16:47:09)
>>> Am 25.06.20 um 17:10 schrieb Chris Wilson:
>>>> We have the DAG of fences, we can use that information to avoid adding
>>>> an implicit coupling between execution contexts.
>>> No, we can't. And it sounds like you still have not understood the
>>> underlying problem.
>>>
>>> See this has nothing to do with the fences itself or their DAG.
>>>
>>> When you depend on userspace to do another submission so your fence can
>>> start processing you end up depending on whatever userspace does.
>> HW dependency on userspace is explicit in the ABI and client APIs, and
>> the direct control userspace has over the HW.
>>
>>> This in turn means when userspace calls a system call (or does page
>>> fault) it is possible that this ends up in the reclaim code path.
>> We have both said the very same thing.

Then I'm really wondering why you don't come to the same conclusion :)

>>   
>>> And while we want to avoid it both Daniel and I already discussed this
>>> multiple times and we agree it is still a must have to be able to do
>>> fence waits in the reclaim code path.
>> But came to the opposite conclusion. For doing that wait harms the
>> unrelated caller and the reclaim is opportunistic. There is no need for
>> that caller to reclaim that page, when it can have any other. Why did you
>> even choose that page to reclaim? Inducing latency in the caller is a bug,
>> has been reported previously as a bug, and still considered a bug. [But at
>> the end of the day, if the system is out of memory, then you have to pick
>> a victim.]

Correct. But this is also not limited to the reclaim path, as any kernel 
system call or page fault can cause a problem as well.

In other words "fence -> userspace -> page fault -> fence" or "fence -> 
userspace -> system call -> fence" can easily cause the same problem and 
that is not avoidable.

> An example
>
> Thread A				Thread B
>
> 	submit(VkCmdWaitEvents)
> 	recvfrom(ThreadB)	...	sendto(ThreadB)
> 					\- alloc_page
> 					 \- direct reclaim
> 					  \- dma_fence_wait(A)
> 	VkSetEvent()
>
> Regardless of that actual deadlock, waiting on an arbitrary fence incurs
> an unbounded latency which is unacceptable for direct reclaim.
>
> Online debugging can indefinitely suspend fence signaling, and the only
> guarantee we make of forward progress, in some cases, is process
> termination.

And exactly that is what doesn't work. You don't have any forward 
progress any more because you ran into a software deadlock.

In other words the signaling of a fence depends on the welfare of 
userspace. You can try to kill userspace, but that kill itself can end up 
waiting for the very fence you are trying to signal in the first place.

See, the difference to a deadlock on the GPU is that you can always 
kill a running job or process even if it is stuck on something else. 
But if the kernel is deadlocked with itself you can't kill the process 
any more; the only option left to get cleanly out of this is to reboot 
the kernel.

The only way to avoid this would be to never ever wait for the fence in 
the kernel, and then your whole construct is not useful any more.

I'm running out of ideas for how to explain what the problem is here....

Regards,
Christian.

> -Chris


* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-26  8:54                                 ` Christian König
@ 2020-06-26  8:56                                   ` Christian König
  2020-06-26 11:10                                   ` Chris Wilson
  1 sibling, 0 replies; 48+ messages in thread
From: Christian König @ 2020-06-26  8:56 UTC (permalink / raw)
  To: Chris Wilson, Dave Airlie; +Cc: Daniel Vetter, Intel Graphics Development

Hi Daniel,

could you help me explain to Christoph why this doesn't work?

We have exercised this multiple times in the past month and I'm really 
surprised that anybody is still trying this approach.

Thanks,
Christian.

Am 26.06.20 um 10:54 schrieb Christian König:
> Am 26.06.20 um 10:10 schrieb Chris Wilson:
>> Quoting Chris Wilson (2020-06-25 18:42:41)
>>> Quoting Christian König (2020-06-25 16:47:09)
>>>> Am 25.06.20 um 17:10 schrieb Chris Wilson:
>>>>> We have the DAG of fences, we can use that information to avoid 
>>>>> adding
>>>>> an implicit coupling between execution contexts.
>>>> No, we can't. And it sounds like you still have not understood the
>>>> underlying problem.
>>>>
>>>> See this has nothing to do with the fences itself or their DAG.
>>>>
>>>> When you depend on userspace to do another submission so your fence 
>>>> can
>>>> start processing you end up depending on whatever userspace does.
>>> HW dependency on userspace is explicit in the ABI and client APIs, and
>>> the direct control userspace has over the HW.
>>>
>>>> This in turn means when userspace calls a system call (or does page
>>>> fault) it is possible that this ends up in the reclaim code path.
>>> We have both said the very same thing.
>
> Then I'm really wondering why you don't come to the same conclusion :)
>
>>>> And while we want to avoid it both Daniel and I already discussed this
>>>> multiple times and we agree it is still a must have to be able to do
>>>> fence waits in the reclaim code path.
>>> But came to the opposite conclusion. For doing that wait harms the
>>> unrelated caller and the reclaim is opportunistic. There is no need for
>>> that caller to reclaim that page, when it can have any other. Why 
>>> did you
>>> even choose that page to reclaim? Inducing latency in the caller is 
>>> a bug,
>>> has been reported previously as a bug, and still considered a bug. 
>>> [But at
>>> the end of the day, if the system is out of memory, then you have to 
>>> pick
>>> a victim.]
>
> Correct. But this is also not limited to the reclaim path as any 
> kernel system call and page fault can cause a problem as well.
>
> In other words "fence -> userspace -> page fault -> fence" or "fence 
> -> userspace -> system call -> fence" can easily cause the same 
> problem and that is not avoidable.
>
>> An example
>>
>> Thread A                Thread B
>>
>>     submit(VkCmdWaitEvents)
>>     recvfrom(ThreadB)    ...    sendto(ThreadB)
>>                     \- alloc_page
>>                      \- direct reclaim
>>                       \- dma_fence_wait(A)
>>     VkSetEvent()
>>
>> Regardless of that actual deadlock, waiting on an arbitrary fence incurs
>> an unbounded latency which is unacceptable for direct reclaim.
>>
>> Online debugging can indefinitely suspend fence signaling, and the only
>> guarantee we make of forward progress, in some cases, is process
>> termination.
>
> And exactly that is what doesn't work. You don't have any forward 
> progress any more because you ran into a software deadlock.
>
> In other words the signaling of a fence depends on the welfare of 
> userspace. You can try to kill userspace, but this can wait for the 
> fence you try to signal in the first place.
>
> See the difference to a deadlock on the GPU is that you can can always 
> kill a running job or process even if it is stuck with something else. 
> But if the kernel is deadlocked with itself you can't kill the process 
> any more, the only option left to get cleanly out of this is to reboot 
> the kernel.
>
> The only way to avoid this would be to never ever wait for the fence 
> in the kernel and then your whole construct is not useful any more.
>
> I'm running out of ideas how to explain what the problem is here....
>
> Regards,
> Christian.
>
>> -Chris
>


* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-26  8:54                                 ` Christian König
  2020-06-26  8:56                                   ` Christian König
@ 2020-06-26 11:10                                   ` Chris Wilson
  2020-06-26 11:35                                     ` Christian König
  1 sibling, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-26 11:10 UTC (permalink / raw)
  To: Christian König, Christian König, Dave Airlie
  Cc: Intel Graphics Development

Quoting Christian König (2020-06-26 09:54:19)
> Am 26.06.20 um 10:10 schrieb Chris Wilson:
> > Quoting Chris Wilson (2020-06-25 18:42:41)
> >> Quoting Christian König (2020-06-25 16:47:09)
> >>> Am 25.06.20 um 17:10 schrieb Chris Wilson:
> >>>> We have the DAG of fences, we can use that information to avoid adding
> >>>> an implicit coupling between execution contexts.
> >>> No, we can't. And it sounds like you still have not understood the
> >>> underlying problem.
> >>>
> >>> See this has nothing to do with the fences itself or their DAG.
> >>>
> >>> When you depend on userspace to do another submission so your fence can
> >>> start processing you end up depending on whatever userspace does.
> >> HW dependency on userspace is explicit in the ABI and client APIs, and
> >> the direct control userspace has over the HW.
> >>
> >>> This in turn means when userspace calls a system call (or does page
> >>> fault) it is possible that this ends up in the reclaim code path.
> >> We have both said the very same thing.
> 
> Then I'm really wondering why you don't come to the same conclusion :)
> 
> >>   
> >>> And while we want to avoid it both Daniel and I already discussed this
> >>> multiple times and we agree it is still a must have to be able to do
> >>> fence waits in the reclaim code path.
> >> But came to the opposite conclusion. For doing that wait harms the
> >> unrelated caller and the reclaim is opportunistic. There is no need for
> >> that caller to reclaim that page, when it can have any other. Why did you
> >> even choose that page to reclaim? Inducing latency in the caller is a bug,
> >> has been reported previously as a bug, and still considered a bug. [But at
> >> the end of the day, if the system is out of memory, then you have to pick
> >> a victim.]
> 
> Correct. But this is also not limited to the reclaim path as any kernel 
> system call and page fault can cause a problem as well.

Yes. Hence the effort to avoid blocking and implicit waits in those paths,
and why flagging those waits is better than accepting them. The necessary
evil should be annotated; everything that is unnecessary should be
avoided.

And it is the user->kernel entry points that are important, as they are
uncontrolled; directly nesting execution contexts is controlled.

And yes direct reclaim is the easiest and most obvious case to avoid
unbounded waits inside unknown contexts.

> In other words "fence -> userspace -> page fault -> fence" or "fence -> 
> userspace -> system call -> fence" can easily cause the same problem and 
> that is not avoidable.
> 
> > An example
> >
> > Thread A                              Thread B
> >
> >       submit(VkCmdWaitEvents)
> >       recvfrom(ThreadB)       ...     sendto(ThreadB)
> >                                       \- alloc_page
> >                                        \- direct reclaim
> >                                         \- dma_fence_wait(A)
> >       VkSetEvent()
> >
> > Regardless of that actual deadlock, waiting on an arbitrary fence incurs
> > an unbounded latency which is unacceptable for direct reclaim.
> >
> > Online debugging can indefinitely suspend fence signaling, and the only
> > guarantee we make of forward progress, in some cases, is process
> > termination.
> 
> And exactly that is what doesn't work. You don't have any forward 
> progress any more because you ran into a software deadlock.

Only one side is halted. Everything on that side comes to a grinding
halt.

What about checkpoint/restore, or suspend/resume, where we need to suspend
all execution, move all the resources to one side, then put everything
back, without cancelling the fences? Same halting problem, no?

We also do similar for resets. Suspend the hanging context, move it and
all dependent execution off to one side; record what we can, clean up
what we have to, then move what remains of the execution back to finish
signaling.

> In other words the signaling of a fence depends on the welfare of 
> userspace. You can try to kill userspace, but this can wait for the 
> fence you try to signal in the first place.

The only scenario that fits what you are describing here [userspace
ignoring a signal] is if you used an uninterruptible wait. Under what
circumstances during normal execution would you do that? If it's
someone else's wait, then it is a bug outside of our control.

But if you have chosen to cancel the fences, there is nothing to stop
the signaling.

> See the difference to a deadlock on the GPU is that you can can always 
> kill a running job or process even if it is stuck with something else. 
> But if the kernel is deadlocked with itself you can't kill the process 
> any more, the only option left to get cleanly out of this is to reboot 
> the kernel.

However, I say that is under our control. We know what fences are in an
execution context, just as easily as we know that we are inside an
execution context. And yes, the easiest, the most restrictive way to
control it is to say don't bother.

> The only way to avoid this would be to never ever wait for the fence in 
> the kernel and then your whole construct is not useful any more.

I advocate for moving as much as is feasible into the parallelised
pipeline, for some waits are required by userspace as a necessary evil.

> I'm running out of ideas how to explain what the problem is here....

Oh, we agree on the problem; we appear to disagree on whether the implicit
waits themselves are a serious existing problem, and whether they are
worth the effort to avoid or, at least, mitigate.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-26 11:10                                   ` Chris Wilson
@ 2020-06-26 11:35                                     ` Christian König
  2020-06-26 13:08                                       ` Chris Wilson
  0 siblings, 1 reply; 48+ messages in thread
From: Christian König @ 2020-06-26 11:35 UTC (permalink / raw)
  To: Chris Wilson, Christian König, Dave Airlie
  Cc: Intel Graphics Development

Am 26.06.20 um 13:10 schrieb Chris Wilson:
> Quoting Christian König (2020-06-26 09:54:19)
> [SNIP]
>> In other words "fence -> userspace -> page fault -> fence" or "fence ->
>> userspace -> system call -> fence" can easily cause the same problem and
>> that is not avoidable.
>>
>>> An example
>>>
>>> Thread A                              Thread B
>>>
>>>        submit(VkCmdWaitEvents)
>>>        recvfrom(ThreadB)       ...     sendto(ThreadB)
>>>                                        \- alloc_page
>>>                                         \- direct reclaim
>>>                                          \- dma_fence_wait(A)
>>>        VkSetEvent()
>>>
>>> Regardless of that actual deadlock, waiting on an arbitrary fence incurs
>>> an unbounded latency which is unacceptable for direct reclaim.
>>>
>>> Online debugging can indefinitely suspend fence signaling, and the only
>>> guarantee we make of forward progress, in some cases, is process
>>> termination.
>> And exactly that is what doesn't work. You don't have any forward
>> progress any more because you ran into a software deadlock.
> Only one side is halted. Everything on that side comes to a grinding
> halt.
>
> What about checkpoint/restore, suspend/resume? Where we need to suspend
> all execution, move all the resources to one side, then put everything
> back, without cancelling the fences. Same halting problem, no?

What are you talking about? Of course we either wait for all fences to 
complete or cancel them on suspend.

> We also do similar for resets. Suspend the hanging context, move it and
> all dependent execution off to one side; record what we can, clean up
> what we have to, then move what remains of the execution back to finish
> signaling.

Yes, but this is not possible in this situation. In the bad case you 
have a kernel deadlock, and that can't be cleaned up in any way.

The only solution left in that situation is to reset the system or at 
least reload the kernel, and that is not acceptable.

>> In other words the signaling of a fence depends on the welfare of
>> userspace. You can try to kill userspace, but this can wait for the
>> fence you try to signal in the first place.
> The only scenario that fits what you are describing here [userspace
> ignoring a signal] is if you used an uninterruptible wait. Under what
> circumstances during normal execution would you do that? If it's
> someone else's wait, a bug outside of our control.

Uninterruptible waits are a necessity.

Just take a look at the dma_fence_wait() interface. Why do you think we 
have the ability to wait uninterruptibly there?

We need this when there is no other way of recovering, for example when 
operations have already been partially flushed to the hardware and can't 
be aborted any more.

> But if you have chosen to cancel the fences, there is nothing to stop
> the signaling.

And just to repeat myself: you can't cancel the fence!

For example, assume that canceling the proxy fence would mean that you 
send a SIGKILL to the process which issued it. But then you need to wait 
for the SIGKILL to be processed.

Now what can happen is that the process is in an uninterruptible wait for 
something which in turn needs the SIGKILL to be delivered -> kernel deadlock.

>> See the difference to a deadlock on the GPU is that you can can always
>> kill a running job or process even if it is stuck with something else.
>> But if the kernel is deadlocked with itself you can't kill the process
>> any more, the only option left to get cleanly out of this is to reboot
>> the kernel.
> However, I say that is under our control. We know what fences are in an
> execution context, just as easily as we know that we are inside an
> execution context. And yes, the easiest, the most restrictive way to
> control it is to say don't bother.

No, that is absolutely not under our control.

dma_fences need to be waited on in a lot of different contexts, 
including the reclaim path as well as MMU notifiers, memory pressure 
callbacks, the OOM killer....

Just see Daniel's patches on the lockdep fence signaling annotations and 
what problems that work bubbled up.
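
For reference, this is roughly what such an annotation looks like with
the dma_fence_begin_signalling()/dma_fence_end_signalling() helpers from
that series (the completion function here is hypothetical):

#include <linux/dma-fence.h>

/* Everything between begin/end is declared to lockdep as a fence
 * signalling critical section, so any wait or allocation in here that
 * could recurse into reclaim is reported as a potential deadlock. */
static void my_work_complete(struct dma_fence *fence)
{
	bool cookie = dma_fence_begin_signalling();

	/* ... no blocking on other fences, no reclaim-capable allocations ... */
	dma_fence_signal(fence);

	dma_fence_end_signalling(cookie);
}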

>> The only way to avoid this would be to never ever wait for the fence in
>> the kernel and then your whole construct is not useful any more.
> I advocate for moving as much as is feasible, for some waits are required
> by userspace as a necessary evil, into the parallelised pipeline.
>
>> I'm running out of ideas how to explain what the problem is here....
> Oh we agree on the problem, we appear to disagree that the implicit waits
> themselves are a serious existent problem. That they are worth effort to
> avoid or, at least, mitigate.

No, as far as I can see you don't seem to understand either the problem 
or its implications.

The only way to solve this would be to audit the whole Linux kernel and 
remove all uninterruptible waits, and that is not feasible.

As long as you don't provide me with a working solution to the problem 
I've outlined here, the whole approach is a clear NAK, since it will allow 
really bad kernel deadlocks to be created.

Sorry to say it, but this whole thing doesn't look like it has been 
thought through to the end. You should probably take a step back and talk 
to Daniel here.

Regards,
Christian.

> -Chris


* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-26 11:35                                     ` Christian König
@ 2020-06-26 13:08                                       ` Chris Wilson
  2020-06-26 17:44                                         ` Christian König
  0 siblings, 1 reply; 48+ messages in thread
From: Chris Wilson @ 2020-06-26 13:08 UTC (permalink / raw)
  To: Christian König, Christian König, Dave Airlie
  Cc: Intel Graphics Development

Quoting Christian König (2020-06-26 12:35:30)
> Am 26.06.20 um 13:10 schrieb Chris Wilson:
> > Quoting Christian König (2020-06-26 09:54:19)
> > [SNIP]
> >> In other words "fence -> userspace -> page fault -> fence" or "fence ->
> >> userspace -> system call -> fence" can easily cause the same problem and
> >> that is not avoidable.
> >>
> >>> An example
> >>>
> >>> Thread A                              Thread B
> >>>
> >>>        submit(VkCmdWaitEvents)
> >>>        recvfrom(ThreadB)       ...     sendto(ThreadB)
> >>>                                        \- alloc_page
> >>>                                         \- direct reclaim
> >>>                                          \- dma_fence_wait(A)
> >>>        VkSetEvent()
> >>>
> >>> Regardless of that actual deadlock, waiting on an arbitrary fence incurs
> >>> an unbounded latency which is unacceptable for direct reclaim.
> >>>
> >>> Online debugging can indefinitely suspend fence signaling, and the only
> >>> guarantee we make of forward progress, in some cases, is process
> >>> termination.
> >> And exactly that is what doesn't work. You don't have any forward
> >> progress any more because you ran into a software deadlock.
> > Only one side is halted. Everything on that side comes to a grinding
> > halt.
> >
> > What about checkpoint/restore, suspend/resume? Where we need to suspend
> > all execution, move all the resources to one side, then put everything
> > back, without cancelling the fences. Same halting problem, no?
> 
> What are you talking about? Of course we either wait for all fences to 
> complete or cancel them on suspend.

I do not want to have to cancel incomplete fences as we do today.
I want to restore the suspended execution back to waiting on its
VkEvent.

> > We also do similar for resets. Suspend the hanging context, move it and
> > all dependent execution off to one side; record what we can, clean up
> > what we have to, then move what remains of the execution back to finish
> > signaling.
> 
> Yes, but this is not possible in this situation. In the bad case you 
> have a kernel deadlock and that can't be cleaned up in any way.

Fences are not disturbed in this process.
> 
> The only solution left in that situation is to reset the system or at 
> least reload the kernel and that is not acceptable.
> 
> >> In other words the signaling of a fence depends on the welfare of
> >> userspace. You can try to kill userspace, but this can wait for the
> >> fence you try to signal in the first place.
> > The only scenario that fits what you are describing here [userspace
> > ignoring a signal] is if you used an uninterruptible wait. Under what
> > circumstances during normal execution would you do that? If it's
> > someone else's wait, a bug outside of our control.
> 
> Uninterruptible waits are a necessity.
> 
> Just take a look at the dma_fence_wait() interface. Why to you think we 
> have ability to wait uninterruptible there?
>
> We need this when there is no other way of recovering. For example when 
> operations are already partially flushed to the hardware and can't be 
> aborted any more.

So why wait in the middle of submission, rather than defer the submission
to the fence callback if the HW wasn't ready? You then have your
uninterruptible continuation.

> > But if you have chosen to cancel the fences, there is nothing to stop
> > the signaling.
> 
> And just to repeat myself: You can't cancel the fence!
> 
> For example assume that canceling the proxy fence would mean that you 
> send a SIGKILL to the process which issued it. But then you need to wait 
> for the SIGKILL to be processed.

What? Where does SIGKILL come from for fence handling?

The proxy fence is force-signaled with an error state (e.g. -ETIMEDOUT);
every waiter then inherits the error state, as do all of their waiters down
the chain. Those waiters are now presumably ready to finish their own
signaling.

The proxy fence is constructed to always complete if it does not get
resolved; after resolution, the onus is on the real fence to complete.

The same as handling any other error or context cancellation during
fence submission.
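
A minimal sketch of that force-completion, using only the core dma-fence
helpers (the timeout hook itself is hypothetical):

#include <linux/dma-fence.h>
#include <linux/errno.h>

/* Force-complete an unresolved proxy: record the error first, then
 * signal, so every waiter down the chain observes fence->error. */
static void proxy_fence_timeout(struct dma_fence *proxy)
{
	dma_fence_set_error(proxy, -ETIMEDOUT);
	dma_fence_signal(proxy);
}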
 
> Now what can happen is that the process is uninterruptible waiting for 
> something which then needs the SIGKILL to be delivered -> kernel deadlock.
> 
> >> See the difference to a deadlock on the GPU is that you can can always
> >> kill a running job or process even if it is stuck with something else.
> >> But if the kernel is deadlocked with itself you can't kill the process
> >> any more, the only option left to get cleanly out of this is to reboot
> >> the kernel.
> > However, I say that is under our control. We know what fences are in an
> > execution context, just as easily as we know that we are inside an
> > execution context. And yes, the easiest, the most restrictive way to
> > control it is to say don't bother.
> 
> No, that is absolutely not under our control.
> 
> dma_fences need to be waited on under a lot of different context, 
> including the reclaim path as well as the MMU notifiers, memory pressure 
> callbacks, OOM killer....

Oh yes, they are under our control. That list boils down to reclaim,
since mmu notifiers outside of reclaim are outside of a nested context.

That in particular is the same old question as whether GFP_IO should be
a gfp_t or in the task_struct. If we are inside an execution context, we
can track that and the fences on the task_struct if we wanted to,
avoiding reclaim of fences being used by the outer context and their
descendants...

But as we have stated multiple times now, and as I thought you had
agreed with for the VkEvents example, one cannot wait inside direct
reclaim. Not least because the latency in doing so impacts other
users, sometimes severely.

Which pushes the burden of work onto kswapd to make objects reclaimable,
and onto the driver in general to not hold onto objects beyond their use.

> Just see Daniels patches on the lockdep fence signaling annotation and 
> what this work bubbled up on problems.
> 
> >> The only way to avoid this would be to never ever wait for the fence in
> >> the kernel and then your whole construct is not useful any more.
> > I advocate for moving as much as is feasible, for some waits are required
> > by userspace as a necessary evil, into the parallelised pipeline.
> >
> >> I'm running out of ideas how to explain what the problem is here....
> > Oh we agree on the problem, we appear to disagree that the implicit waits
> > themselves are a serious existent problem. That they are worth effort to
> > avoid or, at least, mitigate.
> 
> No, as far as I can see you don't seem to either understand the problem 
> or the implications of it.
> 
> The only way to solve this would be to audit the whole Linux kernel and 
> remove all uninterruptible waits and that is not feasible.
> 
> As long as you don't provide me with a working solution to the problem 
> I've outlined here the whole approach is a clear NAK since it will allow 
> to create really bad kernel deadlocks.

You are confusing multiple things here. The VkEvents example is real.
How do you avoid that deadlock? We avoid it by not waiting in direct
reclaim.

It has also shown up any waits in our submit ioctl [prior to fence
publication, I might add] for their potential to deadlock with userspace.
-Chris

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class
  2020-06-26 13:08                                       ` Chris Wilson
@ 2020-06-26 17:44                                         ` Christian König
  0 siblings, 0 replies; 48+ messages in thread
From: Christian König @ 2020-06-26 17:44 UTC (permalink / raw)
  To: Chris Wilson, Christian König, Dave Airlie
  Cc: Intel Graphics Development

Am 26.06.20 um 15:08 schrieb Chris Wilson:
> Quoting Christian König (2020-06-26 12:35:30)
>> Am 26.06.20 um 13:10 schrieb Chris Wilson:
>>> Quoting Christian König (2020-06-26 09:54:19)
>>> [SNIP]

>>> What about checkpoint/restore, suspend/resume? Where we need to suspend
>>> all execution, move all the resources to one side, then put everything
>>> back, without cancelling the fences. Same halting problem, no?
>> What are you talking about? Of course we either wait for all fences to
>> complete or cancel them on suspend.
> I do not want to have to cancel incomplete fences as we do today.

But this is a necessity. Putting half-executed fences aside and
restarting them later is not possible, and most likely never will be.

>> So why wait in the middle of submission, rather than defer the submission
>> to the fence callback if the HW wasn't ready? You then have your
>> uninterruptible continuation.

Because you don't wait in the middle of the submission, but rather
before the submission is made and before any resources or locks are
acquired.

That's also the reason why it is illegal to wait for a fence to appear
while holding a reservation lock, and it is also what lockdep should be
able to point out.

See amdgpu_cs_ioctl() for an example of why this is necessary:

         r = amdgpu_cs_dependencies(adev, &parser);
...
         r = amdgpu_cs_parser_bos(&parser, data);

amdgpu_cs_dependencies() waits for the wait-before-signal fences to
appear, and amdgpu_cs_parser_bos() grabs the reservation locks.

Do it the other way around and lockdep should at least splat that this
has deadlock potential.
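
In sketch form the only safe ordering is the following; all names below
are placeholders, not the actual amdgpu code:

static int sketch_submit(struct sketch_job *job)
{
        int err;

        /*
         * Step 1: resolve dependencies. This may block waiting for
         * wait-before-signal fences to appear, so it must happen before
         * any reservation lock is taken (cf. amdgpu_cs_dependencies()).
         */
        err = sketch_wait_for_dependencies(job);
        if (err)
                return err;

        /*
         * Step 2: only now grab the dma_resv locks of all the BOs
         * (cf. amdgpu_cs_parser_bos()).
         */
        err = sketch_lock_objects(job);
        if (err)
                return err;

        /*
         * Swap steps 1 and 2 and you wait for a fence that does not yet
         * exist while holding reservation locks -- the deadlock lockdep
         * should splat about.
         */
        return sketch_queue(job);
}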

And you are running into exactly the same case here as well, just a bit
more complicated because userspace is involved.

>>> But if you have chosen to cancel the fences, there is nothing to stop
>>> the signaling.
>> And just to repeat myself: You can't cancel the fence!
>>
>> For example assume that canceling the proxy fence would mean that you
>> send a SIGKILL to the process which issued it. But then you need to wait
>> for the SIGKILL to be processed.
> What? Where does SIGKILL come from for fence handling?

Sorry, that was just an example of how it could be handled. A lock or
an event is also possible.

> The proxy fence is force signaled in an error state (e.g. -ETIMEDOUT),
> every waiter then inherits the error state and all of their waiters down
> the chain. Those waiters are now presumably ready to finish their own
> signaling.

That alone is illegal. See, currently fences are only allowed to signal
once all their previous dependencies have signaled, even in the error
case.

This is because we replace all the fences in a dma_resv object when we 
add a new exclusive one.
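
As a sketch of that rule, assuming a hypothetical deps[] array of
dependency fences (the dma_fence_* calls are the real API): even an
error completion has to wait for its dependencies first.

static void sketch_cancel_with_error(struct dma_fence *fence,
                                     struct dma_fence **deps, unsigned int n)
{
        unsigned int i;

        /* The dependencies must have signaled before @fence may signal,
         * even in the error case, because dma_resv replaces all fences
         * when a new exclusive one is added.
         */
        for (i = 0; i < n; i++)
                dma_fence_wait(deps[i], false);

        dma_fence_set_error(fence, -ECANCELED);
        dma_fence_signal(fence);
}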

> The proxy fence is constructed to always complete if it does not get
> resolved; after resolution, the onus is on the real fence to complete.

But then it is not useful at all. See, in this case you can't wait on
the proxy fence at all.

In other words, when you try to wait and the underlying real submission
has not yet appeared, you must return with an error immediately.
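
I.e. something along these lines, where sketch_is_unresolved_proxy() is
a made-up helper and dma_fence_wait_timeout() is the real call:

static long sketch_wait(struct dma_fence *fence, long timeout)
{
        /* The real submission has not appeared yet: do not block on
         * userspace, fail the wait immediately instead.
         */
        if (sketch_is_unresolved_proxy(fence))
                return -EINVAL;

        return dma_fence_wait_timeout(fence, true, timeout);
}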

>>> However, I say that is under our control. We know what fences are in an
>>> execution context, just as easily as we know that we are inside an
>>> execution context. And yes, the easiest, the most restrictive way to
>>> control it is to say don't bother.
>> No, that is absolutely not under our control.
>>
>> dma_fences need to be waited on under a lot of different context,
>> including the reclaim path as well as the MMU notifiers, memory pressure
>> callbacks, OOM killer....
> Oh yes, they are under our control. That list boils down to reclaim,
> since mmu notifiers outside of reclaim are outside of a nested context.

Nested context is irrelevant here. Let's take the following example:

We use dma_fence_proxy because userspace wants to do a delayed submission.

This dma_fence_proxy is attached to a dma_resv object because we need 
the implicit dependency for DRI2/DRI3 handling.

Now the process calls fork() and an MMU notifier is triggered. This MMU
notifier then waits for the dma_resv object's fences to complete.

But for the fences to complete, the fork() call needs to complete first
-> deadlock.
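
Spelled out as a dependency chain (not real code, just the cycle):

/*
 *   fork()                     -> MMU notifier invalidation fires
 *   MMU notifier               -> waits on the dma_resv fences
 *   dma_resv exclusive fence   =  the unresolved dma_fence_proxy
 *   resolving the proxy        -> needs the userspace process, which is
 *                                 itself still blocked inside fork()
 *   => circular wait, kernel deadlock.
 */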

> That in particular is the same old question as whether GFP_IO should be
> a gfp_t argument or a flag in the task_struct. If we are inside an
> execution context, we could track that context, and the fences it owns,
> on the task_struct, avoiding reclaim of fences being used by the outer
> context and its descendants...

Oh, yes that is correct and an absolutely brilliant example of why this 
doesn't work :D

See, the difference is that in this case userspace is involved.

In other words, in your example you would set the GFP_IO flag in the
task_struct, return from your IOCTL, and wait for the next IOCTL to
clear it again.

And that in turn is not something we can do.
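
Spelled out, the shape would be something like this sketch (the helpers
around current are hypothetical), and exactly that is what we cannot
allow:

static long sketch_begin_submit_ioctl(struct drm_file *file)
{
        /* Scope opened on behalf of userspace... */
        sketch_enter_execution_scope(current);
        return 0;       /* ...and we return to userspace with it still open */
}

static long sketch_end_submit_ioctl(struct drm_file *file)
{
        /* ...relying on userspace ever making this second call. */
        sketch_exit_execution_scope(current);
        return 0;
}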

>> No, as far as I can see you don't seem to either understand the problem
>> or the implications of it.
>>
>> The only way to solve this would be to audit the whole Linux kernel and
>> remove all uninterruptible waits and that is not feasible.
>>
>> As long as you don't provide me with a working solution to the problem
>> I've outlined here the whole approach is a clear NAK since it will allow
>> to create really bad kernel deadlocks.
> You are confusing multiple things here. The VkEvents example is real.
> How do you avoid that deadlock? We avoid it by not waiting in direct
> reclaim.

I'm perfectly aware of what you are trying to do here, because the AMD
engineers have suggested and tried the exact same thing. And yes, we
have already rejected that as well.

> It has also shown up any waits in our submit ioctl [prior to fence
> publication, I might add] for their potential to deadlock with
> userspace.

No, that approach is provably deadlock-free.

See, as I explained with the amdgpu_cs example above, it is as simple
as waiting for the fences to appear without holding any
memory-management-relevant locks.

As soon as you let waiting for fences to appear leak into other parts
of the kernel, you make those parts depend on the welfare of the
userspace process, and that is what doesn't work.

Sorry for insisting so much on this, but we have already tried this
approach, discussed it more than once, and it really does not work
correctly.

Regards,
Christian.

> -Chris


Thread overview: 48+ messages
2020-06-22  9:59 [Intel-gfx] [PATCH 1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class Chris Wilson
2020-06-22  9:59 ` [Intel-gfx] [PATCH 2/7] drm/i915: Reuse the reservation_ww_class for acquiring vma backing storage Chris Wilson
2020-06-22  9:59 ` [Intel-gfx] [PATCH 3/7] drm/i915/gem: Track the fences for object allocations Chris Wilson
2020-06-22  9:59 ` [Intel-gfx] [PATCH 4/7] drm/i915: Update vma to use async page allocations Chris Wilson
2020-06-22  9:59 ` [Intel-gfx] [PATCH 5/7] drm/i915/gem: Convert the userptr-worker to use a fence Chris Wilson
2020-06-22  9:59 ` [Intel-gfx] [PATCH 6/7] drm/i915/gem: Break apart the early i915_vma_pin from execbuf object lookup Chris Wilson
2020-06-22  9:59 ` [Intel-gfx] [PATCH 7/7] drm/i915/gem: Acquire all vma/objects under reservation_ww_class Chris Wilson
2020-06-23  9:33   ` Thomas Hellström (Intel)
2020-06-23 10:03     ` Chris Wilson
2020-06-23 15:37       ` Thomas Hellström (Intel)
2020-06-23 16:37         ` Chris Wilson
2020-06-23 21:01       ` Dave Airlie
2020-06-23 21:19         ` Chris Wilson
2020-06-24 19:04           ` Dave Airlie
2020-06-24 20:18             ` Chris Wilson
2020-06-25  8:11               ` Christian König
2020-06-25 12:48                 ` Chris Wilson
2020-06-25 12:59                   ` Christian König
2020-06-25 13:23                     ` Chris Wilson
2020-06-25 14:02                       ` Christian König
2020-06-25 15:10                         ` Chris Wilson
2020-06-25 15:47                           ` Christian König
2020-06-25 17:42                             ` Chris Wilson
2020-06-26  8:10                               ` Chris Wilson
2020-06-26  8:54                                 ` Christian König
2020-06-26  8:56                                   ` Christian König
2020-06-26 11:10                                   ` Chris Wilson
2020-06-26 11:35                                     ` Christian König
2020-06-26 13:08                                       ` Chris Wilson
2020-06-26 17:44                                         ` Christian König
2020-06-23 11:22   ` Thomas Hellström (Intel)
2020-06-23 12:57     ` Thomas Hellström (Intel)
2020-06-23 14:01       ` Chris Wilson
2020-06-23 15:09         ` Thomas Hellström (Intel)
2020-06-23 16:00           ` Chris Wilson
2020-06-23 16:17           ` Chris Wilson
2020-06-23 16:29             ` Thomas Hellström (Intel)
2020-06-23 16:46               ` Chris Wilson
2020-06-23 16:36     ` Chris Wilson
2020-06-23 18:21       ` Thomas Hellström (Intel)
2020-06-23 18:41         ` Chris Wilson
2020-06-23 20:31           ` Thomas Hellström (Intel)
2020-06-23 21:15             ` Chris Wilson
2020-06-24  5:42               ` Thomas Hellström (Intel)
2020-06-24  8:08                 ` Chris Wilson
2020-06-24  9:50                   ` Thomas Hellström (Intel)
2020-06-24 10:48                     ` Chris Wilson
2020-06-22 10:45 ` [Intel-gfx] ✗ Fi.CI.BUILD: failure for series starting with [1/7] drm/i915/gem: Replace i915_gem_object.mm.mutex with reservation_ww_class Patchwork
