All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] Add DMA_RESV_USAGE flags
@ 2021-05-17 14:11 Christian König
  2021-05-17 14:11 ` [PATCH 01/11] dma-buf: fix invalid debug print Christian König
                   ` (11 more replies)
  0 siblings, 12 replies; 50+ messages in thread
From: Christian König @ 2021-05-17 14:11 UTC (permalink / raw)
  To: dri-devel, linaro-mm-sig

We had a long outstanding problem in amdgpu that buffers exported to user drivers by DMA-buf serialize all command submissions using them.

In other words we can't compose the buffer with different engines and then send it to another driver for display or further processing.

This was added to work around the fact that i915 didn't want to wait for shared fences in the dma_resv objects before displaying a buffer.

Since this problem is now causing issues with Vulkan we need to find a better solution for that.

The patch set here tries to do this by adding a usage flag to the shared fences noting when and how they should participate in implicit synchronization.

Please review and/or comment,
Christian.



^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 01/11] dma-buf: fix invalid debug print
  2021-05-17 14:11 [RFC] Add DMA_RESV_USAGE flags Christian König
@ 2021-05-17 14:11 ` Christian König
  2021-05-17 14:11 ` [PATCH 02/11] dma-buf: add SPDX header and fix style in dma-resv.c Christian König
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-17 14:11 UTC (permalink / raw)
  To: dri-devel, linaro-mm-sig

The code tries to acquire the rcu protected fence list, but then ignores
individual fences which have been modified while holding the rcu.

Stop that madness and just note cleanly that
the list was concurrently modified.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-buf.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index f264b70c383e..468c282b8a06 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -1383,22 +1383,17 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 				buf_obj->name ?: "");
 
 		robj = buf_obj->resv;
-		while (true) {
-			seq = read_seqcount_begin(&robj->seq);
-			rcu_read_lock();
-			fobj = rcu_dereference(robj->fence);
-			shared_count = fobj ? fobj->shared_count : 0;
-			fence = rcu_dereference(robj->fence_excl);
-			if (!read_seqcount_retry(&robj->seq, seq))
-				break;
-			rcu_read_unlock();
-		}
-
+		seq = read_seqcount_begin(&robj->seq);
+		rcu_read_lock();
+		fence = rcu_dereference(robj->fence_excl);
 		if (fence)
 			seq_printf(s, "\tExclusive fence: %s %s %ssignalled\n",
 				   fence->ops->get_driver_name(fence),
 				   fence->ops->get_timeline_name(fence),
 				   dma_fence_is_signaled(fence) ? "" : "un");
+
+		fobj = rcu_dereference(robj->fence);
+		shared_count = fobj ? fobj->shared_count : 0;
 		for (i = 0; i < shared_count; i++) {
 			fence = rcu_dereference(fobj->shared[i]);
 			if (!dma_fence_get_rcu(fence))
@@ -1410,6 +1405,8 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 			dma_fence_put(fence);
 		}
 		rcu_read_unlock();
+		if (read_seqcount_retry(&robj->seq, seq))
+			seq_printf(s, "\tFences concurrently modified\n");
 
 		seq_puts(s, "\tAttached Devices:\n");
 		attach_count = 0;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 02/11] dma-buf: add SPDX header and fix style in dma-resv.c
  2021-05-17 14:11 [RFC] Add DMA_RESV_USAGE flags Christian König
  2021-05-17 14:11 ` [PATCH 01/11] dma-buf: fix invalid debug print Christian König
@ 2021-05-17 14:11 ` Christian König
  2021-05-17 14:11 ` [PATCH 03/11] dma-buf: cleanup dma-resv shared fence debugging a bit Christian König
                   ` (9 subsequent siblings)
  11 siblings, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-17 14:11 UTC (permalink / raw)
  To: dri-devel, linaro-mm-sig

No functional change.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-resv.c | 128 +++++++++++++++++++------------------
 1 file changed, 65 insertions(+), 63 deletions(-)

diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index 6ddbeb5dfbf6..87f5d82d992a 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: MIT
 /*
  * Copyright (C) 2012-2014 Canonical Ltd (Maarten Lankhorst)
  *
@@ -92,49 +93,6 @@ static void dma_resv_list_free(struct dma_resv_list *list)
 	kfree_rcu(list, rcu);
 }
 
-#if IS_ENABLED(CONFIG_LOCKDEP)
-static int __init dma_resv_lockdep(void)
-{
-	struct mm_struct *mm = mm_alloc();
-	struct ww_acquire_ctx ctx;
-	struct dma_resv obj;
-	struct address_space mapping;
-	int ret;
-
-	if (!mm)
-		return -ENOMEM;
-
-	dma_resv_init(&obj);
-	address_space_init_once(&mapping);
-
-	mmap_read_lock(mm);
-	ww_acquire_init(&ctx, &reservation_ww_class);
-	ret = dma_resv_lock(&obj, &ctx);
-	if (ret == -EDEADLK)
-		dma_resv_lock_slow(&obj, &ctx);
-	fs_reclaim_acquire(GFP_KERNEL);
-	/* for unmap_mapping_range on trylocked buffer objects in shrinkers */
-	i_mmap_lock_write(&mapping);
-	i_mmap_unlock_write(&mapping);
-#ifdef CONFIG_MMU_NOTIFIER
-	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
-	__dma_fence_might_wait();
-	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
-#else
-	__dma_fence_might_wait();
-#endif
-	fs_reclaim_release(GFP_KERNEL);
-	ww_mutex_unlock(&obj.lock);
-	ww_acquire_fini(&ctx);
-	mmap_read_unlock(mm);
-	
-	mmput(mm);
-
-	return 0;
-}
-subsys_initcall(dma_resv_lockdep);
-#endif
-
 /**
  * dma_resv_init - initialize a reservation object
  * @obj: the reservation object
@@ -196,9 +154,7 @@ int dma_resv_reserve_shared(struct dma_resv *obj, unsigned int num_fences)
 	if (old && old->shared_max) {
 		if ((old->shared_count + num_fences) <= old->shared_max)
 			return 0;
-		else
-			max = max(old->shared_count + num_fences,
-				  old->shared_max * 2);
+		max = max(old->shared_count + num_fences, old->shared_max * 2);
 	} else {
 		max = max(4ul, roundup_pow_of_two(num_fences));
 	}
@@ -337,17 +293,17 @@ void dma_resv_add_excl_fence(struct dma_resv *obj, struct dma_fence *fence)
 EXPORT_SYMBOL(dma_resv_add_excl_fence);
 
 /**
-* dma_resv_copy_fences - Copy all fences from src to dst.
-* @dst: the destination reservation object
-* @src: the source reservation object
-*
-* Copy all fences from src to dst. dst-lock must be held.
-*/
+ * dma_resv_copy_fences - Copy all fences from src to dst.
+ * @dst: the destination reservation object
+ * @src: the source reservation object
+ *
+ * Copy all fences from src to dst. dst-lock must be held.
+ */
 int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src)
 {
 	struct dma_resv_list *src_list, *dst_list;
 	struct dma_fence *old, *new;
-	unsigned i;
+	unsigned int i;
 
 	dma_resv_assert_held(dst);
 
@@ -356,7 +312,7 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src)
 
 retry:
 	if (src_list) {
-		unsigned shared_count = src_list->shared_count;
+		unsigned int shared_count = src_list->shared_count;
 
 		rcu_read_unlock();
 
@@ -373,6 +329,7 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src)
 
 		dst_list->shared_count = 0;
 		for (i = 0; i < src_list->shared_count; ++i) {
+			struct dma_fence __rcu **dst;
 			struct dma_fence *fence;
 
 			fence = rcu_dereference(src_list->shared[i]);
@@ -391,7 +348,8 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src)
 				continue;
 			}
 
-			rcu_assign_pointer(dst_list->shared[dst_list->shared_count++], fence);
+			dst = &dst_list->shared[dst_list->shared_count++];
+			rcu_assign_pointer(*dst, fence);
 		}
 	} else {
 		dst_list = NULL;
@@ -431,7 +389,7 @@ EXPORT_SYMBOL(dma_resv_copy_fences);
  */
 int dma_resv_get_fences_rcu(struct dma_resv *obj,
 			    struct dma_fence **pfence_excl,
-			    unsigned *pshared_count,
+			    unsigned int *pshared_count,
 			    struct dma_fence ***pshared)
 {
 	struct dma_fence **shared = NULL;
@@ -533,9 +491,9 @@ long dma_resv_wait_timeout_rcu(struct dma_resv *obj,
 			       bool wait_all, bool intr,
 			       unsigned long timeout)
 {
-	struct dma_fence *fence;
-	unsigned seq, shared_count;
 	long ret = timeout ? timeout : 1;
+	unsigned int seq, shared_count;
+	struct dma_fence *fence;
 	int i;
 
 retry:
@@ -565,8 +523,9 @@ long dma_resv_wait_timeout_rcu(struct dma_resv *obj,
 			shared_count = fobj->shared_count;
 
 		for (i = 0; !fence && i < shared_count; ++i) {
-			struct dma_fence *lfence = rcu_dereference(fobj->shared[i]);
+			struct dma_fence *lfence;
 
+			lfence = rcu_dereference(fobj->shared[i]);
 			if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 				     &lfence->flags))
 				continue;
@@ -633,7 +592,7 @@ static inline int dma_resv_test_signaled_single(struct dma_fence *passed_fence)
  */
 bool dma_resv_test_signaled_rcu(struct dma_resv *obj, bool test_all)
 {
-	unsigned seq, shared_count;
+	unsigned int seq, shared_count;
 	int ret;
 
 	rcu_read_lock();
@@ -643,16 +602,16 @@ bool dma_resv_test_signaled_rcu(struct dma_resv *obj, bool test_all)
 	seq = read_seqcount_begin(&obj->seq);
 
 	if (test_all) {
-		unsigned i;
-
 		struct dma_resv_list *fobj = rcu_dereference(obj->fence);
+		unsigned int i;
 
 		if (fobj)
 			shared_count = fobj->shared_count;
 
 		for (i = 0; i < shared_count; ++i) {
-			struct dma_fence *fence = rcu_dereference(fobj->shared[i]);
+			struct dma_fence *fence;
 
+			fence = rcu_dereference(fobj->shared[i]);
 			ret = dma_resv_test_signaled_single(fence);
 			if (ret < 0)
 				goto retry;
@@ -681,3 +640,46 @@ bool dma_resv_test_signaled_rcu(struct dma_resv *obj, bool test_all)
 	return ret;
 }
 EXPORT_SYMBOL_GPL(dma_resv_test_signaled_rcu);
+
+#if IS_ENABLED(CONFIG_LOCKDEP)
+static int __init dma_resv_lockdep(void)
+{
+	struct mm_struct *mm = mm_alloc();
+	struct ww_acquire_ctx ctx;
+	struct dma_resv obj;
+	struct address_space mapping;
+	int ret;
+
+	if (!mm)
+		return -ENOMEM;
+
+	dma_resv_init(&obj);
+	address_space_init_once(&mapping);
+
+	mmap_read_lock(mm);
+	ww_acquire_init(&ctx, &reservation_ww_class);
+	ret = dma_resv_lock(&obj, &ctx);
+	if (ret == -EDEADLK)
+		dma_resv_lock_slow(&obj, &ctx);
+	fs_reclaim_acquire(GFP_KERNEL);
+	/* for unmap_mapping_range on trylocked buffer objects in shrinkers */
+	i_mmap_lock_write(&mapping);
+	i_mmap_unlock_write(&mapping);
+#ifdef CONFIG_MMU_NOTIFIER
+	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+	__dma_fence_might_wait();
+	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+#else
+	__dma_fence_might_wait();
+#endif
+	fs_reclaim_release(GFP_KERNEL);
+	ww_mutex_unlock(&obj.lock);
+	ww_acquire_fini(&ctx);
+	mmap_read_unlock(mm);
+
+	mmput(mm);
+
+	return 0;
+}
+subsys_initcall(dma_resv_lockdep);
+#endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 03/11] dma-buf: cleanup dma-resv shared fence debugging a bit
  2021-05-17 14:11 [RFC] Add DMA_RESV_USAGE flags Christian König
  2021-05-17 14:11 ` [PATCH 01/11] dma-buf: fix invalid debug print Christian König
  2021-05-17 14:11 ` [PATCH 02/11] dma-buf: add SPDX header and fix style in dma-resv.c Christian König
@ 2021-05-17 14:11 ` Christian König
  2021-05-17 14:11 ` [PATCH 04/11] dma-buf: rename and cleanup dma_resv_get_excl Christian König
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-17 14:11 UTC (permalink / raw)
  To: dri-devel, linaro-mm-sig

Make that a function instead of inline.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-resv.c | 18 ++++++++++++++++++
 include/linux/dma-resv.h   | 15 +++++++--------
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index 87f5d82d992a..6c6195315e9f 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -208,6 +208,24 @@ int dma_resv_reserve_shared(struct dma_resv *obj, unsigned int num_fences)
 }
 EXPORT_SYMBOL(dma_resv_reserve_shared);
 
+#ifdef CONFIG_DEBUG_MUTEXES
+/**
+ * dma_resv_reset_shared_max - reset shared fences for debugging
+ * @obj: the dma_resv object to reset
+ *
+ * Reset the shared_max to test if drivers do correct slot allocation.
+ */
+void dma_resv_reset_shared_max(struct dma_resv *obj)
+{
+	/* Test shared fence slot reservation */
+	if (rcu_access_pointer(obj->fence)) {
+		struct dma_resv_list *fence = dma_resv_get_list(obj);
+
+		fence->shared_max = fence->shared_count;
+	}
+}
+#endif
+
 /**
  * dma_resv_add_shared_fence - Add a fence to a shared slot
  * @obj: the reservation object
diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index d44a77e8a7e3..f32a3d176513 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -92,6 +92,12 @@ static inline struct dma_resv_list *dma_resv_get_list(struct dma_resv *obj)
 					 dma_resv_held(obj));
 }
 
+#ifdef CONFIG_DEBUG_MUTEXES
+void dma_resv_reset_shared_max(struct dma_resv *obj);
+#else
+static inline void dma_resv_reset_shared_max(struct dma_resv *obj) {}
+#endif
+
 /**
  * dma_resv_lock - lock the reservation object
  * @obj: the reservation object
@@ -215,14 +221,7 @@ static inline struct ww_acquire_ctx *dma_resv_locking_ctx(struct dma_resv *obj)
  */
 static inline void dma_resv_unlock(struct dma_resv *obj)
 {
-#ifdef CONFIG_DEBUG_MUTEXES
-	/* Test shared fence slot reservation */
-	if (rcu_access_pointer(obj->fence)) {
-		struct dma_resv_list *fence = dma_resv_get_list(obj);
-
-		fence->shared_max = fence->shared_count;
-	}
-#endif
+	dma_resv_reset_shared_max(obj);
 	ww_mutex_unlock(&obj->lock);
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 04/11] dma-buf: rename and cleanup dma_resv_get_excl
  2021-05-17 14:11 [RFC] Add DMA_RESV_USAGE flags Christian König
                   ` (2 preceding siblings ...)
  2021-05-17 14:11 ` [PATCH 03/11] dma-buf: cleanup dma-resv shared fence debugging a bit Christian König
@ 2021-05-17 14:11 ` Christian König
  2021-05-17 14:11 ` [PATCH 05/11] dma-buf: rename and cleanup dma_resv_get_list Christian König
                   ` (7 subsequent siblings)
  11 siblings, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-17 14:11 UTC (permalink / raw)
  To: dri-devel, linaro-mm-sig

When the comment needs to state explicitly that this
doesn't get a reference to the object then the function
is named rather badly.

Rename the function and use rcu_dereference_check(), this
way it can be used from both rcu as well as lock protected
critical sections.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-buf.c                |  4 ++--
 drivers/dma-buf/dma-resv.c               | 10 +++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c |  2 +-
 drivers/gpu/drm/etnaviv/etnaviv_gem.c    |  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_busy.c |  3 +--
 drivers/gpu/drm/msm/msm_gem.c            |  4 ++--
 drivers/gpu/drm/nouveau/nouveau_bo.c     |  2 +-
 drivers/gpu/drm/nouveau/nouveau_fence.c  |  2 +-
 drivers/gpu/drm/radeon/radeon_display.c  |  2 +-
 drivers/gpu/drm/radeon/radeon_sync.c     |  2 +-
 drivers/gpu/drm/radeon/radeon_uvd.c      |  2 +-
 drivers/gpu/drm/ttm/ttm_bo.c             |  2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_resource.c |  2 +-
 include/linux/dma-resv.h                 | 13 +++++--------
 15 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 468c282b8a06..5abf6b8c89ac 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -234,7 +234,7 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
 		shared_count = fobj->shared_count;
 	else
 		shared_count = 0;
-	fence_excl = rcu_dereference(resv->fence_excl);
+	fence_excl = dma_resv_exclusive(resv);
 	if (read_seqcount_retry(&resv->seq, seq)) {
 		rcu_read_unlock();
 		goto retry;
@@ -1385,7 +1385,7 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 		robj = buf_obj->resv;
 		seq = read_seqcount_begin(&robj->seq);
 		rcu_read_lock();
-		fence = rcu_dereference(robj->fence_excl);
+		fence = dma_resv_exclusive(robj);
 		if (fence)
 			seq_printf(s, "\tExclusive fence: %s %s %ssignalled\n",
 				   fence->ops->get_driver_name(fence),
diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index 6c6195315e9f..81b032b43457 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -281,7 +281,7 @@ EXPORT_SYMBOL(dma_resv_add_shared_fence);
  */
 void dma_resv_add_excl_fence(struct dma_resv *obj, struct dma_fence *fence)
 {
-	struct dma_fence *old_fence = dma_resv_get_excl(obj);
+	struct dma_fence *old_fence = dma_resv_exclusive(obj);
 	struct dma_resv_list *old;
 	u32 i = 0;
 
@@ -377,7 +377,7 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src)
 	rcu_read_unlock();
 
 	src_list = dma_resv_get_list(dst);
-	old = dma_resv_get_excl(dst);
+	old = dma_resv_exclusive(dst);
 
 	write_seqcount_begin(&dst->seq);
 	/* write_seqcount_begin provides the necessary memory barrier */
@@ -425,7 +425,7 @@ int dma_resv_get_fences_rcu(struct dma_resv *obj,
 		rcu_read_lock();
 		seq = read_seqcount_begin(&obj->seq);
 
-		fence_excl = rcu_dereference(obj->fence_excl);
+		fence_excl = dma_resv_exclusive(obj);
 		if (fence_excl && !dma_fence_get_rcu(fence_excl))
 			goto unlock;
 
@@ -520,7 +520,7 @@ long dma_resv_wait_timeout_rcu(struct dma_resv *obj,
 	rcu_read_lock();
 	i = -1;
 
-	fence = rcu_dereference(obj->fence_excl);
+	fence = dma_resv_exclusive(obj);
 	if (fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
 		if (!dma_fence_get_rcu(fence))
 			goto unlock_retry;
@@ -642,7 +642,7 @@ bool dma_resv_test_signaled_rcu(struct dma_resv *obj, bool test_all)
 	}
 
 	if (!shared_count) {
-		struct dma_fence *fence_excl = rcu_dereference(obj->fence_excl);
+		struct dma_fence *fence_excl = dma_resv_exclusive(obj);
 
 		if (fence_excl) {
 			ret = dma_resv_test_signaled_single(fence_excl);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 18974bd081f0..94da44d97e7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -185,7 +185,7 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
 	if (!amdgpu_vm_ready(vm))
 		goto out_unlock;
 
-	fence = dma_resv_get_excl(bo->tbo.base.resv);
+	fence = dma_resv_exclusive(bo->tbo.base.resv);
 	if (fence) {
 		amdgpu_bo_fence(bo, fence, true);
 		fence = NULL;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
index 4e558632a5d2..c84d5b843985 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -210,7 +210,7 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync,
 		return -EINVAL;
 
 	/* always sync to the exclusive fence */
-	f = dma_resv_get_excl(resv);
+	f = dma_resv_exclusive(resv);
 	r = amdgpu_sync_fence(sync, f);
 
 	flist = dma_resv_get_list(resv);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index db69f19ab5bc..d4f54dea8ac1 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -471,7 +471,7 @@ static void etnaviv_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 		}
 	}
 
-	fence = rcu_dereference(robj->fence_excl);
+	fence = dma_resv_exclusive(robj);
 	if (fence)
 		etnaviv_gem_describe_fence(fence, "Exclusive", m);
 	rcu_read_unlock();
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_busy.c b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
index 25235ef630c1..02312a0c3a36 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_busy.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
@@ -113,8 +113,7 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data,
 	seq = raw_read_seqcount(&obj->base.resv->seq);
 
 	/* Translate the exclusive fence to the READ *and* WRITE engine */
-	args->busy =
-		busy_check_writer(rcu_dereference(obj->base.resv->fence_excl));
+	args->busy = busy_check_writer(dma_resv_exclusive(obj->base.resv));
 
 	/* Translate shared fences to READ set of engines */
 	list = rcu_dereference(obj->base.resv->fence);
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index b199942266a2..99b8cefb0072 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -805,7 +805,7 @@ int msm_gem_sync_object(struct drm_gem_object *obj,
 
 	fobj = dma_resv_get_list(obj->resv);
 	if (!fobj || (fobj->shared_count == 0)) {
-		fence = dma_resv_get_excl(obj->resv);
+		fence = dma_resv_exclusive(obj->resv);
 		/* don't need to wait on our own fences, since ring is fifo */
 		if (fence && (fence->context != fctx->context)) {
 			ret = dma_fence_wait(fence, true);
@@ -1021,7 +1021,7 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m,
 		}
 	}
 
-	fence = rcu_dereference(robj->fence_excl);
+	fence = dma_resv_exclusive(robj);
 	if (fence)
 		describe_fence(fence, "Exclusive", m);
 	rcu_read_unlock();
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 7a2624c0ba4c..ad30a6a100b9 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -955,7 +955,7 @@ nouveau_bo_vm_cleanup(struct ttm_buffer_object *bo,
 {
 	struct nouveau_drm *drm = nouveau_bdev(bo->bdev);
 	struct drm_device *dev = drm->dev;
-	struct dma_fence *fence = dma_resv_get_excl(bo->base.resv);
+	struct dma_fence *fence = dma_resv_exclusive(bo->base.resv);
 
 	nv10_bo_put_tile_region(dev, *old_tile, fence);
 	*old_tile = new_tile;
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index e5dcbf67de7e..a6cb35181aee 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -356,7 +356,7 @@ nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool e
 	}
 
 	fobj = dma_resv_get_list(resv);
-	fence = dma_resv_get_excl(resv);
+	fence = dma_resv_exclusive(resv);
 
 	if (fence && (!exclusive || !fobj || !fobj->shared_count)) {
 		struct nouveau_channel *prev = NULL;
diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
index 652af7a134bd..57c910e5ae77 100644
--- a/drivers/gpu/drm/radeon/radeon_display.c
+++ b/drivers/gpu/drm/radeon/radeon_display.c
@@ -533,7 +533,7 @@ static int radeon_crtc_page_flip_target(struct drm_crtc *crtc,
 		DRM_ERROR("failed to pin new rbo buffer before flip\n");
 		goto cleanup;
 	}
-	work->fence = dma_fence_get(dma_resv_get_excl(new_rbo->tbo.base.resv));
+	work->fence = dma_fence_get(dma_resv_exclusive(new_rbo->tbo.base.resv));
 	radeon_bo_get_tiling_flags(new_rbo, &tiling_flags, NULL);
 	radeon_bo_unreserve(new_rbo);
 
diff --git a/drivers/gpu/drm/radeon/radeon_sync.c b/drivers/gpu/drm/radeon/radeon_sync.c
index 5d3302945076..e476f90ef1c1 100644
--- a/drivers/gpu/drm/radeon/radeon_sync.c
+++ b/drivers/gpu/drm/radeon/radeon_sync.c
@@ -98,7 +98,7 @@ int radeon_sync_resv(struct radeon_device *rdev,
 	int r = 0;
 
 	/* always sync to the exclusive fence */
-	f = dma_resv_get_excl(resv);
+	f = dma_resv_exclusive(resv);
 	fence = f ? to_radeon_fence(f) : NULL;
 	if (fence && fence->rdev == rdev)
 		radeon_sync_fence(sync, fence);
diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c b/drivers/gpu/drm/radeon/radeon_uvd.c
index dfa9fdbe98da..02d4bbdc9111 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -477,7 +477,7 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, struct radeon_bo *bo,
 		return -EINVAL;
 	}
 
-	f = dma_resv_get_excl(bo->tbo.base.resv);
+	f = dma_resv_exclusive(bo->tbo.base.resv);
 	if (f) {
 		r = radeon_fence_wait((struct radeon_fence *)f, false);
 		if (r) {
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index ca1b098b6a56..95fa73ef90fb 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -260,7 +260,7 @@ static void ttm_bo_flush_all_fences(struct ttm_buffer_object *bo)
 
 	rcu_read_lock();
 	fobj = rcu_dereference(resv->fence);
-	fence = rcu_dereference(resv->fence_excl);
+	fence = dma_resv_exclusive(resv);
 	if (fence && !fence->ops->signaled)
 		dma_fence_enable_sw_signaling(fence);
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
index 62ea920addc3..c78f38ee1c20 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
@@ -1166,7 +1166,7 @@ int vmw_resources_clean(struct vmw_buffer_object *vbo, pgoff_t start,
 		if (bo->moving)
 			dma_fence_put(bo->moving);
 		bo->moving = dma_fence_get
-			(dma_resv_get_excl(bo->base.resv));
+			(dma_resv_exclusive(bo->base.resv));
 	}
 
 	return 0;
diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index f32a3d176513..7549ec5eb35c 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -226,22 +226,19 @@ static inline void dma_resv_unlock(struct dma_resv *obj)
 }
 
 /**
- * dma_resv_get_excl - get the reservation object's
- * exclusive fence, with update-side lock held
+ * dma_resv_exclusive - return the object's exclusive fence
  * @obj: the reservation object
  *
- * Returns the exclusive fence (if any).  Does NOT take a
- * reference. Writers must hold obj->lock, readers may only
- * hold a RCU read side lock.
+ * Returns the exclusive fence (if any). Caller must either hold the objects
+ * lock or the rcu read side lock.
  *
  * RETURNS
  * The exclusive fence or NULL
  */
 static inline struct dma_fence *
-dma_resv_get_excl(struct dma_resv *obj)
+dma_resv_exclusive(struct dma_resv *obj)
 {
-	return rcu_dereference_protected(obj->fence_excl,
-					 dma_resv_held(obj));
+	return rcu_dereference_check(obj->fence_excl, dma_resv_held(obj));
 }
 
 /**
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 05/11] dma-buf: rename and cleanup dma_resv_get_list
  2021-05-17 14:11 [RFC] Add DMA_RESV_USAGE flags Christian König
                   ` (3 preceding siblings ...)
  2021-05-17 14:11 ` [PATCH 04/11] dma-buf: rename and cleanup dma_resv_get_excl Christian König
@ 2021-05-17 14:11 ` Christian König
  2021-05-17 14:11 ` [PATCH 06/11] dma-buf: add dma_resv_list_fence helper Christian König
                   ` (6 subsequent siblings)
  11 siblings, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-17 14:11 UTC (permalink / raw)
  To: dri-devel, linaro-mm-sig

When the comment needs to state explicitly that this doesn't get a reference
to the object then the function is named rather badly.

Rename the function and use it in even more places.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-resv.c                    | 32 +++++++++----------
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c      |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c       |  2 +-
 drivers/gpu/drm/etnaviv/etnaviv_gem.c         |  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_busy.c      |  2 +-
 drivers/gpu/drm/msm/msm_gem.c                 |  4 +--
 drivers/gpu/drm/nouveau/nouveau_fence.c       |  2 +-
 drivers/gpu/drm/qxl/qxl_debugfs.c             |  2 +-
 drivers/gpu/drm/radeon/radeon_sync.c          |  2 +-
 drivers/gpu/drm/ttm/ttm_bo.c                  |  2 +-
 include/linux/dma-resv.h                      | 25 +++++++--------
 13 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index 81b032b43457..b1a1a31dc009 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -149,8 +149,7 @@ int dma_resv_reserve_shared(struct dma_resv *obj, unsigned int num_fences)
 
 	dma_resv_assert_held(obj);
 
-	old = dma_resv_get_list(obj);
-
+	old = dma_resv_shared(obj);
 	if (old && old->shared_max) {
 		if ((old->shared_count + num_fences) <= old->shared_max)
 			return 0;
@@ -217,12 +216,13 @@ EXPORT_SYMBOL(dma_resv_reserve_shared);
  */
 void dma_resv_reset_shared_max(struct dma_resv *obj)
 {
-	/* Test shared fence slot reservation */
-	if (rcu_access_pointer(obj->fence)) {
-		struct dma_resv_list *fence = dma_resv_get_list(obj);
+	struct dma_resv_list *fences = dma_resv_shared(obj);
 
-		fence->shared_max = fence->shared_count;
-	}
+	dma_resv_assert_held(obj);
+
+	/* Test shared fence slot reservation */
+	if (fences)
+		fences->shared_max = fences->shared_count;
 }
 #endif
 
@@ -244,7 +244,7 @@ void dma_resv_add_shared_fence(struct dma_resv *obj, struct dma_fence *fence)
 
 	dma_resv_assert_held(obj);
 
-	fobj = dma_resv_get_list(obj);
+	fobj = dma_resv_shared(obj);
 	count = fobj->shared_count;
 
 	write_seqcount_begin(&obj->seq);
@@ -287,7 +287,7 @@ void dma_resv_add_excl_fence(struct dma_resv *obj, struct dma_fence *fence)
 
 	dma_resv_assert_held(obj);
 
-	old = dma_resv_get_list(obj);
+	old = dma_resv_shared(obj);
 	if (old)
 		i = old->shared_count;
 
@@ -326,7 +326,7 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src)
 	dma_resv_assert_held(dst);
 
 	rcu_read_lock();
-	src_list = rcu_dereference(src->fence);
+	src_list = dma_resv_shared(src);
 
 retry:
 	if (src_list) {
@@ -339,7 +339,7 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src)
 			return -ENOMEM;
 
 		rcu_read_lock();
-		src_list = rcu_dereference(src->fence);
+		src_list = dma_resv_shared(src);
 		if (!src_list || src_list->shared_count > shared_count) {
 			kfree(dst_list);
 			goto retry;
@@ -357,7 +357,7 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src)
 
 			if (!dma_fence_get_rcu(fence)) {
 				dma_resv_list_free(dst_list);
-				src_list = rcu_dereference(src->fence);
+				src_list = dma_resv_shared(src);
 				goto retry;
 			}
 
@@ -376,7 +376,7 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src)
 	new = dma_fence_get_rcu_safe(&src->fence_excl);
 	rcu_read_unlock();
 
-	src_list = dma_resv_get_list(dst);
+	src_list = dma_resv_shared(dst);
 	old = dma_resv_exclusive(dst);
 
 	write_seqcount_begin(&dst->seq);
@@ -429,7 +429,7 @@ int dma_resv_get_fences_rcu(struct dma_resv *obj,
 		if (fence_excl && !dma_fence_get_rcu(fence_excl))
 			goto unlock;
 
-		fobj = rcu_dereference(obj->fence);
+		fobj = dma_resv_shared(obj);
 		if (fobj)
 			sz += sizeof(*shared) * fobj->shared_max;
 
@@ -535,7 +535,7 @@ long dma_resv_wait_timeout_rcu(struct dma_resv *obj,
 	}
 
 	if (wait_all) {
-		struct dma_resv_list *fobj = rcu_dereference(obj->fence);
+		struct dma_resv_list *fobj = dma_resv_shared(obj);
 
 		if (fobj)
 			shared_count = fobj->shared_count;
@@ -620,7 +620,7 @@ bool dma_resv_test_signaled_rcu(struct dma_resv *obj, bool test_all)
 	seq = read_seqcount_begin(&obj->seq);
 
 	if (test_all) {
-		struct dma_resv_list *fobj = rcu_dereference(obj->fence);
+		struct dma_resv_list *fobj = dma_resv_shared(obj);
 		unsigned int i;
 
 		if (fobj)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 7d4118c8128a..88da0e400406 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -245,7 +245,7 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
 	if (!ef)
 		return -EINVAL;
 
-	old = dma_resv_get_list(resv);
+	old = dma_resv_shared(resv);
 	if (!old)
 		return 0;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index baa980a477d9..0371947ba96b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -95,7 +95,7 @@ __dma_resv_make_exclusive(struct dma_resv *obj)
 	unsigned int count;
 	int r;
 
-	if (!dma_resv_get_list(obj)) /* no shared fences to convert */
+	if (!dma_resv_shared(obj)) /* no shared fences to convert */
 		return 0;
 
 	r = dma_resv_get_fences_rcu(obj, NULL, &count, &fences);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
index c84d5b843985..c50d9f92a0cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -213,7 +213,7 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync,
 	f = dma_resv_exclusive(resv);
 	r = amdgpu_sync_fence(sync, f);
 
-	flist = dma_resv_get_list(resv);
+	flist = dma_resv_shared(resv);
 	if (!flist || r)
 		return r;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 8c7ec09eb1a4..6ab50810bd54 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1407,7 +1407,7 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
 	 * If true, then return false as any KFD process needs all its BOs to
 	 * be resident to run successfully
 	 */
-	flist = dma_resv_get_list(bo->base.resv);
+	flist = dma_resv_shared(bo->base.resv);
 	if (flist) {
 		for (i = 0; i < flist->shared_count; ++i) {
 			f = rcu_dereference_protected(flist->shared[i],
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index d4f54dea8ac1..4d43b8630f0e 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -461,7 +461,7 @@ static void etnaviv_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 			off, etnaviv_obj->vaddr, obj->size);
 
 	rcu_read_lock();
-	fobj = rcu_dereference(robj->fence);
+	fobj = dma_resv_shared(robj);
 	if (fobj) {
 		unsigned int i, shared_count = fobj->shared_count;
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_busy.c b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
index 02312a0c3a36..3f94becac541 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_busy.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
@@ -116,7 +116,7 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data,
 	args->busy = busy_check_writer(dma_resv_exclusive(obj->base.resv));
 
 	/* Translate shared fences to READ set of engines */
-	list = rcu_dereference(obj->base.resv->fence);
+	list = dma_resv_shared(obj->base.resv);
 	if (list) {
 		unsigned int shared_count = list->shared_count, i;
 
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index 99b8cefb0072..cf3e08b6a910 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -803,7 +803,7 @@ int msm_gem_sync_object(struct drm_gem_object *obj,
 	struct dma_fence *fence;
 	int i, ret;
 
-	fobj = dma_resv_get_list(obj->resv);
+	fobj = dma_resv_shared(obj->resv);
 	if (!fobj || (fobj->shared_count == 0)) {
 		fence = dma_resv_exclusive(obj->resv);
 		/* don't need to wait on our own fences, since ring is fifo */
@@ -1011,7 +1011,7 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m,
 	}
 
 	rcu_read_lock();
-	fobj = rcu_dereference(robj->fence);
+	fobj = dma_resv_shared(robj);
 	if (fobj) {
 		unsigned int i, shared_count = fobj->shared_count;
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index a6cb35181aee..5ce441c655ea 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -355,7 +355,7 @@ nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool e
 			return ret;
 	}
 
-	fobj = dma_resv_get_list(resv);
+	fobj = dma_resv_shared(resv);
 	fence = dma_resv_exclusive(resv);
 
 	if (fence && (!exclusive || !fobj || !fobj->shared_count)) {
diff --git a/drivers/gpu/drm/qxl/qxl_debugfs.c b/drivers/gpu/drm/qxl/qxl_debugfs.c
index 183d15e2cf58..0acc70a6d3dd 100644
--- a/drivers/gpu/drm/qxl/qxl_debugfs.c
+++ b/drivers/gpu/drm/qxl/qxl_debugfs.c
@@ -61,7 +61,7 @@ qxl_debugfs_buffers_info(struct seq_file *m, void *data)
 		int rel;
 
 		rcu_read_lock();
-		fobj = rcu_dereference(bo->tbo.base.resv->fence);
+		fobj = dma_resv_shared(bo->tbo.base.resv);
 		rel = fobj ? fobj->shared_count : 0;
 		rcu_read_unlock();
 
diff --git a/drivers/gpu/drm/radeon/radeon_sync.c b/drivers/gpu/drm/radeon/radeon_sync.c
index e476f90ef1c1..a9cdb88da173 100644
--- a/drivers/gpu/drm/radeon/radeon_sync.c
+++ b/drivers/gpu/drm/radeon/radeon_sync.c
@@ -105,7 +105,7 @@ int radeon_sync_resv(struct radeon_device *rdev,
 	else if (f)
 		r = dma_fence_wait(f, true);
 
-	flist = dma_resv_get_list(resv);
+	flist = dma_resv_shared(resv);
 	if (shared || !flist || r)
 		return r;
 
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 95fa73ef90fb..9d453c2ca800 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -259,7 +259,7 @@ static void ttm_bo_flush_all_fences(struct ttm_buffer_object *bo)
 	int i;
 
 	rcu_read_lock();
-	fobj = rcu_dereference(resv->fence);
+	fobj = dma_resv_shared(resv);
 	fence = dma_resv_exclusive(resv);
 	if (fence && !fence->ops->signaled)
 		dma_fence_enable_sw_signaling(fence);
diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index 7549ec5eb35c..98ac66fecb71 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -78,20 +78,6 @@ struct dma_resv {
 #define dma_resv_held(obj) lockdep_is_held(&(obj)->lock.base)
 #define dma_resv_assert_held(obj) lockdep_assert_held(&(obj)->lock.base)
 
-/**
- * dma_resv_get_list - get the reservation object's
- * shared fence list, with update-side lock held
- * @obj: the reservation object
- *
- * Returns the shared fence list.  Does NOT take references to
- * the fence.  The obj->lock must be held.
- */
-static inline struct dma_resv_list *dma_resv_get_list(struct dma_resv *obj)
-{
-	return rcu_dereference_protected(obj->fence,
-					 dma_resv_held(obj));
-}
-
 #ifdef CONFIG_DEBUG_MUTEXES
 void dma_resv_reset_shared_max(struct dma_resv *obj);
 #else
@@ -267,6 +253,17 @@ dma_resv_get_excl_rcu(struct dma_resv *obj)
 	return fence;
 }
 
+/**
+ * dma_resv_shared - get the reservation object's shared fence list
+ * @obj: the reservation object
+ *
+ * Returns the shared fence list. The obj->lock or rcu read side must be held.
+ */
+static inline struct dma_resv_list *dma_resv_shared(struct dma_resv *obj)
+{
+	return rcu_dereference_check(obj->fence, dma_resv_held(obj));
+}
+
 void dma_resv_init(struct dma_resv *obj);
 void dma_resv_fini(struct dma_resv *obj);
 int dma_resv_reserve_shared(struct dma_resv *obj, unsigned int num_fences);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 06/11] dma-buf: add dma_resv_list_fence helper
  2021-05-17 14:11 [RFC] Add DMA_RESV_USAGE flags Christian König
                   ` (4 preceding siblings ...)
  2021-05-17 14:11 ` [PATCH 05/11] dma-buf: rename and cleanup dma_resv_get_list Christian König
@ 2021-05-17 14:11 ` Christian König
  2021-05-17 14:11 ` [PATCH 07/11] dma-buf: add dma_resv_replace_shared Christian König
                   ` (5 subsequent siblings)
  11 siblings, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-17 14:11 UTC (permalink / raw)
  To: dri-devel, linaro-mm-sig

Instead of repeating the access check over and over again.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-resv.c | 42 +++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index b1a1a31dc009..49f3c1009821 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -93,6 +93,22 @@ static void dma_resv_list_free(struct dma_resv_list *list)
 	kfree_rcu(list, rcu);
 }
 
+/**
+ * dma_resv_list_fence - return fence for index
+ * @obj: the reservation object
+ * @list: list to get the fence from
+ * @idx: index into the fence array
+ *
+ * Return the fence at the specified index double checking that either the rcu
+ * read side or the dma_resv object is held.
+ */
+static struct dma_fence *dma_resv_list_fence(struct dma_resv *obj,
+					     struct dma_resv_list *list,
+					     unsigned int idx)
+{
+	return rcu_dereference_check(list->shared[idx], dma_resv_held(obj));
+}
+
 /**
  * dma_resv_init - initialize a reservation object
  * @obj: the reservation object
@@ -171,8 +187,7 @@ int dma_resv_reserve_shared(struct dma_resv *obj, unsigned int num_fences)
 	for (i = 0, j = 0, k = max; i < (old ? old->shared_count : 0); ++i) {
 		struct dma_fence *fence;
 
-		fence = rcu_dereference_protected(old->shared[i],
-						  dma_resv_held(obj));
+		fence = dma_resv_list_fence(obj, old, i);
 		if (dma_fence_is_signaled(fence))
 			RCU_INIT_POINTER(new->shared[--k], fence);
 		else
@@ -194,13 +209,8 @@ int dma_resv_reserve_shared(struct dma_resv *obj, unsigned int num_fences)
 		return 0;
 
 	/* Drop the references to the signaled fences */
-	for (i = k; i < max; ++i) {
-		struct dma_fence *fence;
-
-		fence = rcu_dereference_protected(new->shared[i],
-						  dma_resv_held(obj));
-		dma_fence_put(fence);
-	}
+	for (i = k; i < max; ++i)
+		dma_fence_put(dma_resv_list_fence(obj, new, i));
 	kfree_rcu(old, rcu);
 
 	return 0;
@@ -251,8 +261,7 @@ void dma_resv_add_shared_fence(struct dma_resv *obj, struct dma_fence *fence)
 
 	for (i = 0; i < count; ++i) {
 
-		old = rcu_dereference_protected(fobj->shared[i],
-						dma_resv_held(obj));
+		old = dma_resv_list_fence(obj, fobj, i);
 		if (old->context == fence->context ||
 		    dma_fence_is_signaled(old))
 			goto replace;
@@ -303,8 +312,7 @@ void dma_resv_add_excl_fence(struct dma_resv *obj, struct dma_fence *fence)
 
 	/* inplace update, no shared fences */
 	while (i--)
-		dma_fence_put(rcu_dereference_protected(old->shared[i],
-						dma_resv_held(obj)));
+		dma_fence_put(dma_resv_list_fence(obj, old, i));
 
 	dma_fence_put(old_fence);
 }
@@ -350,7 +358,7 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src)
 			struct dma_fence __rcu **dst;
 			struct dma_fence *fence;
 
-			fence = rcu_dereference(src_list->shared[i]);
+			fence = dma_resv_list_fence(src, src_list, i);
 			if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 				     &fence->flags))
 				continue;
@@ -459,7 +467,7 @@ int dma_resv_get_fences_rcu(struct dma_resv *obj,
 			shared = nshared;
 			shared_count = fobj ? fobj->shared_count : 0;
 			for (i = 0; i < shared_count; ++i) {
-				shared[i] = rcu_dereference(fobj->shared[i]);
+				shared[i] = dma_resv_list_fence(obj, fobj, i);
 				if (!dma_fence_get_rcu(shared[i]))
 					break;
 			}
@@ -543,7 +551,7 @@ long dma_resv_wait_timeout_rcu(struct dma_resv *obj,
 		for (i = 0; !fence && i < shared_count; ++i) {
 			struct dma_fence *lfence;
 
-			lfence = rcu_dereference(fobj->shared[i]);
+			lfence = dma_resv_list_fence(obj, fobj, i);
 			if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 				     &lfence->flags))
 				continue;
@@ -629,7 +637,7 @@ bool dma_resv_test_signaled_rcu(struct dma_resv *obj, bool test_all)
 		for (i = 0; i < shared_count; ++i) {
 			struct dma_fence *fence;
 
-			fence = rcu_dereference(fobj->shared[i]);
+			fence = dma_resv_list_fence(obj, fobj, i);
 			ret = dma_resv_test_signaled_single(fence);
 			if (ret < 0)
 				goto retry;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 07/11] dma-buf: add dma_resv_replace_shared
  2021-05-17 14:11 [RFC] Add DMA_RESV_USAGE flags Christian König
                   ` (5 preceding siblings ...)
  2021-05-17 14:11 ` [PATCH 06/11] dma-buf: add dma_resv_list_fence helper Christian König
@ 2021-05-17 14:11 ` Christian König
  2021-05-17 14:11 ` [PATCH 08/11] dma-buf: improve shared fence abstraction Christian König
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-17 14:11 UTC (permalink / raw)
  To: dri-devel, linaro-mm-sig

This function allows replacing fences in the shared fence list when
we can guarantee that the operation represented by the original fence
has finished, or that it no longer accesses the resources protected by
the dma_resv object once the new fence finishes.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-resv.c                    | 38 +++++++++++++++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 47 ++-----------------
 include/linux/dma-resv.h                      |  2 +
 3 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index 49f3c1009821..5703e328b8ac 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -281,6 +281,44 @@ void dma_resv_add_shared_fence(struct dma_resv *obj, struct dma_fence *fence)
 }
 EXPORT_SYMBOL(dma_resv_add_shared_fence);
 
+/**
+ * dma_resv_replace_shared - replace shared fences
+ * @obj: the reservation object
+ * @context: the context of the fences to replace
+ * @fence: the new fence to use instead
+ *
+ * Replace fences with a specified context with a new fence. Only valid if the
+ * operation represented by the original fences is completed or has no longer
+ * access to the resources protected by the dma_resv object when the new fence
+ * completes. Takes the reference to the new fence.
+ */
+void dma_resv_replace_shared(struct dma_resv *obj, uint64_t context,
+			     struct dma_fence *fence)
+{
+	struct dma_resv_list *list;
+	unsigned int i;
+
+	list = dma_resv_shared(obj);
+	if (!list) {
+		dma_fence_put(fence);
+		return;
+	}
+
+	write_seqcount_begin(&obj->seq);
+	for (i = 0; i < list->shared_count; ++i) {
+		struct dma_fence *old = dma_resv_list_fence(obj, list, i);
+
+		if (old->context != context)
+			continue;
+
+		rcu_assign_pointer(list->shared[i], dma_fence_get(fence));
+		dma_fence_put(old);
+	}
+	write_seqcount_end(&obj->seq);
+	dma_fence_put(fence);
+}
+EXPORT_SYMBOL(dma_resv_replace_shared);
+
 /**
  * dma_resv_add_excl_fence - Add an exclusive fence.
  * @obj: the reservation object
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 88da0e400406..3e5a681a5482 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -238,53 +238,14 @@ void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo)
 static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
 					struct amdgpu_amdkfd_fence *ef)
 {
-	struct dma_resv *resv = bo->tbo.base.resv;
-	struct dma_resv_list *old, *new;
-	unsigned int i, j, k;
-
 	if (!ef)
 		return -EINVAL;
 
-	old = dma_resv_shared(resv);
-	if (!old)
-		return 0;
-
-	new = kmalloc(struct_size(new, shared, old->shared_max), GFP_KERNEL);
-	if (!new)
-		return -ENOMEM;
-
-	/* Go through all the shared fences in the resevation object and sort
-	 * the interesting ones to the end of the list.
+	/* TODO: Instead of block before we should use the fence of the page
+	 * table update and TLB flush here directly.
 	 */
-	for (i = 0, j = old->shared_count, k = 0; i < old->shared_count; ++i) {
-		struct dma_fence *f;
-
-		f = rcu_dereference_protected(old->shared[i],
-					      dma_resv_held(resv));
-
-		if (f->context == ef->base.context)
-			RCU_INIT_POINTER(new->shared[--j], f);
-		else
-			RCU_INIT_POINTER(new->shared[k++], f);
-	}
-	new->shared_max = old->shared_max;
-	new->shared_count = k;
-
-	/* Install the new fence list, seqcount provides the barriers */
-	write_seqcount_begin(&resv->seq);
-	RCU_INIT_POINTER(resv->fence, new);
-	write_seqcount_end(&resv->seq);
-
-	/* Drop the references to the removed fences or move them to ef_list */
-	for (i = j, k = 0; i < old->shared_count; ++i) {
-		struct dma_fence *f;
-
-		f = rcu_dereference_protected(new->shared[i],
-					      dma_resv_held(resv));
-		dma_fence_put(f);
-	}
-	kfree_rcu(old, rcu);
-
+	dma_resv_replace_shared(bo->tbo.base.resv, ef->base.context,
+				dma_fence_get_stub());
 	return 0;
 }
 
diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index 98ac66fecb71..74b217b82f39 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -268,6 +268,8 @@ void dma_resv_init(struct dma_resv *obj);
 void dma_resv_fini(struct dma_resv *obj);
 int dma_resv_reserve_shared(struct dma_resv *obj, unsigned int num_fences);
 void dma_resv_add_shared_fence(struct dma_resv *obj, struct dma_fence *fence);
+void dma_resv_replace_shared(struct dma_resv *obj, uint64_t context,
+			     struct dma_fence *fence);
 
 void dma_resv_add_excl_fence(struct dma_resv *obj, struct dma_fence *fence);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 08/11] dma-buf: improve shared fence abstraction
  2021-05-17 14:11 [RFC] Add DMA_RESV_USAGE flags Christian König
                   ` (6 preceding siblings ...)
  2021-05-17 14:11 ` [PATCH 07/11] dma-buf: add dma_resv_replace_shared Christian König
@ 2021-05-17 14:11 ` Christian König
  2021-05-17 14:11 ` [PATCH 09/11] dma-buf: add shared fence usage flags Christian König
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-17 14:11 UTC (permalink / raw)
  To: dri-devel, linaro-mm-sig

Put access to the shared fences behind an iterator.
This way we don't need to expose the internal implementation any more.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-buf.c                   | 46 ++++++----------
 drivers/dma-buf/dma-resv.c                  | 61 +++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c |  9 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c    | 14 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c     | 14 ++---
 drivers/gpu/drm/etnaviv/etnaviv_gem.c       | 15 ++---
 drivers/gpu/drm/i915/gem/i915_gem_busy.c    | 16 ++----
 drivers/gpu/drm/nouveau/nouveau_fence.c     | 24 ++++----
 drivers/gpu/drm/qxl/qxl_debugfs.c           | 10 ++--
 drivers/gpu/drm/radeon/radeon_sync.c        | 14 ++---
 drivers/gpu/drm/ttm/ttm_bo.c                | 15 ++---
 include/linux/dma-resv.h                    | 40 ++++++--------
 12 files changed, 140 insertions(+), 138 deletions(-)

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 5abf6b8c89ac..c51c1fca4c1b 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -206,12 +206,12 @@ static void dma_buf_poll_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
 
 static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
 {
+	struct dma_fence *fence_excl, *fence_shared;
+	struct dma_resv_cursor cursor;
 	struct dma_buf *dmabuf;
 	struct dma_resv *resv;
-	struct dma_resv_list *fobj;
-	struct dma_fence *fence_excl;
 	__poll_t events;
-	unsigned shared_count, seq;
+	unsigned seq;
 
 	dmabuf = file->private_data;
 	if (!dmabuf || !dmabuf->resv)
@@ -229,22 +229,18 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
 	seq = read_seqcount_begin(&resv->seq);
 	rcu_read_lock();
 
-	fobj = rcu_dereference(resv->fence);
-	if (fobj)
-		shared_count = fobj->shared_count;
-	else
-		shared_count = 0;
+	fence_shared = dma_resv_first_shared(resv, &cursor);
 	fence_excl = dma_resv_exclusive(resv);
 	if (read_seqcount_retry(&resv->seq, seq)) {
 		rcu_read_unlock();
 		goto retry;
 	}
 
-	if (fence_excl && (!(events & EPOLLOUT) || shared_count == 0)) {
+	if (fence_excl && (!(events & EPOLLOUT) || !fence_shared)) {
 		struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_excl;
 		__poll_t pevents = EPOLLIN;
 
-		if (shared_count == 0)
+		if (fence_shared)
 			pevents |= EPOLLOUT;
 
 		spin_lock_irq(&dmabuf->poll.lock);
@@ -275,9 +271,8 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
 		}
 	}
 
-	if ((events & EPOLLOUT) && shared_count > 0) {
+	if ((events & EPOLLOUT) && fence_shared) {
 		struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_shared;
-		int i;
 
 		/* Only queue a new callback if no event has fired yet */
 		spin_lock_irq(&dmabuf->poll.lock);
@@ -290,13 +285,11 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
 		if (!(events & EPOLLOUT))
 			goto out;
 
-		for (i = 0; i < shared_count; ++i) {
-			struct dma_fence *fence = rcu_dereference(fobj->shared[i]);
-
-			if (!dma_fence_get_rcu(fence)) {
+		do {
+			if (!dma_fence_get_rcu(fence_shared)) {
 				/*
 				 * fence refcount dropped to zero, this means
-				 * that fobj has been freed
+				 * that the shared fence has been freed
 				 *
 				 * call dma_buf_poll_cb and force a recheck!
 				 */
@@ -304,17 +297,17 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
 				dma_buf_poll_cb(NULL, &dcb->cb);
 				break;
 			}
-			if (!dma_fence_add_callback(fence, &dcb->cb,
+			if (!dma_fence_add_callback(fence_shared, &dcb->cb,
 						    dma_buf_poll_cb)) {
-				dma_fence_put(fence);
+				dma_fence_put(fence_shared);
 				events &= ~EPOLLOUT;
 				break;
 			}
-			dma_fence_put(fence);
-		}
+			dma_fence_put(fence_shared);
+		} while ((fence_shared = dma_resv_next_shared(resv, &cursor)));
 
 		/* No callback queued, wake up any additional waiters. */
-		if (i == shared_count)
+		if (!fence_shared)
 			dma_buf_poll_cb(NULL, &dcb->cb);
 	}
 
@@ -1353,10 +1346,9 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 	struct dma_buf *buf_obj;
 	struct dma_buf_attachment *attach_obj;
 	struct dma_resv *robj;
-	struct dma_resv_list *fobj;
 	struct dma_fence *fence;
 	unsigned seq;
-	int count = 0, attach_count, shared_count, i;
+	int count = 0, attach_count;
 	size_t size = 0;
 
 	ret = mutex_lock_interruptible(&db_list.lock);
@@ -1369,6 +1361,7 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 		   "size", "flags", "mode", "count", "ino");
 
 	list_for_each_entry(buf_obj, &db_list.head, list_node) {
+		struct dma_resv_cursor cursor;
 
 		ret = dma_resv_lock_interruptible(buf_obj->resv, NULL);
 		if (ret)
@@ -1392,10 +1385,7 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 				   fence->ops->get_timeline_name(fence),
 				   dma_fence_is_signaled(fence) ? "" : "un");
 
-		fobj = rcu_dereference(robj->fence);
-		shared_count = fobj ? fobj->shared_count : 0;
-		for (i = 0; i < shared_count; i++) {
-			fence = rcu_dereference(fobj->shared[i]);
+		dma_resv_for_each_shared(robj, &cursor, fence) {
 			if (!dma_fence_get_rcu(fence))
 				continue;
 			seq_printf(s, "\tShared fence: %s %s %ssignalled\n",
diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index 5703e328b8ac..ef7e0464e08d 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -39,6 +39,19 @@
 #include <linux/sched/mm.h>
 #include <linux/mmu_notifier.h>
 
+/**
+ * struct dma_resv_list - a list of shared fences
+ * @rcu: for internal use
+ * @shared_count: table of shared fences
+ * @shared_max: for growing shared fence table
+ * @shared: shared fence table
+ */
+struct dma_resv_list {
+	struct rcu_head rcu;
+	u32 shared_count, shared_max;
+	struct dma_fence __rcu *shared[];
+};
+
 /**
  * DOC: Reservation Object Overview
  *
@@ -146,6 +159,54 @@ void dma_resv_fini(struct dma_resv *obj)
 }
 EXPORT_SYMBOL(dma_resv_fini);
 
+/**
+ * dma_resv_shared - get the reservation object's shared fence list
+ * @obj: the reservation object
+ *
+ * Returns the shared fence list. The obj->lock or rcu read side must be held.
+ */
+static inline struct dma_resv_list *dma_resv_shared(struct dma_resv *obj)
+{
+	return rcu_dereference_check(obj->fence, dma_resv_held(obj));
+}
+
+/**
+ * dma_resv_first_shared - get first shared fence
+ * @obj: the reservation object
+ * @cursor: cursor to record the position
+ *
+ * Return the first shared fence of the resv object and initialize the cursor to
+ * track the position inside the list.
+ */
+struct dma_fence *dma_resv_first_shared(struct dma_resv *obj,
+					struct dma_resv_cursor *cursor)
+{
+	cursor->fences = dma_resv_shared(obj);
+	if (!cursor->fences)
+		return NULL;
+
+	cursor->i = 0;
+	return dma_resv_next_shared(obj, cursor);
+}
+EXPORT_SYMBOL(dma_resv_first_shared);
+
+/**
+ * dma_resv_next_shared - get the next shared fence from the resv object
+ * @obj: the reservation object
+ * @cursor: cursor to record the position
+ *
+ * Return the next shared fence of the resv object where cursor points to.
+ */
+struct dma_fence *dma_resv_next_shared(struct dma_resv *obj,
+				       struct dma_resv_cursor *cursor)
+{
+	if (cursor->i >= cursor->fences->shared_count)
+		return NULL;
+
+	return dma_resv_list_fence(obj, cursor->fences, cursor->i++);
+}
+EXPORT_SYMBOL(dma_resv_next_shared);
+
 /**
  * dma_resv_reserve_shared - Reserve space to add shared fences to
  * a dma_resv.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index 0371947ba96b..67cef80e25c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -95,20 +95,15 @@ __dma_resv_make_exclusive(struct dma_resv *obj)
 	unsigned int count;
 	int r;
 
-	if (!dma_resv_shared(obj)) /* no shared fences to convert */
-		return 0;
-
 	r = dma_resv_get_fences_rcu(obj, NULL, &count, &fences);
 	if (r)
 		return r;
 
-	if (count == 0) {
-		/* Now that was unexpected. */
-	} else if (count == 1) {
+	if (count == 1) {
 		dma_resv_add_excl_fence(obj, fences[0]);
 		dma_fence_put(fences[0]);
 		kfree(fences);
-	} else {
+	} else if (count > 1) {
 		struct dma_fence_array *array;
 
 		array = dma_fence_array_create(count, fences,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
index c50d9f92a0cd..a4478332a79e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -201,10 +201,9 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync,
 		     struct dma_resv *resv, enum amdgpu_sync_mode mode,
 		     void *owner)
 {
-	struct dma_resv_list *flist;
+	struct dma_resv_cursor cursor;
 	struct dma_fence *f;
-	unsigned i;
-	int r = 0;
+	int r;
 
 	if (resv == NULL)
 		return -EINVAL;
@@ -212,17 +211,12 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync,
 	/* always sync to the exclusive fence */
 	f = dma_resv_exclusive(resv);
 	r = amdgpu_sync_fence(sync, f);
-
-	flist = dma_resv_shared(resv);
-	if (!flist || r)
+	if (r)
 		return r;
 
-	for (i = 0; i < flist->shared_count; ++i) {
+	dma_resv_for_each_shared(resv, &cursor, f) {
 		void *fence_owner;
 
-		f = rcu_dereference_protected(flist->shared[i],
-					      dma_resv_held(resv));
-
 		fence_owner = amdgpu_sync_get_owner(f);
 
 		/* Always sync to moves, no matter what */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 6ab50810bd54..ba89e35c1b84 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1394,10 +1394,9 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
 					    const struct ttm_place *place)
 {
 	unsigned long num_pages = bo->mem.num_pages;
+	struct dma_resv_cursor resv_cursor;
 	struct amdgpu_res_cursor cursor;
-	struct dma_resv_list *flist;
 	struct dma_fence *f;
-	int i;
 
 	if (bo->type == ttm_bo_type_kernel &&
 	    !amdgpu_vm_evictable(ttm_to_amdgpu_bo(bo)))
@@ -1407,14 +1406,9 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
 	 * If true, then return false as any KFD process needs all its BOs to
 	 * be resident to run successfully
 	 */
-	flist = dma_resv_shared(bo->base.resv);
-	if (flist) {
-		for (i = 0; i < flist->shared_count; ++i) {
-			f = rcu_dereference_protected(flist->shared[i],
-				dma_resv_held(bo->base.resv));
-			if (amdkfd_fence_check_mm(f, current->mm))
-				return false;
-		}
+	dma_resv_for_each_shared(bo->base.resv, &resv_cursor, f) {
+		if (amdkfd_fence_check_mm(f, current->mm))
+			return false;
 	}
 
 	switch (bo->mem.mem_type) {
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index 4d43b8630f0e..52e9eaa43f2e 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -450,10 +450,10 @@ static void etnaviv_gem_describe_fence(struct dma_fence *fence,
 static void etnaviv_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 {
 	struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj);
+	unsigned long off = drm_vma_node_start(&obj->vma_node);
 	struct dma_resv *robj = obj->resv;
-	struct dma_resv_list *fobj;
+	struct dma_resv_cursor cursor;
 	struct dma_fence *fence;
-	unsigned long off = drm_vma_node_start(&obj->vma_node);
 
 	seq_printf(m, "%08x: %c %2d (%2d) %08lx %p %zd\n",
 			etnaviv_obj->flags, is_active(etnaviv_obj) ? 'A' : 'I',
@@ -461,15 +461,8 @@ static void etnaviv_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 			off, etnaviv_obj->vaddr, obj->size);
 
 	rcu_read_lock();
-	fobj = dma_resv_shared(robj);
-	if (fobj) {
-		unsigned int i, shared_count = fobj->shared_count;
-
-		for (i = 0; i < shared_count; i++) {
-			fence = rcu_dereference(fobj->shared[i]);
-			etnaviv_gem_describe_fence(fence, "Shared", m);
-		}
-	}
+	dma_resv_for_each_shared(robj, &cursor, fence)
+		etnaviv_gem_describe_fence(fence, "Shared", m);
 
 	fence = dma_resv_exclusive(robj);
 	if (fence)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_busy.c b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
index 3f94becac541..1028df6cee67 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_busy.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
@@ -82,7 +82,8 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data,
 {
 	struct drm_i915_gem_busy *args = data;
 	struct drm_i915_gem_object *obj;
-	struct dma_resv_list *list;
+	struct dma_resv_cursor cursor;
+	struct dma_fence *fence;
 	unsigned int seq;
 	int err;
 
@@ -116,17 +117,8 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data,
 	args->busy = busy_check_writer(dma_resv_exclusive(obj->base.resv));
 
 	/* Translate shared fences to READ set of engines */
-	list = dma_resv_shared(obj->base.resv);
-	if (list) {
-		unsigned int shared_count = list->shared_count, i;
-
-		for (i = 0; i < shared_count; ++i) {
-			struct dma_fence *fence =
-				rcu_dereference(list->shared[i]);
-
-			args->busy |= busy_check_reader(fence);
-		}
-	}
+	dma_resv_for_each_shared(obj->base.resv, &cursor, fence)
+		args->busy |= busy_check_reader(fence);
 
 	if (args->busy && read_seqcount_retry(&obj->base.resv->seq, seq))
 		goto retry;
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 5ce441c655ea..9efe47932b42 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -342,11 +342,11 @@ int
 nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool exclusive, bool intr)
 {
 	struct nouveau_fence_chan *fctx = chan->fence;
-	struct dma_fence *fence;
 	struct dma_resv *resv = nvbo->bo.base.resv;
-	struct dma_resv_list *fobj;
+	struct dma_fence *fence, *shared;
+	struct dma_resv_cursor cursor;
 	struct nouveau_fence *f;
-	int ret = 0, i;
+	int ret = 0;
 
 	if (!exclusive) {
 		ret = dma_resv_reserve_shared(resv, 1);
@@ -355,10 +355,10 @@ nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool e
 			return ret;
 	}
 
-	fobj = dma_resv_shared(resv);
+	shared = dma_resv_first_shared(resv, &cursor);
 	fence = dma_resv_exclusive(resv);
 
-	if (fence && (!exclusive || !fobj || !fobj->shared_count)) {
+	if (fence && (!exclusive || !shared)) {
 		struct nouveau_channel *prev = NULL;
 		bool must_wait = true;
 
@@ -377,17 +377,15 @@ nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool e
 		return ret;
 	}
 
-	if (!exclusive || !fobj)
+	if (!exclusive || !shared)
 		return ret;
 
-	for (i = 0; i < fobj->shared_count && !ret; ++i) {
+
+	do {
 		struct nouveau_channel *prev = NULL;
 		bool must_wait = true;
 
-		fence = rcu_dereference_protected(fobj->shared[i],
-						dma_resv_held(resv));
-
-		f = nouveau_local_fence(fence, chan->drm);
+		f = nouveau_local_fence(shared, chan->drm);
 		if (f) {
 			rcu_read_lock();
 			prev = rcu_dereference(f->channel);
@@ -397,8 +395,8 @@ nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool e
 		}
 
 		if (must_wait)
-			ret = dma_fence_wait(fence, intr);
-	}
+			ret = dma_fence_wait(shared, intr);
+	} while ((shared = dma_resv_next_shared(resv, &cursor)));
 
 	return ret;
 }
diff --git a/drivers/gpu/drm/qxl/qxl_debugfs.c b/drivers/gpu/drm/qxl/qxl_debugfs.c
index 0acc70a6d3dd..1d24e02f4652 100644
--- a/drivers/gpu/drm/qxl/qxl_debugfs.c
+++ b/drivers/gpu/drm/qxl/qxl_debugfs.c
@@ -57,12 +57,14 @@ qxl_debugfs_buffers_info(struct seq_file *m, void *data)
 	struct qxl_bo *bo;
 
 	list_for_each_entry(bo, &qdev->gem.objects, list) {
-		struct dma_resv_list *fobj;
-		int rel;
+		struct dma_resv_cursor cursor;
+		struct dma_fence *f;
+		int rel = 0;
 
+		/* TODO: Is this sufficiently fast? */
 		rcu_read_lock();
-		fobj = dma_resv_shared(bo->tbo.base.resv);
-		rel = fobj ? fobj->shared_count : 0;
+		dma_resv_for_each_shared(bo->tbo.base.resv, &cursor, f)
+			++rel;
 		rcu_read_unlock();
 
 		seq_printf(m, "size %ld, pc %d, num releases %d\n",
diff --git a/drivers/gpu/drm/radeon/radeon_sync.c b/drivers/gpu/drm/radeon/radeon_sync.c
index a9cdb88da173..915ac0de0633 100644
--- a/drivers/gpu/drm/radeon/radeon_sync.c
+++ b/drivers/gpu/drm/radeon/radeon_sync.c
@@ -91,11 +91,10 @@ int radeon_sync_resv(struct radeon_device *rdev,
 		     struct dma_resv *resv,
 		     bool shared)
 {
-	struct dma_resv_list *flist;
-	struct dma_fence *f;
+	struct dma_resv_cursor cursor;
 	struct radeon_fence *fence;
-	unsigned i;
-	int r = 0;
+	struct dma_fence *f;
+	int r;
 
 	/* always sync to the exclusive fence */
 	f = dma_resv_exclusive(resv);
@@ -105,13 +104,10 @@ int radeon_sync_resv(struct radeon_device *rdev,
 	else if (f)
 		r = dma_fence_wait(f, true);
 
-	flist = dma_resv_shared(resv);
-	if (shared || !flist || r)
+	if (shared || r)
 		return r;
 
-	for (i = 0; i < flist->shared_count; ++i) {
-		f = rcu_dereference_protected(flist->shared[i],
-					      dma_resv_held(resv));
+	dma_resv_for_each_shared(resv, &cursor, f) {
 		fence = to_radeon_fence(f);
 		if (fence && fence->rdev == rdev)
 			radeon_sync_fence(sync, fence);
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 9d453c2ca800..16b869d9b1d6 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -254,22 +254,17 @@ static int ttm_bo_individualize_resv(struct ttm_buffer_object *bo)
 static void ttm_bo_flush_all_fences(struct ttm_buffer_object *bo)
 {
 	struct dma_resv *resv = &bo->base._resv;
-	struct dma_resv_list *fobj;
+	struct dma_resv_cursor cursor;
 	struct dma_fence *fence;
-	int i;
 
 	rcu_read_lock();
-	fobj = dma_resv_shared(resv);
-	fence = dma_resv_exclusive(resv);
-	if (fence && !fence->ops->signaled)
-		dma_fence_enable_sw_signaling(fence);
-
-	for (i = 0; fobj && i < fobj->shared_count; ++i) {
-		fence = rcu_dereference(fobj->shared[i]);
-
+	dma_resv_for_each_shared(resv, &cursor, fence) {
 		if (!fence->ops->signaled)
 			dma_fence_enable_sw_signaling(fence);
 	}
+	fence = dma_resv_exclusive(resv);
+	if (fence && !fence->ops->signaled)
+		dma_fence_enable_sw_signaling(fence);
 	rcu_read_unlock();
 }
 
diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index 74b217b82f39..bb6911baac3d 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -45,20 +45,9 @@
 #include <linux/seqlock.h>
 #include <linux/rcupdate.h>
 
-extern struct ww_class reservation_ww_class;
+struct dma_resv_list;
 
-/**
- * struct dma_resv_list - a list of shared fences
- * @rcu: for internal use
- * @shared_count: table of shared fences
- * @shared_max: for growing shared fence table
- * @shared: shared fence table
- */
-struct dma_resv_list {
-	struct rcu_head rcu;
-	u32 shared_count, shared_max;
-	struct dma_fence __rcu *shared[];
-};
+extern struct ww_class reservation_ww_class;
 
 /**
  * struct dma_resv - a reservation object manages fences for a buffer
@@ -75,9 +64,23 @@ struct dma_resv {
 	struct dma_resv_list __rcu *fence;
 };
 
+struct dma_resv_cursor {
+	struct dma_resv_list *fences;
+	unsigned int i;
+};
+
 #define dma_resv_held(obj) lockdep_is_held(&(obj)->lock.base)
 #define dma_resv_assert_held(obj) lockdep_assert_held(&(obj)->lock.base)
 
+struct dma_fence *dma_resv_first_shared(struct dma_resv *obj,
+					struct dma_resv_cursor *cursor);
+struct dma_fence *dma_resv_next_shared(struct dma_resv *obj,
+				       struct dma_resv_cursor *cursor);
+
+#define dma_resv_for_each_shared(obj, cursor, fence)		\
+	for (fence = dma_resv_first_shared(obj, cursor); fence;	\
+	     fence = dma_resv_next_shared(obj, cursor))
+
 #ifdef CONFIG_DEBUG_MUTEXES
 void dma_resv_reset_shared_max(struct dma_resv *obj);
 #else
@@ -253,17 +256,6 @@ dma_resv_get_excl_rcu(struct dma_resv *obj)
 	return fence;
 }
 
-/**
- * dma_resv_shared - get the reservation object's shared fence list
- * @obj: the reservation object
- *
- * Returns the shared fence list. The obj->lock or rcu read side must be held.
- */
-static inline struct dma_resv_list *dma_resv_shared(struct dma_resv *obj)
-{
-	return rcu_dereference_check(obj->fence, dma_resv_held(obj));
-}
-
 void dma_resv_init(struct dma_resv *obj);
 void dma_resv_fini(struct dma_resv *obj);
 int dma_resv_reserve_shared(struct dma_resv *obj, unsigned int num_fences);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 09/11] dma-buf: add shared fence usage flags
  2021-05-17 14:11 [RFC] Add DMA_RESV_USAGE flags Christian König
                   ` (7 preceding siblings ...)
  2021-05-17 14:11 ` [PATCH 08/11] dma-buf: improve shared fence abstraction Christian König
@ 2021-05-17 14:11 ` Christian König
  2021-05-17 20:36   ` Daniel Vetter
  2021-05-17 14:11 ` [PATCH 10/11] drm/i915: also wait for shared dmabuf fences before flip Christian König
                   ` (2 subsequent siblings)
  11 siblings, 1 reply; 50+ messages in thread
From: Christian König @ 2021-05-17 14:11 UTC (permalink / raw)
  To: dri-devel, linaro-mm-sig

Add usage flags for shared fences and improve the documentation.

This allows drivers to better specify what the shared fences
are doing with the resource.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-resv.c                    | 132 +++++++++++++-----
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c    |   2 +-
 drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c  |   4 +-
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |   3 +-
 drivers/gpu/drm/i915/i915_vma.c               |   2 +-
 drivers/gpu/drm/msm/msm_gpu.c                 |   3 +-
 drivers/gpu/drm/nouveau/nouveau_bo.c          |   2 +-
 drivers/gpu/drm/qxl/qxl_release.c             |   3 +-
 drivers/gpu/drm/radeon/radeon_object.c        |   2 +-
 drivers/gpu/drm/ttm/ttm_bo.c                  |   2 +-
 drivers/gpu/drm/ttm/ttm_execbuf_util.c        |   3 +-
 drivers/gpu/drm/vc4/vc4_gem.c                 |   3 +-
 drivers/gpu/drm/vgem/vgem_fence.c             |   2 +-
 include/linux/dma-resv.h                      |  21 ++-
 15 files changed, 135 insertions(+), 51 deletions(-)

diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index ef7e0464e08d..bf72c162fd70 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -49,17 +49,35 @@
 struct dma_resv_list {
 	struct rcu_head rcu;
 	u32 shared_count, shared_max;
-	struct dma_fence __rcu *shared[];
+	unsigned long shared[];
 };
 
 /**
  * DOC: Reservation Object Overview
  *
- * The reservation object provides a mechanism to manage shared and
- * exclusive fences associated with a buffer.  A reservation object
- * can have attached one exclusive fence (normally associated with
- * write operations) or N shared fences (read operations).  The RCU
- * mechanism is used to protect read access to fences from locked
+ * The reservation object provides a mechanism to manage shared and exclusive
+ * fences associated with a buffer. A reservation object can have attached one
+ * exclusive fence or multiple shared fences. Using the exclusive fence
+ * effectively serializes all accesses to the resource while using the shared
+ * fence slots allows for concurrent access.
+ *
+ * Because of this, newly added DMA operations which want to use a resource
+ * always need to wait for the existing exclusive fence before they start,
+ * no matter whether they are added as shared or exclusive ones.
+ *
+ * To aid drivers in determining if they need to wait for a shared resource,
+ * usage flags should be given with each shared fence added to the resource.
+ *
+ * @DMA_RESV_USAGE_NONE is used for special cases where no waiting is desired.
+ * @DMA_RESV_USAGE_READ is used to note that the resource is read by this
+ * operation and writers should wait for it.
+ * @DMA_RESV_USAGE_WRITE is used to note that the resource is written by this
+ * operation and readers should wait for it.
+ * @DMA_RESV_USAGE_RW is used to note that the resource is both read and
+ * written.
+ *
+ * When drivers access the fences contained inside the dma_resv object the RCU
+ * mechanism can be used to protect read access to fences from locked
  * write-side updates.
  */
 
@@ -100,8 +118,12 @@ static void dma_resv_list_free(struct dma_resv_list *list)
 	if (!list)
 		return;
 
-	for (i = 0; i < list->shared_count; ++i)
-		dma_fence_put(rcu_dereference_protected(list->shared[i], true));
+	for (i = 0; i < list->shared_count; ++i) {
+		struct dma_fence __rcu *fence;
+
+		fence = (void __rcu *)(list->shared[i] & ~DMA_RESV_USAGE_RW);
+		dma_fence_put(rcu_dereference_protected(fence, true));
+	}
 
 	kfree_rcu(list, rcu);
 }
@@ -111,15 +133,44 @@ static void dma_resv_list_free(struct dma_resv_list *list)
  * @obj: the reservation object
  * @list: list to get the fence from
  * @idx: index into the fence array
+ * @val: optional original value
  *
  * Return the fence at the specified index double checking that either the rcu
  * read side or the dma_resv object is held.
  */
-static struct dma_fence *dma_resv_list_fence(struct dma_resv *obj,
-					     struct dma_resv_list *list,
-					     unsigned int idx)
+static inline struct dma_fence *dma_resv_list_fence(struct dma_resv *obj,
+						    struct dma_resv_list *list,
+						    unsigned int idx,
+						    unsigned long *val)
 {
-	return rcu_dereference_check(list->shared[idx], dma_resv_held(obj));
+	struct dma_fence __rcu *fence;
+	unsigned long tmp = READ_ONCE(list->shared[idx]);
+
+	if (val)
+		*val = tmp;
+
+	fence = (void __rcu *)(tmp & ~DMA_RESV_USAGE_RW);
+	return rcu_dereference_check(fence, dma_resv_held(obj));
+}
+
+/**
+ * dma_resv_list_assign - assign fence and usage
+ * @list: list to assign the fence to
+ * @idx: index where to assign the fence
+ * @f: the fence to assign
+ * @usage: the usage to use
+ *
+ * Assign the fence and usage to the slot at position idx.
+ */
+static void dma_resv_list_assign(struct dma_resv_list *list,
+				 unsigned int idx,
+				 struct dma_fence *f,
+				 uint32_t usage)
+{
+	struct dma_fence __rcu *fence;
+
+	rcu_assign_pointer(fence, f);
+	WRITE_ONCE(list->shared[idx], ((unsigned long __force)fence) | usage);
 }
 
 /**
@@ -200,10 +251,17 @@ EXPORT_SYMBOL(dma_resv_first_shared);
 struct dma_fence *dma_resv_next_shared(struct dma_resv *obj,
 				       struct dma_resv_cursor *cursor)
 {
+	struct dma_fence *fence;
+	unsigned long val;
+	int idx;
+
 	if (cursor->i >= cursor->fences->shared_count)
 		return NULL;
 
-	return dma_resv_list_fence(obj, cursor->fences, cursor->i++);
+	idx = cursor->i++;
+	fence = dma_resv_list_fence(obj, cursor->fences, idx, &val);
+	cursor->usage = val & DMA_RESV_USAGE_RW;
+	return fence;
 }
 EXPORT_SYMBOL(dma_resv_next_shared);
 
@@ -246,13 +304,14 @@ int dma_resv_reserve_shared(struct dma_resv *obj, unsigned int num_fences)
 	 * the new.
 	 */
 	for (i = 0, j = 0, k = max; i < (old ? old->shared_count : 0); ++i) {
-		struct dma_fence *fence;
+		struct dma_fence * fence;
+		unsigned long val;
 
-		fence = dma_resv_list_fence(obj, old, i);
+		fence = dma_resv_list_fence(obj, old, i, &val);
 		if (dma_fence_is_signaled(fence))
-			RCU_INIT_POINTER(new->shared[--k], fence);
+			new->shared[--k] = val;
 		else
-			RCU_INIT_POINTER(new->shared[j++], fence);
+			new->shared[j++] = val;
 	}
 	new->shared_count = j;
 
@@ -271,7 +330,7 @@ int dma_resv_reserve_shared(struct dma_resv *obj, unsigned int num_fences)
 
 	/* Drop the references to the signaled fences */
 	for (i = k; i < max; ++i)
-		dma_fence_put(dma_resv_list_fence(obj, new, i));
+		dma_fence_put(dma_resv_list_fence(obj, new, i, NULL));
 	kfree_rcu(old, rcu);
 
 	return 0;
@@ -298,14 +357,16 @@ void dma_resv_reset_shared_max(struct dma_resv *obj)
 #endif
 
 /**
- * dma_resv_add_shared_fence - Add a fence to a shared slot
+ * dma_resv_add_shared - Add a fence to a shared slot
  * @obj: the reservation object
  * @fence: the shared fence to add
+ * @usage: how the fence is using the resource
  *
  * Add a fence to a shared slot, obj->lock must be held, and
  * dma_resv_reserve_shared() has been called.
  */
-void dma_resv_add_shared_fence(struct dma_resv *obj, struct dma_fence *fence)
+void dma_resv_add_shared(struct dma_resv *obj, struct dma_fence *fence,
+			 enum dma_resv_usage usage)
 {
 	struct dma_resv_list *fobj;
 	struct dma_fence *old;
@@ -321,8 +382,7 @@ void dma_resv_add_shared_fence(struct dma_resv *obj, struct dma_fence *fence)
 	write_seqcount_begin(&obj->seq);
 
 	for (i = 0; i < count; ++i) {
-
-		old = dma_resv_list_fence(obj, fobj, i);
+		old = dma_resv_list_fence(obj, fobj, i, NULL);
 		if (old->context == fence->context ||
 		    dma_fence_is_signaled(old))
 			goto replace;
@@ -333,20 +393,21 @@ void dma_resv_add_shared_fence(struct dma_resv *obj, struct dma_fence *fence)
 	count++;
 
 replace:
-	RCU_INIT_POINTER(fobj->shared[i], fence);
+	dma_resv_list_assign(fobj, i, fence, usage);
 	/* pointer update must be visible before we extend the shared_count */
 	smp_store_mb(fobj->shared_count, count);
 
 	write_seqcount_end(&obj->seq);
 	dma_fence_put(old);
 }
-EXPORT_SYMBOL(dma_resv_add_shared_fence);
+EXPORT_SYMBOL(dma_resv_add_shared);
 
 /**
  * dma_resv_replace_shared - replace shared fences
  * @obj: the reservation object
  * @context: the context of the fences to replace
  * @fence: the new fence to use instead
+ * @usage: how the fence is using the resource
  *
  * Replace fences with a specified context with a new fence. Only valid if the
  * operation represented by the original fences is completed or has no longer
@@ -354,7 +415,7 @@ EXPORT_SYMBOL(dma_resv_add_shared_fence);
  * completes. Takes the reference to the new fence.
  */
 void dma_resv_replace_shared(struct dma_resv *obj, uint64_t context,
-			     struct dma_fence *fence)
+			     struct dma_fence *fence, enum dma_resv_usage usage)
 {
 	struct dma_resv_list *list;
 	unsigned int i;
@@ -367,12 +428,13 @@ void dma_resv_replace_shared(struct dma_resv *obj, uint64_t context,
 
 	write_seqcount_begin(&obj->seq);
 	for (i = 0; i < list->shared_count; ++i) {
-		struct dma_fence *old = dma_resv_list_fence(obj, list, i);
+		struct dma_fence *old;
 
+		old = dma_resv_list_fence(obj, list, i, NULL);
 		if (old->context != context)
 			continue;
 
-		rcu_assign_pointer(list->shared[i], dma_fence_get(fence));
+		dma_resv_list_assign(list, i, fence, usage);
 		dma_fence_put(old);
 	}
 	write_seqcount_end(&obj->seq);
@@ -411,7 +473,7 @@ void dma_resv_add_excl_fence(struct dma_resv *obj, struct dma_fence *fence)
 
 	/* inplace update, no shared fences */
 	while (i--)
-		dma_fence_put(dma_resv_list_fence(obj, old, i));
+		dma_fence_put(dma_resv_list_fence(obj, old, i, NULL));
 
 	dma_fence_put(old_fence);
 }
@@ -454,10 +516,10 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src)
 
 		dst_list->shared_count = 0;
 		for (i = 0; i < src_list->shared_count; ++i) {
-			struct dma_fence __rcu **dst;
 			struct dma_fence *fence;
+			unsigned long val;
 
-			fence = dma_resv_list_fence(src, src_list, i);
+			fence = dma_resv_list_fence(src, src_list, i, &val);
 			if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 				     &fence->flags))
 				continue;
@@ -473,8 +535,7 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src)
 				continue;
 			}
 
-			dst = &dst_list->shared[dst_list->shared_count++];
-			rcu_assign_pointer(*dst, fence);
+			dst_list->shared[dst_list->shared_count++] = val;
 		}
 	} else {
 		dst_list = NULL;
@@ -566,7 +627,8 @@ int dma_resv_get_fences_rcu(struct dma_resv *obj,
 			shared = nshared;
 			shared_count = fobj ? fobj->shared_count : 0;
 			for (i = 0; i < shared_count; ++i) {
-				shared[i] = dma_resv_list_fence(obj, fobj, i);
+				shared[i] = dma_resv_list_fence(obj, fobj,
+								i, NULL);
 				if (!dma_fence_get_rcu(shared[i]))
 					break;
 			}
@@ -650,7 +712,7 @@ long dma_resv_wait_timeout_rcu(struct dma_resv *obj,
 		for (i = 0; !fence && i < shared_count; ++i) {
 			struct dma_fence *lfence;
 
-			lfence = dma_resv_list_fence(obj, fobj, i);
+			lfence = dma_resv_list_fence(obj, fobj, i, NULL);
 			if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 				     &lfence->flags))
 				continue;
@@ -736,7 +798,7 @@ bool dma_resv_test_signaled_rcu(struct dma_resv *obj, bool test_all)
 		for (i = 0; i < shared_count; ++i) {
 			struct dma_fence *fence;
 
-			fence = dma_resv_list_fence(obj, fobj, i);
+			fence = dma_resv_list_fence(obj, fobj, i, NULL);
 			ret = dma_resv_test_signaled_single(fence);
 			if (ret < 0)
 				goto retry;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 3e5a681a5482..50bdf9bfd030 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -245,7 +245,7 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
 	 * table update and TLB flush here directly.
 	 */
 	dma_resv_replace_shared(bo->tbo.base.resv, ef->base.context,
-				dma_fence_get_stub());
+				dma_fence_get_stub(), DMA_RESV_USAGE_NONE);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 0adffcace326..1190781cefcb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1422,7 +1422,7 @@ void amdgpu_bo_fence(struct amdgpu_bo *bo, struct dma_fence *fence,
 	struct dma_resv *resv = bo->tbo.base.resv;
 
 	if (shared)
-		dma_resv_add_shared_fence(resv, fence);
+		dma_resv_add_shared(resv, fence, DMA_RESV_USAGE_RW);
 	else
 		dma_resv_add_excl_fence(resv, fence);
 }
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
index d05c35994579..f0e69c514275 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
@@ -214,8 +214,8 @@ static void submit_attach_object_fences(struct etnaviv_gem_submit *submit)
 			dma_resv_add_excl_fence(obj->resv,
 							  submit->out_fence);
 		else
-			dma_resv_add_shared_fence(obj->resv,
-							    submit->out_fence);
+			dma_resv_add_shared(obj->resv, submit->out_fence,
+					    DMA_RESV_USAGE_READ);
 
 		submit_unlock_object(submit, i);
 	}
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 5964e67c7d36..16138bc2dbe4 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -2551,7 +2551,8 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb,
 		goto err_commit;
 
 	/* Keep the batch alive and unwritten as we parse */
-	dma_resv_add_shared_fence(pw->batch->resv, &pw->base.dma);
+	dma_resv_add_shared(pw->batch->resv, &pw->base.dma,
+			    DMA_RESV_USAGE_READ);
 
 	/* Force execution to wait for completion of the parser */
 	dma_resv_add_excl_fence(shadow->resv, &pw->base.dma);
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 07490db51cdc..0d177a7687cb 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -1256,7 +1256,7 @@ int i915_vma_move_to_active(struct i915_vma *vma,
 				return err;
 		}
 
-		dma_resv_add_shared_fence(vma->resv, &rq->fence);
+		dma_resv_add_shared(vma->resv, &rq->fence, DMA_RESV_USAGE_READ);
 		obj->write_domain = 0;
 	}
 
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 9dd1c58430ab..82f98a4224f7 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -820,7 +820,8 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 		if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE)
 			dma_resv_add_excl_fence(drm_obj->resv, submit->fence);
 		else if (submit->bos[i].flags & MSM_SUBMIT_BO_READ)
-			dma_resv_add_shared_fence(drm_obj->resv, submit->fence);
+			dma_resv_add_shared(drm_obj->resv, submit->fence,
+					    DMA_RESV_USAGE_READ);
 
 		msm_gem_active_get(drm_obj, gpu);
 	}
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index ad30a6a100b9..b2c13c63f93c 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -1313,7 +1313,7 @@ nouveau_bo_fence(struct nouveau_bo *nvbo, struct nouveau_fence *fence, bool excl
 	if (exclusive)
 		dma_resv_add_excl_fence(resv, &fence->base);
 	else if (fence)
-		dma_resv_add_shared_fence(resv, &fence->base);
+		dma_resv_add_shared(resv, &fence->base, DMA_RESV_USAGE_RW);
 }
 
 static void
diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c
index b19f2f00b215..b725e3fbbb49 100644
--- a/drivers/gpu/drm/qxl/qxl_release.c
+++ b/drivers/gpu/drm/qxl/qxl_release.c
@@ -429,7 +429,8 @@ void qxl_release_fence_buffer_objects(struct qxl_release *release)
 	list_for_each_entry(entry, &release->bos, head) {
 		bo = entry->bo;
 
-		dma_resv_add_shared_fence(bo->base.resv, &release->base);
+		dma_resv_add_shared(bo->base.resv, &release->base,
+				    DMA_RESV_USAGE_READ);
 		ttm_bo_move_to_lru_tail_unlocked(bo);
 		dma_resv_unlock(bo->base.resv);
 	}
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
index cee11c55fd15..b744cd766bb1 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -815,7 +815,7 @@ void radeon_bo_fence(struct radeon_bo *bo, struct radeon_fence *fence,
 	struct dma_resv *resv = bo->tbo.base.resv;
 
 	if (shared)
-		dma_resv_add_shared_fence(resv, &fence->base);
+		dma_resv_add_shared(resv, &fence->base, DMA_RESV_USAGE_READ);
 	else
 		dma_resv_add_excl_fence(resv, &fence->base);
 }
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 16b869d9b1d6..c9bbc4630afc 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -700,7 +700,7 @@ static int ttm_bo_add_move_fence(struct ttm_buffer_object *bo,
 		return ret;
 	}
 
-	dma_resv_add_shared_fence(bo->base.resv, fence);
+	dma_resv_add_shared(bo->base.resv, fence, DMA_RESV_USAGE_RW);
 
 	ret = dma_resv_reserve_shared(bo->base.resv, 1);
 	if (unlikely(ret)) {
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
index 071c48d672c6..8ed1d89cfeeb 100644
--- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
+++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
@@ -156,7 +156,8 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket,
 		struct ttm_buffer_object *bo = entry->bo;
 
 		if (entry->num_shared)
-			dma_resv_add_shared_fence(bo->base.resv, fence);
+			dma_resv_add_shared(bo->base.resv, fence,
+					    DMA_RESV_USAGE_RW);
 		else
 			dma_resv_add_excl_fence(bo->base.resv, fence);
 		ttm_bo_move_to_lru_tail_unlocked(bo);
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 445d3bab89e0..2ab59abcea1a 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -543,7 +543,8 @@ vc4_update_bo_seqnos(struct vc4_exec_info *exec, uint64_t seqno)
 		bo = to_vc4_bo(&exec->bo[i]->base);
 		bo->seqno = seqno;
 
-		dma_resv_add_shared_fence(bo->base.base.resv, exec->fence);
+		dma_resv_add_shared(bo->base.base.resv, exec->fence,
+				    DMA_RESV_USAGE_READ);
 	}
 
 	list_for_each_entry(bo, &exec->unref_list, unref_head) {
diff --git a/drivers/gpu/drm/vgem/vgem_fence.c b/drivers/gpu/drm/vgem/vgem_fence.c
index 2902dc6e64fa..cb5e731d07af 100644
--- a/drivers/gpu/drm/vgem/vgem_fence.c
+++ b/drivers/gpu/drm/vgem/vgem_fence.c
@@ -163,7 +163,7 @@ int vgem_fence_attach_ioctl(struct drm_device *dev,
 	if (arg->flags & VGEM_FENCE_WRITE)
 		dma_resv_add_excl_fence(resv, fence);
 	else if ((ret = dma_resv_reserve_shared(resv, 1)) == 0)
-		dma_resv_add_shared_fence(resv, fence);
+		dma_resv_add_shared(resv, fence, DMA_RESV_USAGE_READ);
 	dma_resv_unlock(resv);
 
 	/* Record the fence in our idr for later signaling */
diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index bb6911baac3d..e5afffaec579 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -49,6 +49,20 @@ struct dma_resv_list;
 
 extern struct ww_class reservation_ww_class;
 
+/**
+ * enum dma_resv_usage - how a DMA resource is used for implicit sync
+ * @DMA_RESV_USAGE_NONE: Only for memory management
+ * @DMA_RESV_USAGE_READ: Read only access
+ * @DMA_RESV_USAGE_WRITE: Write only access
+ * @DMA_RESV_USAGE_RW: Both read and write access
+ */
+enum dma_resv_usage {
+	DMA_RESV_USAGE_NONE	= 0,
+	DMA_RESV_USAGE_READ	= 1 << 0,
+	DMA_RESV_USAGE_WRITE	= 1 << 1,
+	DMA_RESV_USAGE_RW	= (DMA_RESV_USAGE_READ | DMA_RESV_USAGE_WRITE)
+};
+
 /**
  * struct dma_resv - a reservation object manages fences for a buffer
  * @lock: update side lock
@@ -66,6 +80,7 @@ struct dma_resv {
 
 struct dma_resv_cursor {
 	struct dma_resv_list *fences;
+	enum dma_resv_usage usage;
 	unsigned int i;
 };
 
@@ -259,9 +274,11 @@ dma_resv_get_excl_rcu(struct dma_resv *obj)
 void dma_resv_init(struct dma_resv *obj);
 void dma_resv_fini(struct dma_resv *obj);
 int dma_resv_reserve_shared(struct dma_resv *obj, unsigned int num_fences);
-void dma_resv_add_shared_fence(struct dma_resv *obj, struct dma_fence *fence);
+void dma_resv_add_shared(struct dma_resv *obj, struct dma_fence *fence,
+			 enum dma_resv_usage usage);
 void dma_resv_replace_shared(struct dma_resv *obj, uint64_t context,
-			     struct dma_fence *fence);
+			     struct dma_fence *fence,
+			     enum dma_resv_usage usage);
 
 void dma_resv_add_excl_fence(struct dma_resv *obj, struct dma_fence *fence);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 10/11] drm/i915: also wait for shared dmabuf fences before flip
  2021-05-17 14:11 [RFC] Add DMA_RESV_USAGE flags Christian König
                   ` (8 preceding siblings ...)
  2021-05-17 14:11 ` [PATCH 09/11] dma-buf: add shared fence usage flags Christian König
@ 2021-05-17 14:11 ` Christian König
  2021-05-17 14:11 ` [PATCH 11/11] drm/amdgpu: fix shared access to exported DMA-bufs Christian König
  2021-05-17 15:04 ` [RFC] Add DMA_RESV_USAGE flags Daniel Vetter
  11 siblings, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-17 14:11 UTC (permalink / raw)
  To: dri-devel, linaro-mm-sig

Add dependencies for implicit sync to shared fences as well to i915.

This was discussed at length about four years ago, but since the workaround we
did in amdgpu is now causing more and more problems we need to fix this
properly.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/i915/display/intel_display.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c
index 9b9b538b0cb6..197b54fdefa2 100644
--- a/drivers/gpu/drm/i915/display/intel_display.c
+++ b/drivers/gpu/drm/i915/display/intel_display.c
@@ -10640,6 +10640,7 @@ intel_prepare_plane_fb(struct drm_plane *_plane,
 	i915_gem_object_flush_frontbuffer(obj, ORIGIN_DIRTYFB);
 
 	if (!new_plane_state->uapi.fence) { /* implicit fencing */
+		struct dma_resv_cursor cursor;
 		struct dma_fence *fence;
 
 		ret = i915_sw_fence_await_reservation(&state->commit_ready,
@@ -10656,6 +10657,21 @@ intel_prepare_plane_fb(struct drm_plane *_plane,
 						   fence);
 			dma_fence_put(fence);
 		}
+
+retry:
+		dma_resv_for_each_shared(obj->base.resv, &cursor, fence) {
+			if (!(cursor.usage & DMA_RESV_USAGE_WRITE))
+				continue;
+
+			if (!dma_fence_get_rcu(fence))
+				goto retry;
+
+			add_rps_boost_after_vblank(new_plane_state->hw.crtc,
+						   fence);
+			dma_fence_put(fence);
+		}
+
+
 	} else {
 		add_rps_boost_after_vblank(new_plane_state->hw.crtc,
 					   new_plane_state->uapi.fence);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 11/11] drm/amdgpu: fix shared access to exported DMA-bufs
  2021-05-17 14:11 [RFC] Add DMA_RESV_USAGE flags Christian König
                   ` (9 preceding siblings ...)
  2021-05-17 14:11 ` [PATCH 10/11] drm/i915: also wait for shared dmabuf fences before flip Christian König
@ 2021-05-17 14:11 ` Christian König
  2021-05-17 15:04 ` [RFC] Add DMA_RESV_USAGE flags Daniel Vetter
  11 siblings, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-17 14:11 UTC (permalink / raw)
  To: dri-devel, linaro-mm-sig

We are running into more and more problems with that approach since every
command submission to the buffer in question is now serialized.

Since the i915 driver, which we originally added that workaround for, is now
fixed, we should be able to remove this.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      | 10 ++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c |  6 ------
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c     |  5 -----
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |  1 -
 5 files changed, 3 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index b5c766998045..6b610a2df52f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -614,14 +614,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 	gws = p->bo_list->gws_obj;
 	oa = p->bo_list->oa_obj;
 
-	amdgpu_bo_list_for_each_entry(e, p->bo_list) {
-		struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
-
-		/* Make sure we use the exclusive slot for shared BOs */
-		if (bo->prime_shared_count)
-			e->tv.num_shared = 0;
-		e->bo_va = amdgpu_vm_bo_find(vm, bo);
-	}
+	amdgpu_bo_list_for_each_entry(e, p->bo_list)
+		e->bo_va = amdgpu_vm_bo_find(vm, ttm_to_amdgpu_bo(e->tv.bo));
 
 	if (gds) {
 		p->job->gds_base = amdgpu_bo_gpu_offset(gds) >> PAGE_SHIFT;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index 67cef80e25c8..76a2ac547698 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -167,7 +167,6 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf,
 	if (r)
 		goto out;
 
-	bo->prime_shared_count++;
 	amdgpu_bo_unreserve(bo);
 	return 0;
 
@@ -191,9 +190,6 @@ static void amdgpu_dma_buf_detach(struct dma_buf *dmabuf,
 	struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
 	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
 
-	if (attach->dev->driver != adev->dev->driver && bo->prime_shared_count)
-		bo->prime_shared_count--;
-
 	pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
 	pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
 }
@@ -446,8 +442,6 @@ amdgpu_dma_buf_create_obj(struct drm_device *dev, struct dma_buf *dma_buf)
 	bo = gem_to_amdgpu_bo(gobj);
 	bo->allowed_domains = AMDGPU_GEM_DOMAIN_GTT;
 	bo->preferred_domains = AMDGPU_GEM_DOMAIN_GTT;
-	if (dma_buf->ops != &amdgpu_dmabuf_ops)
-		bo->prime_shared_count = 1;
 
 	dma_resv_unlock(resv);
 	return gobj;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 94da44d97e7f..33eddea5d83d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -775,11 +775,6 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void *data,
 		break;
 	}
 	case AMDGPU_GEM_OP_SET_PLACEMENT:
-		if (robj->prime_shared_count && (args->value & AMDGPU_GEM_DOMAIN_VRAM)) {
-			r = -EINVAL;
-			amdgpu_bo_unreserve(robj);
-			break;
-		}
 		if (amdgpu_ttm_tt_get_usermm(robj->tbo.ttm)) {
 			r = -EPERM;
 			amdgpu_bo_unreserve(robj);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 1190781cefcb..158c9e94d42f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -906,7 +906,7 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
 		return -EINVAL;
 
 	/* A shared bo cannot be migrated to VRAM */
-	if (bo->prime_shared_count || bo->tbo.base.import_attach) {
+	if (bo->tbo.base.import_attach) {
 		if (domain & AMDGPU_GEM_DOMAIN_GTT)
 			domain = AMDGPU_GEM_DOMAIN_GTT;
 		else
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index b37d36ac6b5a..d7355396995d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -92,7 +92,6 @@ struct amdgpu_bo {
 	struct ttm_buffer_object	tbo;
 	struct ttm_bo_kmap_obj		kmap;
 	u64				flags;
-	unsigned			prime_shared_count;
 	/* per VM structure for page tables and with virtual addresses */
 	struct amdgpu_vm_bo_base	*vm_bo;
 	/* Constant after initialization */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-17 14:11 [RFC] Add DMA_RESV_USAGE flags Christian König
                   ` (10 preceding siblings ...)
  2021-05-17 14:11 ` [PATCH 11/11] drm/amdgpu: fix shared access to exported DMA-bufs Christian König
@ 2021-05-17 15:04 ` Daniel Vetter
  2021-05-17 19:38   ` Christian König
  11 siblings, 1 reply; 50+ messages in thread
From: Daniel Vetter @ 2021-05-17 15:04 UTC (permalink / raw)
  To: Christian König; +Cc: linaro-mm-sig, dri-devel

On Mon, May 17, 2021 at 04:11:18PM +0200, Christian König wrote:
> We had a long outstanding problem in amdgpu that buffers exported to
> user drivers by DMA-buf serialize all command submissions using them.
> 
> In other words we can't compose the buffer with different engines and
> then send it to another driver for display further processing.
> 
> This was added to work around the fact that i915 didn't wanted to wait
> for shared fences in the dma_resv objects before displaying a buffer.
> 
> Since this problem is now causing issues with Vulkan we need to find a
> better solution for that.
> 
> The patch set here tries to do this by adding an usage flag to the
> shared fences noting when and how they should participate in implicit
> synchronization.

So the way this is fixed in every other vulkan driver is that vulkan
userspace sets flags in the CS ioctl when it wants to synchronize with
implicit sync. This gets you mostly there. Last time I checked amdgpu
isn't doing this, and yes that's broken.

I915 and iirc msm has explicit flags for this, panfrost was designed to
support this correctly from the start (also with flags I think). That's at
least what I remember from all the discussions at XDC and #dri-devel, but
didn't check the code again to give you the list of uapi flags you need
for each driver.

The other piece is making sure you're only picking up implicit fences when
you should, and not any later ones, for which Jason has a solution:

https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/

If amdgpu isn't using those, then you will suffer from
over-synchronization in vulkan and pay a price. The entire point of vulkan
is that you pick up sync points very explicitly, and we also need to have
very explicit uapi for userspace to pick up/set the implicit fences.

Trying to paper over this with more implicit magic is imo just wrong, and
definitely not the long term explicit sync model we want.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-17 15:04 ` [RFC] Add DMA_RESV_USAGE flags Daniel Vetter
@ 2021-05-17 19:38   ` Christian König
  2021-05-17 20:15     ` Jason Ekstrand
  2021-05-17 20:15     ` Daniel Vetter
  0 siblings, 2 replies; 50+ messages in thread
From: Christian König @ 2021-05-17 19:38 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: linaro-mm-sig, dri-devel

Am 17.05.21 um 17:04 schrieb Daniel Vetter:
> On Mon, May 17, 2021 at 04:11:18PM +0200, Christian König wrote:
>> We had a long outstanding problem in amdgpu that buffers exported to
>> user drivers by DMA-buf serialize all command submissions using them.
>>
>> In other words we can't compose the buffer with different engines and
>> then send it to another driver for display further processing.
>>
>> This was added to work around the fact that i915 didn't wanted to wait
>> for shared fences in the dma_resv objects before displaying a buffer.
>>
>> Since this problem is now causing issues with Vulkan we need to find a
>> better solution for that.
>>
>> The patch set here tries to do this by adding an usage flag to the
>> shared fences noting when and how they should participate in implicit
>> synchronization.
> So the way this is fixed in every other vulkan driver is that vulkan
> userspace sets flags in the CS ioctl when it wants to synchronize with
> implicit sync. This gets you mostly there. Last time I checked amdgpu
> isn't doing this, and yes that's broken.

And exactly that is a really bad approach as far as I can see. The 
Vulkan stack on top simply doesn't know when to set this flag during CS.

That's also the reason the Valve guys came up with a solution where each 
BO gets a flag for explicit sync, but that only works for exports and 
not for imports.

> I915 and iirc msm has explicit flags for this, panfrost was designed to
> support this correctly from the start (also with flags I think). That's at
> least what I remember from all the discussions at XDC and #dri-devel, but
> didn't check the code again to give you the list of uapi flags you need
> for each driver.
>
> The other piece is making sure you're only picking up implicit fences when
> you should, and not any later ones, for which Jason has a solution:
>
> https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/

Yes, I helped with that as well. But I think that this is just another 
workaround without really addressing the underlying problem.

> If amdgpu isn't using those, then you will suffer from
> over-synchronization in vulkan and pay a price. The entire point of vulkan
> is that you pick up sync points very explicitly, and we also need to have
> very explicit uapi for userspace to pick up/set the implicit fences.
>
> Trying to paper over this with more implicit magic is imo just wrong, and
> definitely not the long term explicit sync model we want.

I completely disagree.

In my opinion the implicit sync model we have for dma_resv currently is 
just not well designed at all, since it always requires cooperation from 
userspace.

In other words you need to know when to enable implicit sync in 
userspace and that information is simply not present all of the time.

What we have done here is just keep the old reader/writer flags i915, 
radeon and nouveau once had and push that out to everybody else, making 
the assumption that everybody would follow that without documenting the 
actual rules of engagement you need to follow here.

That was a really big mistake and we should try to fix that sooner or 
later. The only other clean alternative I see is to use a flag on the 
exporter to tell the importer if it should sync to shared fences or not.

In addition to that, I'm perfectly fine with implicit sync. Explicit sync 
certainly has some use cases as well, but I don't see it as an absolute 
advantage over the implicit model.

Regards,
Christian.

> -Daniel


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-17 19:38   ` Christian König
@ 2021-05-17 20:15     ` Jason Ekstrand
  2021-05-17 20:15     ` Daniel Vetter
  1 sibling, 0 replies; 50+ messages in thread
From: Jason Ekstrand @ 2021-05-17 20:15 UTC (permalink / raw)
  To: Christian König; +Cc: linaro-mm-sig, Maling list - DRI developers

On Mon, May 17, 2021 at 2:38 PM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 17.05.21 um 17:04 schrieb Daniel Vetter:
> > On Mon, May 17, 2021 at 04:11:18PM +0200, Christian König wrote:
> >> We had a long outstanding problem in amdgpu that buffers exported to
> >> user drivers by DMA-buf serialize all command submissions using them.
> >>
> >> In other words we can't compose the buffer with different engines and
> >> then send it to another driver for display further processing.
> >>
> >> This was added to work around the fact that i915 didn't wanted to wait
> >> for shared fences in the dma_resv objects before displaying a buffer.
> >>
> >> Since this problem is now causing issues with Vulkan we need to find a
> >> better solution for that.
> >>
> >> The patch set here tries to do this by adding an usage flag to the
> >> shared fences noting when and how they should participate in implicit
> >> synchronization.
> > So the way this is fixed in every other vulkan driver is that vulkan
> > userspace sets flags in the CS ioctl when it wants to synchronize with
> > implicit sync. This gets you mostly there. Last time I checked amdgpu
> > isn't doing this, and yes that's broken.
>
> And exactly that is a really bad approach as far as I can see. The
> Vulkan stack on top simply doesn't know when to set this flag during CS.

Yes and no...  First off, I should preface this by saying that there
are no truly good solutions here that I'm aware of.  We've got one
thing in ANV that mostly works, RADV does something else that mostly
works.  Neither is great.  I think with
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4037 we get
most of the way to a half-decent solution.

What ANV does today is to set EXEC_OBJECT_ASYNC on all BOs almost all
the time to disable implicit sync.  There are two exceptions to this:

 1. When we go to kick a buffer off to the window system, we do a
(possibly dummy) submit which takes in whatever explicit VkSemaphore
objects the client provides and uses a mesa-internal vkQueueSubmit
extension to mark the window-system buffer as being written by that
submit.
 2a. In vkAcquireNextImage, when we hand the next back buffer to the
client, it also provides a VkSemaphore.  We stuff the image BO into
said semaphore and, whenever it's used as a wait dependency by the
client, we throw the BO in that submit's object list and flag it as
being read.

This works pretty well.  The biggest problem is that there is some
potential for over-sync thanks to 2a because, as soon as the client
throws in a render which depends on the acquire semaphore, that BO is
marked as used and it can end up over-syncing on itself.  This is
entirely solved by the patch Daniel mentioned below.  With the patch
daniel mentioned and MR4037 linked above, 2 gets replaced with

2b. In vkAcquireNextImage, we fetch a sync_file out of the dma-buf and
stuff it into the semaphore.

This totally solves the over-sync problem because the sync_file used
represents the exact point on the CPU timeline where X handed the
image back to us and any fences added after that point don't count.
Since the client can't do a vkQueueSubmit using that image until
vkAcquireNextImage returns, there's no way it will contain any of our
fences unless they were from the previous frame rendered to that
image.

> That's also the reason the Valve guys came up with a solution where each
> BO gets a flag for explicit sync, but that only works for exports and
> not for imports.

Yes, RADV uses a different mechanism.  They have a per-BO flag for "am
I owned" and they sync on all the external+owned stuff.  The owned
flag is flipped on and off when buffers are handed between the client
and the compositor.

Which is better?  I think the final solution I'm driving towards in
ANV is the best we can do at the moment.  I also like the fact that it
is pretty cross-driver.  It does still depend on a dummy submit but
I'm not sure it's worth trying to get around that for now.

> > I915 and iirc msm has explicit flags for this, panfrost was designed to
> > support this correctly from the start (also with flags I think). That's at
> > least what I remember from all the discussions at XDC and #dri-devel, but
> > didn't check the code again to give you the list of uapi flags you need
> > for each driver.
> >
> > The other piece is making sure you're only picking up implicit fences when
> > you should, and not any later ones, for which Jason has a solution:
> >
> > https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/
>
> Yes, I helped with that as well. But I think that this is just another
> workaround without really addressing the underlying problem.

I think that depends on how we're defining "the underlying problem."
I'm afraid I'm a bit unclear on that.

> > If amdgpu isn't using those, then you will suffer from
> > over-synchronization in vulkan and pay a price. The entire point of vulkan
> > is that you pick up sync points very explicitly, and we also need to have
> > very explicit uapi for userspace to pick up/set the implicit fences.
> >
> > Trying to paper over this with more implicit magic is imo just wrong, and
> > definitely not the long term explicit sync model we want.
>
> I completely disagree.
>
> In my opinion the implicit sync model we have for dma_resv currently is
> just not well designed at all, since it always requires cooperation from
> userspace.
>
> In other words you need to know when to enable implicit sync in
> userspace and that information is simply not present all of the time.

Then I don't see how this helps.  If userspace doesn't know when to
sync so it can set it at submit time, why would it know on
import/export?  If you think we don't know when to sync, we certainly
don't know when it's read vs. write.

Also, making this all happen at import/export time totally blows away
Vulkan's model.  We don't want to have to destroy the BO object to
stop syncing on it and re-import it every frame.  That's a
non-starter.

Or am I really badly missing something?

> What we have done here is just keeping the old reader/writer flags i915,
> radeon and nouveau once had and pushed that out to everybody else making
> the assumption that everybody would follow that without documenting the
> actual rules of engagement you need to follow here.
>
> That was a really big mistake and we should try to fix that sooner or
> later. The only other clean alternative I see is to use a flag on the
> exporter to tell the importer if it should sync to shared fences or not.
>
> Additional to that I'm perfectly fine with implicit sync. Explicit sync
> certainly has some use cases as well, but I don't see it as an absolute
> advantage over the implicit model.

I'm not going to make any broad sweeping arguments about the future
here.  Implicit sync may be here to stay and I think that discussion
is sort-of beside the point anyway.

--Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-17 19:38   ` Christian König
  2021-05-17 20:15     ` Jason Ekstrand
@ 2021-05-17 20:15     ` Daniel Vetter
  2021-05-17 22:49       ` Jason Ekstrand
  1 sibling, 1 reply; 50+ messages in thread
From: Daniel Vetter @ 2021-05-17 20:15 UTC (permalink / raw)
  To: Christian König, Jason Ekstrand
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, dri-devel

On Mon, May 17, 2021 at 9:38 PM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 17.05.21 um 17:04 schrieb Daniel Vetter:
> > On Mon, May 17, 2021 at 04:11:18PM +0200, Christian König wrote:
> >> We had a long outstanding problem in amdgpu that buffers exported to
> >> user drivers by DMA-buf serialize all command submissions using them.
> >>
> >> In other words we can't compose the buffer with different engines and
> >> then send it to another driver for display further processing.
> >>
> >> This was added to work around the fact that i915 didn't wanted to wait
> >> for shared fences in the dma_resv objects before displaying a buffer.
> >>
> >> Since this problem is now causing issues with Vulkan we need to find a
> >> better solution for that.
> >>
> >> The patch set here tries to do this by adding an usage flag to the
> >> shared fences noting when and how they should participate in implicit
> >> synchronization.
> > So the way this is fixed in every other vulkan driver is that vulkan
> > userspace sets flags in the CS ioctl when it wants to synchronize with
> > implicit sync. This gets you mostly there. Last time I checked amdgpu
> > isn't doing this, and yes that's broken.
>
> And exactly that is a really bad approach as far as I can see. The
> Vulkan stack on top simply doesn't know when to set this flag during CS.

Adding Jason for the Vulkan side of things, because this isn't how I
understand this works.

But purely form a kernel pov your patches are sketchy for two reasons:

- we reinstate the amdgpu special case of not setting exclusive fences

- you only fix the single special case of i915 display, nothing else

That's not how a cross driver interface works. And if you'd do this
properly, you'd be back to all the same sync fun you've orignally had,
with all the same fallout.

> That's also the reason the Valve guys came up with a solution where each
> BO gets a flag for explicit sync, but that only works for exports and
> not for imports.
>
> > I915 and iirc msm has explicit flags for this, panfrost was designed to
> > support this correctly from the start (also with flags I think). That's at
> > least what I remember from all the discussions at XDC and #dri-devel, but
> > didn't check the code again to give you the list of uapi flags you need
> > for each driver.
> >
> > The other piece is making sure you're only picking up implicit fences when
> > you should, and not any later ones, for which Jason has a solution:
> >
> > https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/
>
> Yes, I helped with that as well. But I think that this is just another
> workaround without really addressing the underlying problem.
>
> > If amdgpu isn't using those, then you will suffer from
> > over-synchronization in vulkan and pay a price. The entire point of vulkan
> > is that you pick up sync points very explicitly, and we also need to have
> > very explicit uapi for userspace to pick up/set the implicit fences.
> >
> > Trying to paper over this with more implicit magic is imo just wrong, and
> > definitely not the long term explicit sync model we want.
>
> I completely disagree.
>
> In my opinion the implicit sync model we have for dma_resv currently is
> just not well designed at all, since it always requires cooperation from
> userspace.
>
> In other words you need to know when to enable implicit sync in
> userspace and that information is simply not present all of the time.
>
> What we have done here is just keeping the old reader/writer flags i915,
> radeon and nouveau once had and pushed that out to everybody else making
> the assumption that everybody would follow that without documenting the
> actual rules of engagement you need to follow here.
>
> That was a really big mistake and we should try to fix that sooner or
> later. The only other clean alternative I see is to use a flag on the
> exporter to tell the importer if it should sync to shared fences or not.
>
> Additional to that I'm perfectly fine with implicit sync. Explicit sync
> certainly has some use cases as well, but I don't see it as an absolute
> advantage over the implicit model.

Ok this stops making sense. Somehow you claim userspace doesn't know
when to sync, but somehow the kernel does? By guessing, and getting it
wrong mostly, except for the one case that you benchmarked?

Aside from silly userspace which exports a buffer to a dma-buf, but
then never imports it anywhere else, there isn't a case I know of
where the kernel actually knows more than userspace. But there's lots
of cases where the kernel definitely knows less, especially if
userspace doesn't tell it about what's going on with each rendering
and buffer.

So here's the 2 things you need to make this work like every other driver:

1. A way to set the explicit fence on a buffer. CS ioctl is perfectly
fine, but also can be seperate. Userspace uses this only on a) shared
buffers b) when there's a flush/swap on that shared buffer. Not when
rendering any of the interim stuff, that only leads to oversync.
Anything non-shared is handled explicitly in userspace (at least for
modern-ish drivers). This is the only thing that ever sets an
exclusive fence (aside from ttm moving buffers around ofc).

2. A way to sync with the implicit fences, either all of them (for
upcoming write access) or just the write fence (for read access). At
first we thought it's good enough to do this in the CS ioctl, but
that's a wee bit too late, hence the patches from Jason. My
understanding is that vulkan converts this into an vk syncobj/fence of
some sorts, so really can't make this more explicit and intentional
than that.

None of this is something the kernel has the slightest idea about when
it happens, so you have to have explicit uapi for it. Trying to fake
it in the kernel just doesn't work.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 09/11] dma-buf: add shared fence usage flags
  2021-05-17 14:11 ` [PATCH 09/11] dma-buf: add shared fence usage flags Christian König
@ 2021-05-17 20:36   ` Daniel Vetter
  2021-05-18 12:54     ` Christian König
  0 siblings, 1 reply; 50+ messages in thread
From: Daniel Vetter @ 2021-05-17 20:36 UTC (permalink / raw)
  To: Christian König; +Cc: linaro-mm-sig, dri-devel

On Mon, May 17, 2021 at 04:11:27PM +0200, Christian König wrote:
> Add usage flags for shared fences and improve the documentation.
> 
> This allows driver to better specify what shared fences
> are doing with the resource.
> 
> Signed-off-by: Christian König <christian.koenig@amd.com>

> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 16b869d9b1d6..c9bbc4630afc 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -700,7 +700,7 @@ static int ttm_bo_add_move_fence(struct ttm_buffer_object *bo,
>  		return ret;
>  	}
>  
> -	dma_resv_add_shared_fence(bo->base.resv, fence);
> +	dma_resv_add_shared(bo->base.resv, fence, DMA_RESV_USAGE_RW);

Entirely aside, but I ended up scratching my head a lot about why exactly
this is a shared fence here, and why that's ok, since just looking at this
it seems like waiting for the memory allocation to actually be owned by
this driver is optional.

Is this ok because the next thing we'll do is a move, which will then set
the exclusive fence here. Which will then wait on the shared one here, so
it doesn't matter? Or well, allows us to pipeline the eviction of ttm_man
against whatever might be currently keeping the bo busy in it's current
place?

Might be good candidate to explain this in a comment or something like
that.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-17 20:15     ` Daniel Vetter
@ 2021-05-17 22:49       ` Jason Ekstrand
  2021-05-18  5:59         ` Daniel Vetter
  0 siblings, 1 reply; 50+ messages in thread
From: Jason Ekstrand @ 2021-05-17 22:49 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, dri-devel

On Mon, May 17, 2021 at 3:15 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Mon, May 17, 2021 at 9:38 PM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
> >
> > Am 17.05.21 um 17:04 schrieb Daniel Vetter:
> > > On Mon, May 17, 2021 at 04:11:18PM +0200, Christian König wrote:
> > >> We had a long outstanding problem in amdgpu that buffers exported to
> > >> user drivers by DMA-buf serialize all command submissions using them.
> > >>
> > >> In other words we can't compose the buffer with different engines and
> > >> then send it to another driver for display further processing.
> > >>
> > >> This was added to work around the fact that i915 didn't wanted to wait
> > >> for shared fences in the dma_resv objects before displaying a buffer.
> > >>
> > >> Since this problem is now causing issues with Vulkan we need to find a
> > >> better solution for that.
> > >>
> > >> The patch set here tries to do this by adding an usage flag to the
> > >> shared fences noting when and how they should participate in implicit
> > >> synchronization.
> > > So the way this is fixed in every other vulkan driver is that vulkan
> > > userspace sets flags in the CS ioctl when it wants to synchronize with
> > > implicit sync. This gets you mostly there. Last time I checked amdgpu
> > > isn't doing this, and yes that's broken.
> >
> > And exactly that is a really bad approach as far as I can see. The
> > Vulkan stack on top simply doesn't know when to set this flag during CS.
>
> Adding Jason for the Vulkan side of things, because this isn't how I
> understand this works.
>
> But purely form a kernel pov your patches are sketchy for two reasons:
>
> - we reinstate the amdgpu special case of not setting exclusive fences
>
> - you only fix the single special case of i915 display, nothing else
>
> That's not how a cross driver interface works. And if you'd do this
> properly, you'd be back to all the same sync fun you've orignally had,
> with all the same fallout.

I think I'm starting to see what Christian is trying to do here and I
think there likely is a real genuine problem here.  I'm not convinced
this is 100% of a solution but there might be something real.  Let me
see if I can convince you or if I just make a hash of things. :-)

The problem, once again, comes down to memory fencing vs. execution
fencing and the way that we've unfortunately tied them together in the
kernel.  With the current architecture, the only way to get proper
write-fence semantics for implicit sync is to take an exclusive fence
on the buffer.  This implies two things:

 1. You have to implicitly wait on EVERY fence on the buffer before
you can start your write-fenced operation

 2. No one else can start ANY operation which accesses that buffer
until you're done.

Let's say that you have a buffer which is shared between two drivers A
and B and let's say driver A has thrown a fence on it just to ensure
that the BO doesn't get swapped out to disk until it's at a good
stopping point.  Then driver B comes along and wants to throw a
write-fence on it.  Suddenly, your memory fence from driver A causes
driver B to have to stall waiting for a "good" time to throw in a
fence.  It sounds like this is the sort of scenario that Christian is
running into.  And, yes, with certain Vulkan drivers being a bit
sloppy about exactly when they throw in write fences, I could see it
being a real problem.

The solution I *think* Christian is proposing is basically to have
four categories of fences instead of two: exclusive, weak (shared with
no r/w), read, and write.  (No, I didn't include r/w but that's the
same as write-only when it comes to hazards.)  Then a bunch of flags
and helpers to be able to handle the interactions between the three
types of shared fences.  Honestly, this is something I've considered
as I've wrestled with these problems in the past.  That said....

 1. In GL, we can make the read/write information accurate and never
over/under sync.

 2. In the future ANV model I described earlier, this isn't a problem.
It throws in a write-fence exactly once per frame.  It actually
under-synchronizes but in a safe way.  I think that mostly makes the
problem go away in practice.

 3. If the core issue here really is memory vs. execution sync as I've
said, maybe we really are papering over something by continuing to mix
them.  Do we really want four fence types or do we want two orthogonal
fence types?

I think I've convinced myself that the problem is real, but not that
this solution is correct.

--Jason


> > That's also the reason the Valve guys came up with a solution where each
> > BO gets a flag for explicit sync, but that only works for exports and
> > not for imports.
> >
> > > I915 and iirc msm has explicit flags for this, panfrost was designed to
> > > support this correctly from the start (also with flags I think). That's at
> > > least what I remember from all the discussions at XDC and #dri-devel, but
> > > didn't check the code again to give you the list of uapi flags you need
> > > for each driver.
> > >
> > > The other piece is making sure you're only picking up implicit fences when
> > > you should, and not any later ones, for which Jason has a solution:
> > >
> > > https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/
> >
> > Yes, I helped with that as well. But I think that this is just another
> > workaround without really addressing the underlying problem.
> >
> > > If amdgpu isn't using those, then you will suffer from
> > > over-synchronization in vulkan and pay a price. The entire point of vulkan
> > > is that you pick up sync points very explicitly, and we also need to have
> > > very explicit uapi for userspace to pick up/set the implicit fences.
> > >
> > > Trying to paper over this with more implicit magic is imo just wrong, and
> > > definitely not the long term explicit sync model we want.
> >
> > I completely disagree.
> >
> > In my opinion the implicit sync model we have for dma_resv currently is
> > just not well designed at all, since it always requires cooperation from
> > userspace.
> >
> > In other words you need to know when to enable implicit sync in
> > userspace and that information is simply not present all of the time.
> >
> > What we have done here is just keeping the old reader/writer flags i915,
> > radeon and nouveau once had and pushed that out to everybody else making
> > the assumption that everybody would follow that without documenting the
> > actual rules of engagement you need to follow here.
> >
> > That was a really big mistake and we should try to fix that sooner or
> > later. The only other clean alternative I see is to use a flag on the
> > exporter to tell the importer if it should sync to shared fences or not.
> >
> > Additional to that I'm perfectly fine with implicit sync. Explicit sync
> > certainly has some use cases as well, but I don't see it as an absolute
> > advantage over the implicit model.
>
> Ok this stops making sense. Somehow you claim userspace doesn't know
> when to sync, but somehow the kernel does? By guessing, and getting it
> wrong mostly, except for the one case that you benchmarked?
>
> Aside from silly userspace which exports a buffer to a dma-buf, but
> then never imports it anywhere else, there isn't a case I know of
> where the kernel actually knows more than userspace. But there's lots
> of cases where the kernel definitely knows less, especially if
> userspace doesn't tell it about what's going on with each rendering
> and buffer.
>
> So here's the 2 things you need to make this work like every other driver:
>
> 1. A way to set the explicit fence on a buffer. CS ioctl is perfectly
> fine, but also can be seperate. Userspace uses this only on a) shared
> buffers b) when there's a flush/swap on that shared buffer. Not when
> rendering any of the interim stuff, that only leads to oversync.
> Anything non-shared is handled explicitly in userspace (at least for
> modern-ish drivers). This is the only thing that ever sets an
> exclusive fence (aside from ttm moving buffers around ofc).
>
> 2. A way to sync with the implicit fences, either all of them (for
> upcoming write access) or just the write fence (for read access). At
> first we thought it's good enough to do this in the CS ioctl, but
> that's a wee bit too late, hence the patches from Jason. My
> understanding is that vulkan converts this into an vk syncobj/fence of
> some sorts, so really can't make this more explicit and intentional
> than that.
>
> None of this is something the kernel has the slightest idea about when
> it happens, so you have to have explicit uapi for it. Trying to fake
> it in the kernel just doesn't work.
> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-17 22:49       ` Jason Ekstrand
@ 2021-05-18  5:59         ` Daniel Vetter
  2021-05-18 10:29           ` Daniel Vetter
  2021-05-18 12:49           ` Christian König
  0 siblings, 2 replies; 50+ messages in thread
From: Daniel Vetter @ 2021-05-18  5:59 UTC (permalink / raw)
  To: Jason Ekstrand
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, dri-devel

On Tue, May 18, 2021 at 12:49 AM Jason Ekstrand <jason@jlekstrand.net> wrote:
>
> On Mon, May 17, 2021 at 3:15 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > On Mon, May 17, 2021 at 9:38 PM Christian König
> > <ckoenig.leichtzumerken@gmail.com> wrote:
> > >
> > > Am 17.05.21 um 17:04 schrieb Daniel Vetter:
> > > > On Mon, May 17, 2021 at 04:11:18PM +0200, Christian König wrote:
> > > >> We had a long outstanding problem in amdgpu that buffers exported to
> > > >> user drivers by DMA-buf serialize all command submissions using them.
> > > >>
> > > >> In other words we can't compose the buffer with different engines and
> > > >> then send it to another driver for display or further processing.
> > > >>
> > > >> This was added to work around the fact that i915 didn't want to wait
> > > >> for shared fences in the dma_resv objects before displaying a buffer.
> > > >>
> > > >> Since this problem is now causing issues with Vulkan we need to find a
> > > >> better solution for that.
> > > >>
> > > >> The patch set here tries to do this by adding a usage flag to the
> > > >> shared fences noting when and how they should participate in implicit
> > > >> synchronization.
> > > > So the way this is fixed in every other vulkan driver is that vulkan
> > > > userspace sets flags in the CS ioctl when it wants to synchronize with
> > > > implicit sync. This gets you mostly there. Last time I checked amdgpu
> > > > isn't doing this, and yes that's broken.
> > >
> > > And exactly that is a really bad approach as far as I can see. The
> > > Vulkan stack on top simply doesn't know when to set this flag during CS.
> >
> > Adding Jason for the Vulkan side of things, because this isn't how I
> > understand this works.
> >
> > But purely from a kernel pov your patches are sketchy for two reasons:
> >
> > - we reinstate the amdgpu special case of not setting exclusive fences
> >
> > - you only fix the single special case of i915 display, nothing else
> >
> > That's not how a cross driver interface works. And if you'd do this
> > properly, you'd be back to all the same sync fun you've originally had,
> > with all the same fallout.
>
> I think I'm starting to see what Christian is trying to do here and I
> think there likely is a real genuine problem here.  I'm not convinced
> this is 100% of a solution but there might be something real.  Let me
> see if I can convince you or if I just make a hash of things. :-)
>
> The problem, once again, comes down to memory fencing vs. execution
> fencing and the way that we've unfortunately tied them together in the
> kernel.  With the current architecture, the only way to get proper
> write-fence semantics for implicit sync is to take an exclusive fence
> on the buffer.  This implies two things:
>
>  1. You have to implicitly wait on EVERY fence on the buffer before
> you can start your write-fenced operation
>
>  2. No one else can start ANY operation which accesses that buffer
> until you're done.
>
> Let's say that you have a buffer which is shared between two drivers A
> and B and let's say driver A has thrown a fence on it just to ensure
> that the BO doesn't get swapped out to disk until it's at a good
> stopping point.  Then driver B comes along and wants to throw a
> write-fence on it.  Suddenly, your memory fence from driver A causes
> driver B to have to stall waiting for a "good" time to throw in a
> fence.  It sounds like this is the sort of scenario that Christian is
> running into.  And, yes, with certain Vulkan drivers being a bit
> sloppy about exactly when they throw in write fences, I could see it
> being a real problem.

Yes this is a potential problem, and on the i915 side we need to do
some shuffling here most likely. Especially due to discrete, but the
problem is pre-existing. tbh I forgot about the implications here
until I pondered this again yesterday evening.

But afaiui the amdgpu code and winsys in mesa, this isn't (yet) the
problem amd vk drivers have. The issue is that with amdgpu, all you
supply are the following bits at CS time:
- list of always mapped private buffers, which is implicit and O(1) in
the kernel fastpath
- additional list of shared buffers that are used by the current CS

I didn't check how exactly that works wrt winsys buffer ownership, but
the thing is that on the kernel side _any_ buffer in there is treated
as an implicit sync'ed write. Which means if you render your winsys
with a bunch of command submission split over 3d and compute pipes,
you end up with horrendous amounts of oversync.

The reason for this is that amdgpu decided to go with a different
implicit sync model than everyone else:
- within a drm file everything is unsynced and left to userspace to
handle, amdgpu.ko only ever sets the shared fence slots.
- this means the exclusive slot really is exclusive to memory management
issues, which side-steps the issue you point out above
- for anything cross-device they unconditionally wait for any shared
fence which is by another process

Works, except it's incompatible with what everyone else is doing, so
had to be papered over by the current massive oversync solution.

First step in fixing that is (and frankly was since years) to fix the
amdgpu CS so winsys can pass along a bunch of flags about which CS
should actually set the exclusive fence, so that you stop oversyncing
so badly. Ofc old userspace needs to keep oversyncing forever, no way
to fix that.

Instead what Christian patch set here does is move amdgpu back to the
dma_resv contract it prefers, break everything else and then fix up
i915 atomic path so that the one use case that originally highlighted
the mismatch here works again. Which hrm .... no :-)

I think the reason this wasn't ever a pressing issue is that amdgpu.ko
only does this for buffers shared across devices, so in most cases you
don't suffer from the terrible oversync. Conceptually it's still all
there.

> The solution I *think* Christian is proposing is basically to have
> four categories of fences instead of two: exclusive, weak (shared with
> no r/w), read, and write.  (No, I didn't include r/w but that's the
> same as write-only when it comes to hazards.)  Then a bunch of flags
> and helpers to be able to handle the interactions between the three
> types of shared fences.  Honestly, this is something I've considered
> as I've wrestled with these problems in the past.  That said....
>
>  1. In GL, we can make the read/write information accurate and never
> over/under sync.
>
>  2. In the future ANV model I described earlier, this isn't a problem.
> It throws in a write-fence exactly once per frame.  It actually
> under-synchronizes but in a safe way.  I think that mostly makes the
> problem go away in practice.
>
>  3. If the core issue here really is memory vs. execution sync as I've
> said, maybe we really are papering over something by continuing to mix
> them.  Do we really want four fence types or do we want two orthogonal
> fence types?

Now once amdgpu.ko is fixed, we still have the problem of mixing up
the exclusive fence for implicit sync with the exclusive fence for
memory management. And for that we can and probably should figure out
what to do there. But that still requires that amdgpu CS first learns
what's actually going on from userspace, and secondly, that we do this
addition in a way which is compatible with current dma_resv users
(i.e. all drivers currently asking for an exclusive fence need to pick
up both types of exclusive fences if we decide to split them).

> I think I've convinced myself that the problem is real, but not that
> this solution is correct.

Yeah there's definitely some problems here, but Christian hasn't
really explained which one he's trying to solve, so we're also running
a bit in a circle trying to guess what's what :-/

Cheers, Daniel

>
> --Jason
>
>
> > > That's also the reason the Valve guys came up with a solution where each
> > > BO gets a flag for explicit sync, but that only works for exports and
> > > not for imports.
> > >
> > > > I915 and iirc msm has explicit flags for this, panfrost was designed to
> > > > support this correctly from the start (also with flags I think). That's at
> > > > least what I remember from all the discussions at XDC and #dri-devel, but
> > > > didn't check the code again to give you the list of uapi flags you need
> > > > for each driver.
> > > >
> > > > The other piece is making sure you're only picking up implicit fences when
> > > > you should, and not any later ones, for which Jason has a solution:
> > > >
> > > > https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/
> > >
> > > Yes, I helped with that as well. But I think that this is just another
> > > workaround without really addressing the underlying problem.
> > >
> > > > If amdgpu isn't using those, then you will suffer from
> > > > over-synchronization in vulkan and pay a price. The entire point of vulkan
> > > > is that you pick up sync points very explicitly, and we also need to have
> > > > very explicit uapi for userspace to pick up/set the implicit fences.
> > > >
> > > > Trying to paper over this with more implicit magic is imo just wrong, and
> > > > definitely not the long term explicit sync model we want.
> > >
> > > I completely disagree.
> > >
> > > In my opinion the implicit sync model we have for dma_resv currently is
> > > just not well designed at all, since it always requires cooperation from
> > > userspace.
> > >
> > > In other words you need to know when to enable implicit sync in
> > > userspace and that information is simply not present all of the time.
> > >
> > > What we have done here is just keeping the old reader/writer flags i915,
> > > radeon and nouveau once had and pushed that out to everybody else making
> > > the assumption that everybody would follow that without documenting the
> > > actual rules of engagement you need to follow here.
> > >
> > > That was a really big mistake and we should try to fix that sooner or
> > > later. The only other clean alternative I see is to use a flag on the
> > > exporter to tell the importer if it should sync to shared fences or not.
> > >
> > > Additional to that I'm perfectly fine with implicit sync. Explicit sync
> > > certainly has some use cases as well, but I don't see it as an absolute
> > > advantage over the implicit model.
> >
> > Ok this stops making sense. Somehow you claim userspace doesn't know
> > when to sync, but somehow the kernel does? By guessing, and getting it
> > wrong mostly, except for the one case that you benchmarked?
> >
> > Aside from silly userspace which exports a buffer to a dma-buf, but
> > then never imports it anywhere else, there isn't a case I know of
> > where the kernel actually knows more than userspace. But there's lots
> > of cases where the kernel definitely knows less, especially if
> > userspace doesn't tell it about what's going on with each rendering
> > and buffer.
> >
> > So here's the 2 things you need to make this work like every other driver:
> >
> > 1. A way to set the explicit fence on a buffer. CS ioctl is perfectly
> > fine, but also can be seperate. Userspace uses this only on a) shared
> > buffers b) when there's a flush/swap on that shared buffer. Not when
> > rendering any of the interim stuff, that only leads to oversync.
> > Anything non-shared is handled explicitly in userspace (at least for
> > modern-ish drivers). This is the only thing that ever sets an
> > exclusive fence (aside from ttm moving buffers around ofc).
> >
> > 2. A way to sync with the implicit fences, either all of them (for
> > upcoming write access) or just the write fence (for read access). At
> > first we thought it's good enough to do this in the CS ioctl, but
> > that's a wee bit too late, hence the patches from Jason. My
> > understanding is that vulkan converts this into an vk syncobj/fence of
> > some sorts, so really can't make this more explicit and intentional
> > than that.
> >
> > None of this is something the kernel has the slightest idea about when
> > it happens, so you have to have explicit uapi for it. Trying to fake
> > it in the kernel just doesn't work.
> > -Daniel
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch



--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-18  5:59         ` Daniel Vetter
@ 2021-05-18 10:29           ` Daniel Vetter
  2021-05-18 12:49           ` Christian König
  1 sibling, 0 replies; 50+ messages in thread
From: Daniel Vetter @ 2021-05-18 10:29 UTC (permalink / raw)
  To: Jason Ekstrand, Thomas Hellström (VMware)
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, dri-devel

On Tue, May 18, 2021 at 7:59 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Tue, May 18, 2021 at 12:49 AM Jason Ekstrand <jason@jlekstrand.net> wrote:
> >
> > On Mon, May 17, 2021 at 3:15 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >
> > > On Mon, May 17, 2021 at 9:38 PM Christian König
> > > <ckoenig.leichtzumerken@gmail.com> wrote:
> > > >
> > > > Am 17.05.21 um 17:04 schrieb Daniel Vetter:
> > > > > On Mon, May 17, 2021 at 04:11:18PM +0200, Christian König wrote:
> > > > >> We had a long outstanding problem in amdgpu that buffers exported to
> > > > >> user drivers by DMA-buf serialize all command submissions using them.
> > > > >>
> > > > >> In other words we can't compose the buffer with different engines and
> > > > >> then send it to another driver for display further processing.
> > > > >>
> > > > >> This was added to work around the fact that i915 didn't wanted to wait
> > > > >> for shared fences in the dma_resv objects before displaying a buffer.
> > > > >>
> > > > >> Since this problem is now causing issues with Vulkan we need to find a
> > > > >> better solution for that.
> > > > >>
> > > > >> The patch set here tries to do this by adding an usage flag to the
> > > > >> shared fences noting when and how they should participate in implicit
> > > > >> synchronization.
> > > > > So the way this is fixed in every other vulkan driver is that vulkan
> > > > > userspace sets flags in the CS ioctl when it wants to synchronize with
> > > > > implicit sync. This gets you mostly there. Last time I checked amdgpu
> > > > > isn't doing this, and yes that's broken.
> > > >
> > > > And exactly that is a really bad approach as far as I can see. The
> > > > Vulkan stack on top simply doesn't know when to set this flag during CS.
> > >
> > > Adding Jason for the Vulkan side of things, because this isn't how I
> > > understand this works.
> > >
> > > But purely from a kernel pov your patches are sketchy for two reasons:
> > >
> > > - we reinstate the amdgpu special case of not setting exclusive fences
> > >
> > > - you only fix the single special case of i915 display, nothing else
> > >
> > > That's not how a cross driver interface works. And if you'd do this
> > > properly, you'd be back to all the same sync fun you've originally had,
> > > with all the same fallout.
> >
> > I think I'm starting to see what Christian is trying to do here and I
> > think there likely is a real genuine problem here.  I'm not convinced
> > this is 100% of a solution but there might be something real.  Let me
> > see if I can convince you or if I just make a hash of things. :-)
> >
> > The problem, once again, comes down to memory fencing vs. execution
> > fencing and the way that we've unfortunately tied them together in the
> > kernel.  With the current architecture, the only way to get proper
> > write-fence semantics for implicit sync is to take an exclusive fence
> > on the buffer.  This implies two things:
> >
> >  1. You have to implicitly wait on EVERY fence on the buffer before
> > you can start your write-fenced operation
> >
> >  2. No one else can start ANY operation which accesses that buffer
> > until you're done.
> >
> > Let's say that you have a buffer which is shared between two drivers A
> > and B and let's say driver A has thrown a fence on it just to ensure
> > that the BO doesn't get swapped out to disk until it's at a good
> > stopping point.  Then driver B comes along and wants to throw a
> > write-fence on it.  Suddenly, your memory fence from driver A causes
> > driver B to have to stall waiting for a "good" time to throw in a
> > fence.  It sounds like this is the sort of scenario that Christian is
> > running into.  And, yes, with certain Vulkan drivers being a bit
> > sloppy about exactly when they throw in write fences, I could see it
> > being a real problem.
>
> Yes this is a potential problem, and on the i915 side we need to do
> some shuffling here most likely. Especially due to discrete, but the
> problem is pre-existing. tbh I forgot about the implications here
> until I pondered this again yesterday evening.
>
> But afaiui the amdgpu code and winsys in mesa, this isn't (yet) the
> problem amd vk drivers have. The issue is that with amdgpu, all you
> supply are the following bits at CS time:
> - list of always mapped private buffers, which is implicit and O(1) in
> the kernel fastpath
> - additional list of shared buffers that are used by the current CS
>
> I didn't check how exactly that works wrt winsys buffer ownership, but
> the thing is that on the kernel side _any_ buffer in there is treated
> as an implicit sync'ed write. Which means if you render your winsys
> with a bunch of command submission split over 3d and compute pipes,
> you end up with horrendous amounts of oversync.
>
> The reason for this is that amdgpu decided to go with a different
> implicit sync model than everyone else:
> - within a drm file everything is unsynced and left to userspace to
> handle, amdgpu.ko only ever sets the shared fence slots.
> - this means the exclusive slot really is exclusive to memory management
> issues, which side-steps the issue you point out above
> - for anything cross-device they unconditionally wait for any shared
> fence which is by another process
>
> Works, except it's incompatible with what everyone else is doing, so
> had to be papered over by the current massive oversync solution.
>
> First step in fixing that is (and frankly was since years) to fix the
> amdgpu CS so winsys can pass along a bunch of flags about which CS
> should actually set the exclusive fence, so that you stop oversyncing
> so badly. Ofc old userspace needs to keep oversyncing forever, no way
> to fix that.
>
> Instead what Christian patch set here does is move amdgpu back to the
> dma_resv contract it prefers, break everything else and then fix up
> i915 atomic path so that the one use case that originally highlighted
> the mismatch here works again. Which hrm .... no :-)
>
> I think the reason this wasn't ever a pressing issue is that amdgpu.ko
> only does this for buffers shared across devices, so in most cases you
> don't suffer from the terrible oversync. Conceptually it's still all
> there.
>
> > The solution I *think* Christian is proposing is basically to have
> > four categories of fences instead of two: exclusive, weak (shared with
> > no r/w), read, and write.  (No, I didn't include r/w but that's the
> > same as write-only when it comes to hazards.)  Then a bunch of flags
> > and helpers to be able to handle the interactions between the three
> > types of shared fences.  Honestly, this is something I've considered
> > as I've wrestled with these problems in the past.  That said....
> >
> >  1. In GL, we can make the read/write information accurate and never
> > over/under sync.
> >
> >  2. In the future ANV model I described earlier, this isn't a problem.
> > It throws in a write-fence exactly once per frame.  It actually
> > under-synchronizes but in a safe way.  I think that mostly makes the
> > problem go away in practice.
> >
> >  3. If the core issue here really is memory vs. execution sync as I've
> > said, maybe we really are papering over something by continuing to mix
> > them.  Do we really want four fence types or do we want two orthogonal
> > fence types?
>
> Now once amdgpu.ko is fixed, we still have the problem of mixing up
> the exclusive fence for implicit sync with the exclusive fence for
> memory management. And for that we can and probably should figure out
> what to do there. But that still requires that amdgpu CS first learns
> what's actually going on from userspace, and secondly, that we do this
> addition in a way which is compatible with current dma_resv users
> (i.e. all drivers currently asking for an exclusive fence need to pick
> up both types of exclusive fences if we decide to split them).

Thomas Hellstrom reminded me that ttm_bo->moving is a thing. So we
already have that separate "absolutely can't ignore it" fence slot for
kernel memory management tasks. So we're actually good here.

The only issue is that ttm_bo->moving isn't part of the dma_resv
struct, so for p2p dma-buf we might still have a problem here ...
-Daniel

>
> > I think I've convinced myself that the problem is real, but not that
> > this solution is correct.
>
> Yeah there's definitely some problems here, but Christian hasn't
> really explained which one he's trying to solve, so we're also running
> a bit in a circle trying to guess what's what :-/
>
> Cheers, Daniel
>
> >
> > --Jason
> >
> >
> > > > That's also the reason the Valve guys came up with a solution where each
> > > > BO gets a flag for explicit sync, but that only works for exports and
> > > > not for imports.
> > > >
> > > > > I915 and iirc msm has explicit flags for this, panfrost was designed to
> > > > > support this correctly from the start (also with flags I think). That's at
> > > > > least what I remember from all the discussions at XDC and #dri-devel, but
> > > > > didn't check the code again to give you the list of uapi flags you need
> > > > > for each driver.
> > > > >
> > > > > The other piece is making sure you're only picking up implicit fences when
> > > > > you should, and not any later ones, for which Jason has a solution:
> > > > >
> > > > > https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/
> > > >
> > > > Yes, I helped with that as well. But I think that this is just another
> > > > workaround without really addressing the underlying problem.
> > > >
> > > > > If amdgpu isn't using those, then you will suffer from
> > > > > over-synchronization in vulkan and pay a price. The entire point of vulkan
> > > > > is that you pick up sync points very explicitly, and we also need to have
> > > > > very explicit uapi for userspace to pick up/set the implicit fences.
> > > > >
> > > > > Trying to paper over this with more implicit magic is imo just wrong, and
> > > > > definitely not the long term explicit sync model we want.
> > > >
> > > > I completely disagree.
> > > >
> > > > In my opinion the implicit sync model we have for dma_resv currently is
> > > > just not well designed at all, since it always requires cooperation from
> > > > userspace.
> > > >
> > > > In other words you need to know when to enable implicit sync in
> > > > userspace and that information is simply not present all of the time.
> > > >
> > > > What we have done here is just keeping the old reader/writer flags i915,
> > > > radeon and nouveau once had and pushed that out to everybody else making
> > > > the assumption that everybody would follow that without documenting the
> > > > actual rules of engagement you need to follow here.
> > > >
> > > > That was a really big mistake and we should try to fix that sooner or
> > > > later. The only other clean alternative I see is to use a flag on the
> > > > exporter to tell the importer if it should sync to shared fences or not.
> > > >
> > > > Additional to that I'm perfectly fine with implicit sync. Explicit sync
> > > > certainly has some use cases as well, but I don't see it as an absolute
> > > > advantage over the implicit model.
> > >
> > > Ok this stops making sense. Somehow you claim userspace doesn't know
> > > when to sync, but somehow the kernel does? By guessing, and getting it
> > > wrong mostly, except for the one case that you benchmarked?
> > >
> > > Aside from silly userspace which exports a buffer to a dma-buf, but
> > > then never imports it anywhere else, there isn't a case I know of
> > > where the kernel actually knows more than userspace. But there's lots
> > > of cases where the kernel definitely knows less, especially if
> > > userspace doesn't tell it about what's going on with each rendering
> > > and buffer.
> > >
> > > So here's the 2 things you need to make this work like every other driver:
> > >
> > > 1. A way to set the explicit fence on a buffer. CS ioctl is perfectly
> > > fine, but also can be seperate. Userspace uses this only on a) shared
> > > buffers b) when there's a flush/swap on that shared buffer. Not when
> > > rendering any of the interim stuff, that only leads to oversync.
> > > Anything non-shared is handled explicitly in userspace (at least for
> > > modern-ish drivers). This is the only thing that ever sets an
> > > exclusive fence (aside from ttm moving buffers around ofc).
> > >
> > > 2. A way to sync with the implicit fences, either all of them (for
> > > upcoming write access) or just the write fence (for read access). At
> > > first we thought it's good enough to do this in the CS ioctl, but
> > > that's a wee bit too late, hence the patches from Jason. My
> > > understanding is that vulkan converts this into an vk syncobj/fence of
> > > some sorts, so really can't make this more explicit and intentional
> > > than that.
> > >
> > > None of this is something the kernel has the slightest idea about when
> > > it happens, so you have to have explicit uapi for it. Trying to fake
> > > it in the kernel just doesn't work.
> > > -Daniel
> > > --
> > > Daniel Vetter
> > > Software Engineer, Intel Corporation
> > > http://blog.ffwll.ch
>
>
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch



-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-18  5:59         ` Daniel Vetter
  2021-05-18 10:29           ` Daniel Vetter
@ 2021-05-18 12:49           ` Christian König
  2021-05-18 13:26             ` Daniel Stone
  2021-05-18 16:48             ` Daniel Vetter
  1 sibling, 2 replies; 50+ messages in thread
From: Christian König @ 2021-05-18 12:49 UTC (permalink / raw)
  To: Daniel Vetter, Jason Ekstrand
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, dri-devel

Hi Jason & Daniel,

Am 18.05.21 um 07:59 schrieb Daniel Vetter:
> On Tue, May 18, 2021 at 12:49 AM Jason Ekstrand <jason@jlekstrand.net> wrote:
>> On Mon, May 17, 2021 at 3:15 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>>> On Mon, May 17, 2021 at 9:38 PM Christian König
>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>> Am 17.05.21 um 17:04 schrieb Daniel Vetter:
>>>>> On Mon, May 17, 2021 at 04:11:18PM +0200, Christian König wrote:
>>>>>> We had a long outstanding problem in amdgpu that buffers exported to
>>>>>> user drivers by DMA-buf serialize all command submissions using them.
>>>>>>
>>>>>> In other words we can't compose the buffer with different engines and
>>>>>> then send it to another driver for display further processing.
>>>>>>
>>>>>> This was added to work around the fact that i915 didn't wanted to wait
>>>>>> for shared fences in the dma_resv objects before displaying a buffer.
>>>>>>
>>>>>> Since this problem is now causing issues with Vulkan we need to find a
>>>>>> better solution for that.
>>>>>>
>>>>>> The patch set here tries to do this by adding an usage flag to the
>>>>>> shared fences noting when and how they should participate in implicit
>>>>>> synchronization.
>>>>> So the way this is fixed in every other vulkan driver is that vulkan
>>>>> userspace sets flags in the CS ioctl when it wants to synchronize with
>>>>> implicit sync. This gets you mostly there. Last time I checked amdgpu
>>>>> isn't doing this, and yes that's broken.
>>>> And exactly that is a really bad approach as far as I can see. The
>>>> Vulkan stack on top simply doesn't know when to set this flag during CS.
>>> Adding Jason for the Vulkan side of things, because this isn't how I
>>> understand this works.
>>>
>>> But purely form a kernel pov your patches are sketchy for two reasons:
>>>
>>> - we reinstate the amdgpu special case of not setting exclusive fences
>>>
>>> - you only fix the single special case of i915 display, nothing else
>>>
>>> That's not how a cross driver interface works. And if you'd do this
>>> properly, you'd be back to all the same sync fun you've orignally had,
>>> with all the same fallout.
>> I think I'm starting to see what Christian is trying to do here and I
>> think there likely is a real genuine problem here.  I'm not convinced
>> this is 100% of a solution but there might be something real.  Let me
>> see if I can convince you or if I just make a hash of things. :-)
>>
>> The problem, once again, comes down to memory fencing vs. execution
>> fencing and the way that we've unfortunately tied them together in the
>> kernel.  With the current architecture, the only way to get proper
>> write-fence semantics for implicit sync is to take an exclusive fence
>> on the buffer.  This implies two things:
>>
>>   1. You have to implicitly wait on EVERY fence on the buffer before
>> you can start your write-fenced operation
>>
>>   2. No one else can start ANY operation which accesses that buffer
>> until you're done.

Yes, exactly that. You absolutely nailed it.

I unfortunately also have a 3rd use case:

3. Operations which shouldn't participate in any syncing, but only 
affect the memory management.

This is basically our heavyweight TLB flush after unmapping the BO from 
somebody's page tables. Nobody should ever be concerned about it for any 
form of synchronization, but memory management is not allowed to reuse or 
move the buffer before the operation is completed.

>>
>> Let's say that you have a buffer which is shared between two drivers A
>> and B and let's say driver A has thrown a fence on it just to ensure
>> that the BO doesn't get swapped out to disk until it's at a good
>> stopping point.  Then driver B comes along and wants to throw a
>> write-fence on it.  Suddenly, your memory fence from driver A causes
>> driver B to have to stall waiting for a "good" time to throw in a
>> fence.  It sounds like this is the sort of scenario that Christian is
>> running into.  And, yes, with certain Vulkan drivers being a bit
>> sloppy about exactly when they throw in write fences, I could see it
>> being a real problem.
> Yes this is a potential problem, and on the i915 side we need to do
> some shuffling here most likely. Especially due to discrete, but the
> problem is pre-existing. tbh I forgot about the implications here
> until I pondered this again yesterday evening.
>
> But afaiui the amdgpu code and winsys in mesa, this isn't (yet) the
> problem amd vk drivers have. The issue is that with amdgpu, all you
> supply are the following bits at CS time:
> - list of always mapped private buffers, which is implicit and O(1) in
> the kernel fastpath
> - additional list of shared buffers that are used by the current CS
>
> I didn't check how exactly that works wrt winsys buffer ownership, but
> the thing is that on the kernel side _any_ buffer in there is treated
> as a implicit sync'ed write. Which means if you render your winsys
> with a bunch of command submission split over 3d and compute pipes,
> you end up with horrendous amounts of oversync.

What are you talking about? We have no sync at all for submissions from 
the same client.

> The reason for this is that amdgpu decided to go with a different
> implicit sync model than everyone else:
> - within an drm file everything is unsynced and left to userspace to
> handle, amdgpu.ko only ever sets the shared fence slots.
> - this means the exclusive slot really is exclusive to memory manage
> issues, which side-steps the issue you point out above
> - for anything cross-device they unconditionally wait for any shared
> fence which is by another process
>
> Works, except it's incompatible with what everyone else is doing, so
> had to be papered over by the current massive oversync solution.

Well actually it is only i915 I care about that is working differently.

Radeon works the same way as amdgpu by waiting for everything before 
doing command submission or pageflip.

> First step in fixing that is (and frankly was since years) to fix the
> amdgpu CS so winsys can pass along a bunch of flags about which CS
> should actually set the exclusive fence, so that you stop oversyncing
> so badly. Ofc old userspace needs to keep oversyncing forever, no way
> to fix that.

Exactly that is what we don't want to do because the winsys has no idea 
when to sync and when not to sync.

The kernel on the other hand perfectly knows that.

> Instead what Christian patch set here does is move amdgpu back to the
> dma_resv contract it prefers, break everything else and then fix up
> i915 atomic path so that the one use case that originally highlighted
> the mismatch here works again. Which hrm .... no :-)
>
> I think the reason this wasn't ever a pressing issue is that amdgpu.ko
> only does this for buffers shared across devices, so in most cases you
> don't suffer from the terribly oversync. Conceptually it's still all
> there.
>
>> The solution I *think* Christian is proposing is basically to have
>> four categories of fences instead of two: exclusive, weak (shared with
>> no r/w), read, and write.  (No, I didn't include r/w but that's the
>> same as write-only when it comes to hazards.)  Then a bunch of flags
>> and helpers to be able to handle the interactions between the three
>> types of shared fences.  Honestly, this is something I've considered
>> as I've wrestled with these problems in the past.  That said....
>>
>>   1. In GL, we can make the read/write information accurate and never
>> over/under sync.
>>
>>   2. In the future ANV model I described earlier, this isn't a problem.
>> It throws in a write-fence exactly once per frame.  It actually
>> under-synchronizes but in a safe way.  I think that mostly makes the
>> problem go away in practice.
>>
>>   3. If the core issue here really is memory vs. execution sync as I've
>> said, maybe we really are papering over something by continuing to mix
>> them.  Do we really want four fence types or do we want two orthogonal
>> fence types?
> Now once amdgpu.ko is fixed, we still have the problem of mixing up
> the exclusive fence for implicit sync with the exclusive fence for
> memory management. And for that we can and probably should figure out
> what to do there. But that still requires that amdgpu CS first learns
> what's actually going on from userspace, and secondly, that we do this
> addition in a way which is compatible with current dma_resv users
> (i.e. all drivers currently asking for an exclusive fence need to pick
> up both types of exclusive fences if we decide to split them).
>> I think I've convinced myself that the problem is real, but not that
>> this solution is correct.
> Yeah there's definitely some problems here, but Christian hasn't
> really explained which one he's trying to solve, so we're also running
> a bit in a circle trying to guess what's what :-/

Well how can I help with that?

Jason seems to have the perfect understanding why we have those problems.

And as long as we are all inside amdgpu we also don't have any oversync, 
the issue only happens when we share dma-bufs with i915 (radeon and 
AFAIK nouveau does the right thing as well).

Regards,
Christian.

>
> Cheers, Daniel
>
>> --Jason
>>
>>
>>>> That's also the reason the Valve guys came up with a solution where each
>>>> BO gets a flag for explicit sync, but that only works for exports and
>>>> not for imports.
>>>>
>>>>> I915 and iirc msm has explicit flags for this, panfrost was designed to
>>>>> support this correctly from the start (also with flags I think). That's at
>>>>> least what I remember from all the discussions at XDC and #dri-devel, but
>>>>> didn't check the code again to give you the list of uapi flags you need
>>>>> for each driver.
>>>>>
>>>>> The other piece is making sure you're only picking up implicit fences when
>>>>> you should, and not any later ones, for which Jason has a solution:
>>>>>
>>>>> https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/
>>>> Yes, I helped with that as well. But I think that this is just another
>>>> workaround without really addressing the underlying problem.
>>>>
>>>>> If amdgpu isn't using those, then you will suffer from
>>>>> over-synchronization in vulkan and pay a price. The entire point of vulkan
>>>>> is that you pick up sync points very explicitly, and we also need to have
>>>>> very explicit uapi for userspace to pick up/set the implicit fences.
>>>>>
>>>>> Trying to paper over this with more implicit magic is imo just wrong, and
>>>>> definitely not the long term explicit sync model we want.
>>>> I completely disagree.
>>>>
>>>> In my opinion the implicit sync model we have for dma_resv currently is
>>>> just not well designed at all, since it always requires cooperation from
>>>> userspace.
>>>>
>>>> In other words you need to know when to enable implicit sync in
>>>> userspace and that information is simply not present all of the time.
>>>>
>>>> What we have done here is just keeping the old reader/writer flags i915,
>>>> radeon and nouveau once had and pushed that out to everybody else making
>>>> the assumption that everybody would follow that without documenting the
>>>> actual rules of engagement you need to follow here.
>>>>
>>>> That was a really big mistake and we should try to fix that sooner or
>>>> later. The only other clean alternative I see is to use a flag on the
>>>> exporter to tell the importer if it should sync to shared fences or not.
>>>>
>>>> Additional to that I'm perfectly fine with implicit sync. Explicit sync
>>>> certainly has some use cases as well, but I don't see it as an absolute
>>>> advantage over the implicit model.
>>> Ok this stops making sense. Somehow you claim userspace doesn't know
>>> when to sync, but somehow the kernel does? By guessing, and getting it
>>> wrong mostly, except for the one case that you benchmarked?
>>>
>>> Aside from silly userspace which exports a buffer to a dma-buf, but
>>> then never imports it anywhere else, there isn't a case I know of
>>> where the kernel actually knows more than userspace. But there's lots
>>> of cases where the kernel definitely knows less, especially if
>>> userspace doesn't tell it about what's going on with each rendering
>>> and buffer.
>>>
>>> So here's the 2 things you need to make this work like every other driver:
>>>
>>> 1. A way to set the explicit fence on a buffer. CS ioctl is perfectly
>>> fine, but also can be seperate. Userspace uses this only on a) shared
>>> buffers b) when there's a flush/swap on that shared buffer. Not when
>>> rendering any of the interim stuff, that only leads to oversync.
>>> Anything non-shared is handled explicitly in userspace (at least for
>>> modern-ish drivers). This is the only thing that ever sets an
>>> exclusive fence (aside from ttm moving buffers around ofc).
>>>
>>> 2. A way to sync with the implicit fences, either all of them (for
>>> upcoming write access) or just the write fence (for read access). At
>>> first we thought it's good enough to do this in the CS ioctl, but
>>> that's a wee bit too late, hence the patches from Jason. My
>>> understanding is that vulkan converts this into an vk syncobj/fence of
>>> some sorts, so really can't make this more explicit and intentional
>>> than that.
>>>
>>> None of this is something the kernel has the slightest idea about when
>>> it happens, so you have to have explicit uapi for it. Trying to fake
>>> it in the kernel just doesn't work.
>>> -Daniel
>>> --
>>> Daniel Vetter
>>> Software Engineer, Intel Corporation
>>> http://blog.ffwll.ch
>
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 09/11] dma-buf: add shared fence usage flags
  2021-05-17 20:36   ` Daniel Vetter
@ 2021-05-18 12:54     ` Christian König
  0 siblings, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-18 12:54 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: linaro-mm-sig, dri-devel

Am 17.05.21 um 22:36 schrieb Daniel Vetter:
> On Mon, May 17, 2021 at 04:11:27PM +0200, Christian König wrote:
>> Add usage flags for shared fences and improve the documentation.
>>
>> This allows driver to better specify what shared fences
>> are doing with the resource.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
>> index 16b869d9b1d6..c9bbc4630afc 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>> @@ -700,7 +700,7 @@ static int ttm_bo_add_move_fence(struct ttm_buffer_object *bo,
>>   		return ret;
>>   	}
>>   
>> -	dma_resv_add_shared_fence(bo->base.resv, fence);
>> +	dma_resv_add_shared(bo->base.resv, fence, DMA_RESV_USAGE_RW);
> Entirely aside, but I ended up scratching my head a lot for why exactly
> this here is a shared fence, and why that's ok. Since just looking at this
> it seems like waiting for the memory allocation to actually be owned by
> this driver is optional.
>
> Is this ok because the next thing we'll do is a move, which will then set
> the exclusive fence here. Which will then wait on the shared one here, so
> it doesn't matter? Or well, allows us to pipeline the eviction of ttm_man
> against whatever might be currently keeping the bo busy in it's current
> place?

Yes, exactly that.

We just need to make sure that the new BO location isn't used before the 
fence is completed, but we can't use the exclusive slot because we have 
no guarantee at all that the move fence signals in the right order.

Regards,
Christian.

>
> Might be good candidate to explain this in a comment or something like
> that.
> -Daniel


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-18 12:49           ` Christian König
@ 2021-05-18 13:26             ` Daniel Stone
  2021-05-18 13:51               ` Christian König
  2021-05-18 16:48             ` Daniel Vetter
  1 sibling, 1 reply; 50+ messages in thread
From: Daniel Stone @ 2021-05-18 13:26 UTC (permalink / raw)
  To: Christian König
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, dri-devel, Jason Ekstrand

On Tue, 18 May 2021 at 13:49, Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
> Am 18.05.21 um 07:59 schrieb Daniel Vetter:
> > First step in fixing that is (and frankly was since years) to fix the
> > amdgpu CS so winsys can pass along a bunch of flags about which CS
> > should actually set the exclusive fence, so that you stop oversyncing
> > so badly. Ofc old userspace needs to keep oversyncing forever, no way
> > to fix that.
>
> Exactly that is what we don't want to do because the winsys has no idea
> when to sync and when not to sync.

Hey, we're typing that out as fast as we can ... it's just that you
keep reinventing sync primitives faster than we can ship support for
them :P

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-18 13:26             ` Daniel Stone
@ 2021-05-18 13:51               ` Christian König
  0 siblings, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-18 13:51 UTC (permalink / raw)
  To: Daniel Stone
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, dri-devel, Jason Ekstrand

Am 18.05.21 um 15:26 schrieb Daniel Stone:
> On Tue, 18 May 2021 at 13:49, Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> Am 18.05.21 um 07:59 schrieb Daniel Vetter:
>>> First step in fixing that is (and frankly was since years) to fix the
>>> amdgpu CS so winsys can pass along a bunch of flags about which CS
>>> should actually set the exclusive fence, so that you stop oversyncing
>>> so badly. Ofc old userspace needs to keep oversyncing forever, no way
>>> to fix that.
>> Exactly that is what we don't want to do because the winsys has no idea
>> when to sync and when not to sync.
> Hey, we're typing that out as fast as we can ... it's just that you
> keep reinventing sync primitives faster than we can ship support for
> them :P

You can stop typing. We will even need that for backward compatibility.

But yeah, not reinventing sync_file support with drm_syncobj would have 
helped :)

Cheers,
Christian.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-18 12:49           ` Christian König
  2021-05-18 13:26             ` Daniel Stone
@ 2021-05-18 16:48             ` Daniel Vetter
  2021-05-18 17:40               ` Christian König
  1 sibling, 1 reply; 50+ messages in thread
From: Daniel Vetter @ 2021-05-18 16:48 UTC (permalink / raw)
  To: Christian König
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, dri-devel, Jason Ekstrand

On Tue, May 18, 2021 at 2:49 PM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Hi Jason & Daniel,
>
> Am 18.05.21 um 07:59 schrieb Daniel Vetter:
> > On Tue, May 18, 2021 at 12:49 AM Jason Ekstrand <jason@jlekstrand.net> wrote:
> >> On Mon, May 17, 2021 at 3:15 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >>> On Mon, May 17, 2021 at 9:38 PM Christian König
> >>> <ckoenig.leichtzumerken@gmail.com> wrote:
> >>>> Am 17.05.21 um 17:04 schrieb Daniel Vetter:
> >>>>> On Mon, May 17, 2021 at 04:11:18PM +0200, Christian König wrote:
> >>>>>> We had a long outstanding problem in amdgpu that buffers exported to
> >>>>>> user drivers by DMA-buf serialize all command submissions using them.
> >>>>>>
> >>>>>> In other words we can't compose the buffer with different engines and
> >>>>>> then send it to another driver for display further processing.
> >>>>>>
> >>>>>> This was added to work around the fact that i915 didn't wanted to wait
> >>>>>> for shared fences in the dma_resv objects before displaying a buffer.
> >>>>>>
> >>>>>> Since this problem is now causing issues with Vulkan we need to find a
> >>>>>> better solution for that.
> >>>>>>
> >>>>>> The patch set here tries to do this by adding an usage flag to the
> >>>>>> shared fences noting when and how they should participate in implicit
> >>>>>> synchronization.
> >>>>> So the way this is fixed in every other vulkan driver is that vulkan
> >>>>> userspace sets flags in the CS ioctl when it wants to synchronize with
> >>>>> implicit sync. This gets you mostly there. Last time I checked amdgpu
> >>>>> isn't doing this, and yes that's broken.
> >>>> And exactly that is a really bad approach as far as I can see. The
> >>>> Vulkan stack on top simply doesn't know when to set this flag during CS.
> >>> Adding Jason for the Vulkan side of things, because this isn't how I
> >>> understand this works.
> >>>
> >>> But purely form a kernel pov your patches are sketchy for two reasons:
> >>>
> >>> - we reinstate the amdgpu special case of not setting exclusive fences
> >>>
> >>> - you only fix the single special case of i915 display, nothing else
> >>>
> >>> That's not how a cross driver interface works. And if you'd do this
> >>> properly, you'd be back to all the same sync fun you've orignally had,
> >>> with all the same fallout.
> >> I think I'm starting to see what Christian is trying to do here and I
> >> think there likely is a real genuine problem here.  I'm not convinced
> >> this is 100% of a solution but there might be something real.  Let me
> >> see if I can convince you or if I just make a hash of things. :-)
> >>
> >> The problem, once again, comes down to memory fencing vs. execution
> >> fencing and the way that we've unfortunately tied them together in the
> >> kernel.  With the current architecture, the only way to get proper
> >> write-fence semantics for implicit sync is to take an exclusive fence
> >> on the buffer.  This implies two things:
> >>
> >>   1. You have to implicitly wait on EVERY fence on the buffer before
> >> you can start your write-fenced operation
> >>
> >>   2. No one else can start ANY operation which accesses that buffer
> >> until you're done.
>
> Yes, exactly that. You absolutely nailed it.
>
> I unfortunately also have a 3rd use case:
>
> 3. Operations which shouldn't participate in any syncing, but only
> affect the memory management.
>
> This is basically our heavyweight TLB flush after unmapping the BO from
> somebodies page tables. Nobody should ever be concerned about it for any
> form of synchronization, but memory managment is not allowed to reuse or
> move the buffer before the operation is completed.

Isn't that just another case of 2? Or I'm not getting it.

> >> Let's say that you have a buffer which is shared between two drivers A
> >> and B and let's say driver A has thrown a fence on it just to ensure
> >> that the BO doesn't get swapped out to disk until it's at a good
> >> stopping point.  Then driver B comes along and wants to throw a
> >> write-fence on it.  Suddenly, your memory fence from driver A causes
> >> driver B to have to stall waiting for a "good" time to throw in a
> >> fence.  It sounds like this is the sort of scenario that Christian is
> >> running into.  And, yes, with certain Vulkan drivers being a bit
> >> sloppy about exactly when they throw in write fences, I could see it
> >> being a real problem.
> > Yes this is a potential problem, and on the i915 side we need to do
> > some shuffling here most likely. Especially due to discrete, but the
> > problem is pre-existing. tbh I forgot about the implications here
> > until I pondered this again yesterday evening.
> >
> > But afaiui the amdgpu code and winsys in mesa, this isn't (yet) the
> > problem amd vk drivers have. The issue is that with amdgpu, all you
> > supply are the following bits at CS time:
> > - list of always mapped private buffers, which is implicit and O(1) in
> > the kernel fastpath
> > - additional list of shared buffers that are used by the current CS
> >
> > I didn't check how exactly that works wrt winsys buffer ownership, but
> > the thing is that on the kernel side _any_ buffer in there is treated
> > as a implicit sync'ed write. Which means if you render your winsys
> > with a bunch of command submission split over 3d and compute pipes,
> > you end up with horrendous amounts of oversync.
>
> What are you talking about? We have no sync at all for submissions from
> the same client.

Yes. Except when the buffer is shared with another driver, at which
point you sync a _lot_ and feel the pain.

Or I'm not understanding at all what your patch series does and why
it's improving anything, specifically the last patch for amdgpu.

So please explain how this all adds up.

> > The reason for this is that amdgpu decided to go with a different
> > implicit sync model than everyone else:
> > - within an drm file everything is unsynced and left to userspace to
> > handle, amdgpu.ko only ever sets the shared fence slots.
> > - this means the exclusive slot really is exclusive to memory manage
> > issues, which side-steps the issue you point out above
> > - for anything cross-device they unconditionally wait for any shared
> > fence which is by another process
> >
> > Works, except it's incompatible with what everyone else is doing, so
> > had to be papered over by the current massive oversync solution.
>
> Well actually it is only i915 I care of which is working differently.

It's also anything that uses the atomic commit helpers. They _all_
expect the exclusive fence to be set for the last write. i915 is the
odd one out here by having its own commit helpers still, most other
atomic drivers moved over to the helper version. But since I wrote
both I think you can trust me that they work the same :-)

> Radeon works the same way as amdgpu by waiting for everything before
> doing command submission or pageflip.

The thing is, we don't want to wait for everything. We only want to
wait for the last writer, not for maybe a readback job or something
else. And this isn't just about atomic flip, it's for any
cross-device/process dma-buf sharing.

There's essentially two worlds we have here:
- drivers which work like i915, where exclusive slot is for implicit
sync, and shared is just for "I'm using this"
- amdgpu (you claim more, I'm honestly not so sure since you only
fixed amdgpu and i915 display), where all access is in the shared
slots, and then on cross-* sync you want to sync with all of them.

These two aren't compatible.

Also please keep in mind that neither radeon nor nouveau have a vulkan
driver, so pretty sure they haven't had to solve this problem much
yet.

Also I just reviewed nouveau, nouveau_bo_fence() says you're wrong with
your claim, it sets the exclusive fence when userspace indicates a
write domain.

Also I looked at radeon, assuming I didn't get lost this seems to
indicate radeon also works like I think it should:

        p->relocs[i].tv.num_shared = !r->write_domain;

ttm_eu_fence_buffer_objects() then picks that up and sets the right
fence for radeon_cs.c code.

> > First step in fixing that is (and frankly was since years) to fix the
> > amdgpu CS so winsys can pass along a bunch of flags about which CS
> > should actually set the exclusive fence, so that you stop oversyncing
> > so badly. Ofc old userspace needs to keep oversyncing forever, no way
> > to fix that.
>
> Exactly that is what we don't want to do because the winsys has no idea
> when to sync and when not to sync.

Uh ... so why exactly can anv do it? And turnip and a few others?
What's the precise case where the winsys can't do the sync itself,
because it has no idea what's going on, but somehow the kernel can?
Can you please explain this, because we're definitely talking past
each other here. I really don't see any case where the kernel has
additional information than the userspace drivers here. But there's
lots of cases where userspace definitely knows more.

> The kernel on the other hand perfectly knows that.
>
> > Instead what Christian patch set here does is move amdgpu back to the
> > dma_resv contract it prefers, break everything else and then fix up
> > i915 atomic path so that the one use case that originally highlighted
> > the mismatch here works again. Which hrm .... no :-)
> >
> > I think the reason this wasn't ever a pressing issue is that amdgpu.ko
> > only does this for buffers shared across devices, so in most cases you
> > don't suffer from the terribly oversync. Conceptually it's still all
> > there.
> >
> >> The solution I *think* Christian is proposing is basically to have
> >> four categories of fences instead of two: exclusive, weak (shared with
> >> no r/w), read, and write.  (No, I didn't include r/w but that's the
> >> same as write-only when it comes to hazards.)  Then a bunch of flags
> >> and helpers to be able to handle the interactions between the three
> >> types of shared fences.  Honestly, this is something I've considered
> >> as I've wrestled with these problems in the past.  That said....
> >>
> >>   1. In GL, we can make the read/write information accurate and never
> >> over/under sync.
> >>
> >>   2. In the future ANV model I described earlier, this isn't a problem.
> >> It throws in a write-fence exactly once per frame.  It actually
> >> under-synchronizes but in a safe way.  I think that mostly makes the
> >> problem go away in practice.
> >>
> >>   3. If the core issue here really is memory vs. execution sync as I've
> >> said, maybe we really are papering over something by continuing to mix
> >> them.  Do we really want four fence types or do we want two orthogonal
> >> fence types?
> > Now once amdgpu.ko is fixed, we still have the problem of mixing up
> > the exclusive fence for implicit sync with the exclusive fence for
> > memory management. And for that we can and probably should figure out
> > what to do there. But that still requires that amdgpu CS first learns
> > what's actually going on from userspace, and secondly, that we do this
> > addition in a way which is compatible with current dma_resv users
> > (i.e. all drivers currently asking for an exclusive fence need to pick
> > up both types of exclusive fences if we decide to split them).
> >> I think I've convinced myself that the problem is real, but not that
> >> this solution is correct.
> > Yeah there's definitely some problems here, but Christian hasn't
> > really explained which one he's trying to solve, so we're also running
> > a bit in a circle trying to guess what's what :-/
>
> Well how can I help with that?
>
> Jason seems to have the perfect understanding why we have those problems.

See my other reply. I think aside from dma-buf p2p we don't actually
have a problem?

> And as long as we are all inside amdgpu we also don't have any oversync,
> the issue only happens when we share dma-bufs with i915 (radeon and
> AFAIK nouveau does the right thing as well).

Yeah because then you can't use the amdgpu dma_resv model anymore and
have to use the one atomic helpers use. Which is also the one that
e.g. Jason is threatening to bake in as uapi with his dma_buf ioctl,
so as soon as that lands and someone starts using it, something has to
adapt _anytime_ you have a dma-buf hanging around. Not just when it's
shared with another device.

So the way I see things right now:
- exclusive fence slot is for implicit sync. kmd should only set it
when userspace indicates, otherwise you will suffer. Explicit syncing
userspace needs to tell the kernel with a flag in the CS ioctl when it
should sync against this exclusive fence and when it should ignore it,
otherwise you'll suffer badly once more.
- no funny tricks with not doing this when it's just internally in
your driver, because the more uapi we build on top of dma-buf fd the
harder this will break. amdgpu gets to keep some nasty tricks going
here until appropriate uapi is finally rolled out, but should stop
this asap.
- ttm_bo->moving is the fence for stuff you're not allowed to ignore.
Probably need to move that to dma_resv for p2p dma-buf, not sure on
that yet.

After that I think we can look at what exact oversync issue remains
and why and solve it, but until we have this this just feels like
> another rehash of "amdgpu insists its own dma_resv interpretation is the
> right one and everyone else should move over".

Or maybe I've just become real garbage at reading random driver code,
wouldn't be the first time :-)

Cheers, Daniel

> Regards,
> Christian.
>
> >
> > Cheers, Daniel
> >
> >> --Jason
> >>
> >>
> >>>> That's also the reason the Valve guys came up with a solution where each
> >>>> BO gets a flag for explicit sync, but that only works for exports and
> >>>> not for imports.
> >>>>
> >>>>> I915 and iirc msm has explicit flags for this, panfrost was designed to
> >>>>> support this correctly from the start (also with flags I think). That's at
> >>>>> least what I remember from all the discussions at XDC and #dri-devel, but
> >>>>> didn't check the code again to give you the list of uapi flags you need
> >>>>> for each driver.
> >>>>>
> >>>>> The other piece is making sure you're only picking up implicit fences when
> >>>>> you should, and not any later ones, for which Jason has a solution:
> >>>>>
> >>>>> https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/
> >>>> Yes, I helped with that as well. But I think that this is just another
> >>>> workaround without really addressing the underlying problem.
> >>>>
> >>>>> If amdgpu isn't using those, then you will suffer from
> >>>>> over-synchronization in vulkan and pay a price. The entire point of vulkan
> >>>>> is that you pick up sync points very explicitly, and we also need to have
> >>>>> very explicit uapi for userspace to pick up/set the implicit fences.
> >>>>>
> >>>>> Trying to paper over this with more implicit magic is imo just wrong, and
> >>>>> definitely not the long term explicit sync model we want.
> >>>> I completely disagree.
> >>>>
> >>>> In my opinion the implicit sync model we have for dma_resv currently is
> >>>> just not well designed at all, since it always requires cooperation from
> >>>> userspace.
> >>>>
> >>>> In other words you need to know when to enable implicit sync in
> >>>> userspace and that information is simply not present all of the time.
> >>>>
> >>>> What we have done here is just keeping the old reader/writer flags i915,
> >>>> radeon and nouveau once had and pushed that out to everybody else making
> >>>> the assumption that everybody would follow that without documenting the
> >>>> actual rules of engagement you need to follow here.
> >>>>
> >>>> That was a really big mistake and we should try to fix that sooner or
> >>>> later. The only other clean alternative I see is to use a flag on the
> >>>> exporter to tell the importer if it should sync to shared fences or not.
> >>>>
> >>>> Additional to that I'm perfectly fine with implicit sync. Explicit sync
> >>>> certainly has some use cases as well, but I don't see it as an absolute
> >>>> advantage over the implicit model.
> >>> Ok this stops making sense. Somehow you claim userspace doesn't know
> >>> when to sync, but somehow the kernel does? By guessing, and getting it
> >>> wrong mostly, except for the one case that you benchmarked?
> >>>
> >>> Aside from silly userspace which exports a buffer to a dma-buf, but
> >>> then never imports it anywhere else, there isn't a case I know of
> >>> where the kernel actually knows more than userspace. But there's lots
> >>> of cases where the kernel definitely knows less, especially if
> >>> userspace doesn't tell it about what's going on with each rendering
> >>> and buffer.
> >>>
> >>> So here's the 2 things you need to make this work like every other driver:
> >>>
> >>> 1. A way to set the explicit fence on a buffer. CS ioctl is perfectly
> >>> fine, but also can be seperate. Userspace uses this only on a) shared
> >>> buffers b) when there's a flush/swap on that shared buffer. Not when
> >>> rendering any of the interim stuff, that only leads to oversync.
> >>> Anything non-shared is handled explicitly in userspace (at least for
> >>> modern-ish drivers). This is the only thing that ever sets an
> >>> exclusive fence (aside from ttm moving buffers around ofc).
> >>>
> >>> 2. A way to sync with the implicit fences, either all of them (for
> >>> upcoming write access) or just the write fence (for read access). At
> >>> first we thought it's good enough to do this in the CS ioctl, but
> >>> that's a wee bit too late, hence the patches from Jason. My
> >>> understanding is that vulkan converts this into an vk syncobj/fence of
> >>> some sorts, so really can't make this more explicit and intentional
> >>> than that.
> >>>
> >>> None of this is something the kernel has the slightest idea about when
> >>> it happens, so you have to have explicit uapi for it. Trying to fake
> >>> it in the kernel just doesn't work.
> >>> -Daniel
> >>> --
> >>> Daniel Vetter
> >>> Software Engineer, Intel Corporation
> >>> http://blog.ffwll.ch
> >
> >
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
>


-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-18 16:48             ` Daniel Vetter
@ 2021-05-18 17:40               ` Christian König
  2021-05-18 21:17                 ` Daniel Vetter
  2021-05-18 21:31                 ` Dave Airlie
  0 siblings, 2 replies; 50+ messages in thread
From: Christian König @ 2021-05-18 17:40 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, dri-devel, Jason Ekstrand

Am 18.05.21 um 18:48 schrieb Daniel Vetter:
> On Tue, May 18, 2021 at 2:49 PM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> Hi Jason & Daniel,
>>
>> Am 18.05.21 um 07:59 schrieb Daniel Vetter:
>>> On Tue, May 18, 2021 at 12:49 AM Jason Ekstrand <jason@jlekstrand.net> wrote:
>>>> On Mon, May 17, 2021 at 3:15 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>> On Mon, May 17, 2021 at 9:38 PM Christian König
>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>> Am 17.05.21 um 17:04 schrieb Daniel Vetter:
>>>>>>> On Mon, May 17, 2021 at 04:11:18PM +0200, Christian König wrote:
>>>>>>>> We had a long outstanding problem in amdgpu that buffers exported to
>>>>>>>> user drivers by DMA-buf serialize all command submissions using them.
>>>>>>>>
>>>>>>>> In other words we can't compose the buffer with different engines and
>>>>>>>> then send it to another driver for display further processing.
>>>>>>>>
>>>>>>>> This was added to work around the fact that i915 didn't wanted to wait
>>>>>>>> for shared fences in the dma_resv objects before displaying a buffer.
>>>>>>>>
>>>>>>>> Since this problem is now causing issues with Vulkan we need to find a
>>>>>>>> better solution for that.
>>>>>>>>
>>>>>>>> The patch set here tries to do this by adding an usage flag to the
>>>>>>>> shared fences noting when and how they should participate in implicit
>>>>>>>> synchronization.
>>>>>>> So the way this is fixed in every other vulkan driver is that vulkan
>>>>>>> userspace sets flags in the CS ioctl when it wants to synchronize with
>>>>>>> implicit sync. This gets you mostly there. Last time I checked amdgpu
>>>>>>> isn't doing this, and yes that's broken.
>>>>>> And exactly that is a really bad approach as far as I can see. The
>>>>>> Vulkan stack on top simply doesn't know when to set this flag during CS.
>>>>> Adding Jason for the Vulkan side of things, because this isn't how I
>>>>> understand this works.
>>>>>
>>>>> But purely form a kernel pov your patches are sketchy for two reasons:
>>>>>
>>>>> - we reinstate the amdgpu special case of not setting exclusive fences
>>>>>
>>>>> - you only fix the single special case of i915 display, nothing else
>>>>>
>>>>> That's not how a cross driver interface works. And if you'd do this
>>>>> properly, you'd be back to all the same sync fun you've orignally had,
>>>>> with all the same fallout.
>>>> I think I'm starting to see what Christian is trying to do here and I
>>>> think there likely is a real genuine problem here.  I'm not convinced
>>>> this is 100% of a solution but there might be something real.  Let me
>>>> see if I can convince you or if I just make a hash of things. :-)
>>>>
>>>> The problem, once again, comes down to memory fencing vs. execution
>>>> fencing and the way that we've unfortunately tied them together in the
>>>> kernel.  With the current architecture, the only way to get proper
>>>> write-fence semantics for implicit sync is to take an exclusive fence
>>>> on the buffer.  This implies two things:
>>>>
>>>>    1. You have to implicitly wait on EVERY fence on the buffer before
>>>> you can start your write-fenced operation
>>>>
>>>>    2. No one else can start ANY operation which accesses that buffer
>>>> until you're done.
>> Yes, exactly that. You absolutely nailed it.
>>
>> I unfortunately also have a 3rd use case:
>>
>> 3. Operations which shouldn't participate in any syncing, but only
>> affect the memory management.
>>
This is basically our heavyweight TLB flush after unmapping the BO from
somebody's page tables. Nobody should ever be concerned about it for any
form of synchronization, but memory management is not allowed to reuse or
move the buffer before the operation is completed.
> Isn't that just another case of 2? Or I'm not getting it.

The problem in this case is not starting a new CS, but synchronizing to 
the existing ones.

See a heavy TLB flush is made completely out of sync. E.g. it doesn't 
want to wait for any previous operation.

In other words imagine the following example:
1. Both process A and B have a BO mapped.
2. Process A is heavily using the BO and doing all kind of rendering.
3. Process B is unmapping the BO.

Now when process B unmaps the BO, it needs to trigger page table updates and 
a heavy TLB flush, but since this can take really long we want to do it 
asynchronously on the hardware.

With the current approach you basically can't do that because you can't 
note that a fence should not participate in synchronization at all.

E.g. we can't add a fence which doesn't wait for the exclusive one as 
shared.


>>>> Let's say that you have a buffer which is shared between two drivers A
>>>> and B and let's say driver A has thrown a fence on it just to ensure
>>>> that the BO doesn't get swapped out to disk until it's at a good
>>>> stopping point.  Then driver B comes along and wants to throw a
>>>> write-fence on it.  Suddenly, your memory fence from driver A causes
>>>> driver B to have to stall waiting for a "good" time to throw in a
>>>> fence.  It sounds like this is the sort of scenario that Christian is
>>>> running into.  And, yes, with certain Vulkan drivers being a bit
>>>> sloppy about exactly when they throw in write fences, I could see it
>>>> being a real problem.
>>> Yes this is a potential problem, and on the i915 side we need to do
>>> some shuffling here most likely. Especially due to discrete, but the
>>> problem is pre-existing. tbh I forgot about the implications here
>>> until I pondered this again yesterday evening.
>>>
>>> But afaiui the amdgpu code and winsys in mesa, this isn't (yet) the
>>> problem amd vk drivers have. The issue is that with amdgpu, all you
>>> supply are the following bits at CS time:
>>> - list of always mapped private buffers, which is implicit and O(1) in
>>> the kernel fastpath
>>> - additional list of shared buffers that are used by the current CS
>>>
>>> I didn't check how exactly that works wrt winsys buffer ownership, but
>>> the thing is that on the kernel side _any_ buffer in there is treated
>>> as a implicit sync'ed write. Which means if you render your winsys
>>> with a bunch of command submission split over 3d and compute pipes,
>>> you end up with horrendous amounts of oversync.
>> What are you talking about? We have no sync at all for submissions from
>> the same client.
> Yes. Except when the buffer is shared with another driver, at which
> point you sync a _lot_ and feel the pain.

Yes, exactly that's the problem.

We basically don't know during CS if a BO is shared or not.

We do know that during importing or exporting the BO though.

> Or I'm not understanding at all what your patch series does and why
> it's improving anything, specifically the last patch for amdgpu.
>
> So please explain how this all adds up.
>
>>> The reason for this is that amdgpu decided to go with a different
>>> implicit sync model than everyone else:
>>> - within an drm file everything is unsynced and left to userspace to
>>> handle, amdgpu.ko only ever sets the shared fence slots.
> >>> - this means the exclusive slot really is exclusive to memory management
>>> issues, which side-steps the issue you point out above
>>> - for anything cross-device they unconditionally wait for any shared
>>> fence which is by another process
>>>
>>> Works, except it's incompatible with what everyone else is doing, so
>>> had to be papered over by the current massive oversync solution.
> >> Well actually it is only i915 I care about which is working differently.
> It's also anything that uses the atomic commit helpers. They _all_
> expect the exclusive fence to be set for the last write. i915 is the
> odd one out here by having its own commit helpers still, most other
> atomic drivers moved over to the helper version. But since I wrote
> both I think you can trust me that they work the same :-)
>
>> Radeon works the same way as amdgpu by waiting for everything before
>> doing command submission or pageflip.
> The thing is, we don't want to wait for everything. We only want to
> wait for the last writer, not for maybe a readback job or something
> else. And this isn't just about atomic flip, it's for any
> cross-device/process dma-buf sharing.

Well exactly that's the problem. In amdgpu we do want to wait for 
multiple fences, but not for page table updates (for example).

That also one of the reasons why the approach with adding an exclusive 
fence was never an option here.

> There's essentially two worlds we have here:
> - drivers which work like i915, where exclusive slot is for implicit
> sync, and shared is just for "I'm using this"
> - amdgpu (you claim more, I'm honestly not so sure since you only
> fixed amdgpu and i915 display), where all access is in the shared
> slots, and then on cross-* sync you want to sync with all of them.

Well we also have radeon and nouveau which are basically lying to the 
kernel when they say that a BO is only read accessed to allow different 
submissions to the MM and graphics engine to run in parallel.

> These two aren't compatible.
>
> Also please keep in mind that neither radeon nor nouveau have a vulkan
> driver, so pretty sure they haven't had to solve this problem much
> yet.

Not the Vulkan problem, but the MM engine and GFX engine need to access 
the same BO with both reads and writes at the same time problem.

> Also I just reviewed nouveau, nouveau_bo_fence() says your wrong with
> your claim, it sets the exclusive fence when userspace indicates a
> write domain.
>
> Also I looked at radeon, assuming I didn't get lost this seems to
> indicate radeon also works like I think it should:
>
>          p->relocs[i].tv.num_shared = !r->write_domain;
>
> ttm_eu_fence_buffer_objects() then picks that up and sets the right
> fence for radeon_cs.c code.

Yes, your observation is correct. The problem is only that both nouveau 
and radeon are lying to the kernel (or at least they used to).

We just never ran into problems because neither driver can share BOs 
containing NV12 pictures directly with other drivers.

>>> First step in fixing that is (and frankly was since years) to fix the
>>> amdgpu CS so winsys can pass along a bunch of flags about which CS
>>> should actually set the exclusive fence, so that you stop oversyncing
>>> so badly. Ofc old userspace needs to keep oversyncing forever, no way
>>> to fix that.
>> Exactly that is what we don't want to do because the winsys has no idea
>> when to sync and when not to sync.
> Uh ... so why exactly can anv do it? And turnip and a few others?
> What's the precise case where the winsys can't do the sync itself,
> because it has no idea what's going on, but somehow the kernel can?
> Can you please explain this, because we're definitely talking past
> each another here. I really don't see any case where the kernel has
> additional information than the userspace drivers here. But there's
> lots of cases where userspace definitely knows more.

The kernel knows when a BO is used by a different process and can add 
the proper inter process synchronization there.

>> The kernel on the other hand perfectly knows that.
>>
>>> Instead what Christian patch set here does is move amdgpu back to the
>>> dma_resv contract it prefers, break everything else and then fix up
>>> i915 atomic path so that the one use case that originally highlighted
>>> the mismatch here works again. Which hrm .... no :-)
>>>
>>> I think the reason this wasn't ever a pressing issue is that amdgpu.ko
>>> only does this for buffers shared across devices, so in most cases you
>>> don't suffer from the terribly oversync. Conceptually it's still all
>>> there.
>>>
>>>> The solution I *think* Christian is proposing is basically to have
>>>> four categories of fences instead of two: exclusive, weak (shared with
>>>> no r/w), read, and write.  (No, I didn't include r/w but that's the
>>>> same as write-only when it comes to hazards.)  Then a bunch of flags
>>>> and helpers to be able to handle the interactions between the three
>>>> types of shared fences.  Honestly, this is something I've considered
>>>> as I've wrestled with these problems in the past.  That said....
>>>>
>>>>    1. In GL, we can make the read/write information accurate and never
>>>> over/under sync.
>>>>
>>>>    2. In the future ANV model I described earlier, this isn't a problem.
>>>> It throws in a write-fence exactly once per frame.  It actually
>>>> under-synchronizes but in a safe way.  I think that mostly makes the
>>>> problem go away in practice.
>>>>
>>>>    3. If the core issue here really is memory vs. execution sync as I've
>>>> said, maybe we really are papering over something by continuing to mix
>>>> them.  Do we really want four fence types or do we want two orthogonal
>>>> fence types?
>>> Now once amdgpu.ko is fixed, we still have the problem of mixing up
>>> the exclusive fence for implicit sync with the exclusive fence for
>>> memory management. And for that we can and probably should figure out
>>> what to do there. But that still requires that amdgpu CS first learns
>>> what's actually going on from userspace, and secondly, that we do this
>>> addition in a way which is compatible with current dma_resv users
>>> (i.e. all drivers currently asking for an exclusive fence need to pick
>>> up both types of exclusive fences if we decide to split them).
>>>> I think I've convinced myself that the problem is real, but not that
>>>> this solution is correct.
>>> Yeah there's definitely some problems here, but Christian hasn't
>>> really explained which one he's trying to solve, so we're also running
>>> a bit in a circle trying to guess what's what :-/
>> Well how can I help with that?
>>
>> Jason seems to have the perfect understanding why we have those problems.
> See my other reply. I think aside from dma-buf p2p we don't actually
> have a problem?
>
>> And as long as we are all inside amdgpu we also don't have any oversync,
>> the issue only happens when we share dma-bufs with i915 (radeon and
>> AFAIK nouveau does the right thing as well).
> Yeah because then you can't use the amdgpu dma_resv model anymore and
> have to use the one atomic helpers use. Which is also the one that
> e.g. Jason is threatening to bake in as uapi with his dma_buf ioctl,
> so as soon as that lands and someone starts using it, something has to
> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
> shared with another device.

Yeah, and that is exactly the reason why I will NAK this uAPI change.

This doesn't work for amdgpu at all for the reasons outlined above.

> So the way I see things right now:
> - exclusive fence slot is for implicit sync. kmd should only set it
> when userspace indicates, otherwise you will suffer. Explicit syncing
> userspace needs to tell the kernel with a flag in the CS ioctl when it
> should sync against this exclusive fence and when it should ignore it,
> otherwise you'll suffer badly once more.

That is not sufficient. The exclusive fence slot is for kernel internal 
memory management.

E.g. every access needs to sync to it and we can't allow ignoring it by 
specifying a userspace flag.

> - no funny tricks with not doing this when it's just internally in
> your driver, because the more uapi we build on top of dma-buf fd the
> harder this will break. amdgpu gets to keep some nasty tricks going
> here until appropriate uapi is finally rolled out, but should stop
> this asap.

That is really not going to happen. The kernel is the only place where 
you can do proper implicit synchronization between processes.

> - ttm_bo->moving is the fence for stuff you're not allowed to ignore.
> Probably need to move that to dma_resv for p2p dma-buf, not sure on
> that yet.

Well that's at least some possibility. But I would do it the other way 
around, the exclusive fence stays what it is and you add another 
implicit sync fence.

Regards,
Christian.

>
> After that I think we can look at what exact oversync issue remains
> and why and solve it, but until we have this this just feels like
> another rehash of "amdgpu insists its own dma_resv interpretation is the
> right one and everyone else should move over".
>
> Or maybe I've just become real garbage at reading random driver code,
> wouldn't be the first time :-)
>
> Cheers, Daniel
>
>> Regards,
>> Christian.
>>
>>> Cheers, Daniel
>>>
>>>> --Jason
>>>>
>>>>
>>>>>> That's also the reason the Valve guys came up with a solution where each
>>>>>> BO gets a flag for explicit sync, but that only works for exports and
>>>>>> not for imports.
>>>>>>
>>>>>>> I915 and iirc msm has explicit flags for this, panfrost was designed to
>>>>>>> support this correctly from the start (also with flags I think). That's at
>>>>>>> least what I remember from all the discussions at XDC and #dri-devel, but
>>>>>>> didn't check the code again to give you the list of uapi flags you need
>>>>>>> for each driver.
>>>>>>>
>>>>>>> The other piece is making sure you're only picking up implicit fences when
>>>>>>> you should, and not any later ones, for which Jason has a solution:
>>>>>>>
>>>>>>> https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/
>>>>>> Yes, I helped with that as well. But I think that this is just another
>>>>>> workaround without really addressing the underlying problem.
>>>>>>
>>>>>>> If amdgpu isn't using those, then you will suffer from
>>>>>>> over-synchronization in vulkan and pay a price. The entire point of vulkan
>>>>>>> is that you pick up sync points very explicitly, and we also need to have
>>>>>>> very explicit uapi for userspace to pick up/set the implicit fences.
>>>>>>>
>>>>>>> Trying to paper over this with more implicit magic is imo just wrong, and
>>>>>>> definitely not the long term explicit sync model we want.
>>>>>> I completely disagree.
>>>>>>
>>>>>> In my opinion the implicit sync model we have for dma_resv currently is
>>>>>> just not well designed at all, since it always requires cooperation from
>>>>>> userspace.
>>>>>>
>>>>>> In other words you need to know when to enable implicit sync in
>>>>>> userspace and that information is simply not present all of the time.
>>>>>>
>>>>>> What we have done here is just keeping the old reader/writer flags i915,
>>>>>> radeon and nouveau once had and pushed that out to everybody else making
>>>>>> the assumption that everybody would follow that without documenting the
>>>>>> actual rules of engagement you need to follow here.
>>>>>>
>>>>>> That was a really big mistake and we should try to fix that sooner or
>>>>>> later. The only other clean alternative I see is to use a flag on the
>>>>>> exporter to tell the importer if it should sync to shared fences or not.
>>>>>>
>>>>>> Additional to that I'm perfectly fine with implicit sync. Explicit sync
>>>>>> certainly has some use cases as well, but I don't see it as an absolute
>>>>>> advantage over the implicit model.
>>>>> Ok this stops making sense. Somehow you claim userspace doesn't know
>>>>> when to sync, but somehow the kernel does? By guessing, and getting it
>>>>> wrong mostly, except for the one case that you benchmarked?
>>>>>
>>>>> Aside from silly userspace which exports a buffer to a dma-buf, but
>>>>> then never imports it anywhere else, there isn't a case I know of
>>>>> where the kernel actually knows more than userspace. But there's lots
>>>>> of cases where the kernel definitely knows less, especially if
>>>>> userspace doesn't tell it about what's going on with each rendering
>>>>> and buffer.
>>>>>
>>>>> So here's the 2 things you need to make this work like every other driver:
>>>>>
>>>>> 1. A way to set the explicit fence on a buffer. CS ioctl is perfectly
>>>>> fine, but also can be seperate. Userspace uses this only on a) shared
>>>>> buffers b) when there's a flush/swap on that shared buffer. Not when
>>>>> rendering any of the interim stuff, that only leads to oversync.
>>>>> Anything non-shared is handled explicitly in userspace (at least for
>>>>> modern-ish drivers). This is the only thing that ever sets an
>>>>> exclusive fence (aside from ttm moving buffers around ofc).
>>>>>
>>>>> 2. A way to sync with the implicit fences, either all of them (for
>>>>> upcoming write access) or just the write fence (for read access). At
>>>>> first we thought it's good enough to do this in the CS ioctl, but
>>>>> that's a wee bit too late, hence the patches from Jason. My
>>>>> understanding is that vulkan converts this into an vk syncobj/fence of
>>>>> some sorts, so really can't make this more explicit and intentional
>>>>> than that.
>>>>>
>>>>> None of this is something the kernel has the slightest idea about when
>>>>> it happens, so you have to have explicit uapi for it. Trying to fake
>>>>> it in the kernel just doesn't work.
>>>>> -Daniel
>>>>> --
>>>>> Daniel Vetter
>>>>> Software Engineer, Intel Corporation
>>>>> http://blog.ffwll.ch
>>>
>>> --
>>> Daniel Vetter
>>> Software Engineer, Intel Corporation
>>> http://blog.ffwll.ch
>


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-18 17:40               ` Christian König
@ 2021-05-18 21:17                 ` Daniel Vetter
  2021-05-18 22:06                   ` Jason Ekstrand
  2021-05-19 11:24                   ` Christian König
  2021-05-18 21:31                 ` Dave Airlie
  1 sibling, 2 replies; 50+ messages in thread
From: Daniel Vetter @ 2021-05-18 21:17 UTC (permalink / raw)
  To: Christian König
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, dri-devel, Jason Ekstrand

On Tue, May 18, 2021 at 7:40 PM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
> > On Tue, May 18, 2021 at 2:49 PM Christian König
> > <ckoenig.leichtzumerken@gmail.com> wrote:
> >> Hi Jason & Daniel,
> >>
> >> Am 18.05.21 um 07:59 schrieb Daniel Vetter:
> >>> On Tue, May 18, 2021 at 12:49 AM Jason Ekstrand <jason@jlekstrand.net> wrote:
> >>>> On Mon, May 17, 2021 at 3:15 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >>>>> On Mon, May 17, 2021 at 9:38 PM Christian König
> >>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> >>>>>> Am 17.05.21 um 17:04 schrieb Daniel Vetter:
> >>>>>>> On Mon, May 17, 2021 at 04:11:18PM +0200, Christian König wrote:
> >>>>>>>> We had a long outstanding problem in amdgpu that buffers exported to
> >>>>>>>> user drivers by DMA-buf serialize all command submissions using them.
> >>>>>>>>
> >>>>>>>> In other words we can't compose the buffer with different engines and
> >>>>>>>> then send it to another driver for display or further processing.
> >>>>>>>>
> >>>>>>>> This was added to work around the fact that i915 didn't want to wait
> >>>>>>>> for shared fences in the dma_resv objects before displaying a buffer.
> >>>>>>>>
> >>>>>>>> Since this problem is now causing issues with Vulkan we need to find a
> >>>>>>>> better solution for that.
> >>>>>>>>
> >>>>>>>> The patch set here tries to do this by adding an usage flag to the
> >>>>>>>> shared fences noting when and how they should participate in implicit
> >>>>>>>> synchronization.
> >>>>>>> So the way this is fixed in every other vulkan driver is that vulkan
> >>>>>>> userspace sets flags in the CS ioctl when it wants to synchronize with
> >>>>>>> implicit sync. This gets you mostly there. Last time I checked amdgpu
> >>>>>>> isn't doing this, and yes that's broken.
> >>>>>> And exactly that is a really bad approach as far as I can see. The
> >>>>>> Vulkan stack on top simply doesn't know when to set this flag during CS.
> >>>>> Adding Jason for the Vulkan side of things, because this isn't how I
> >>>>> understand this works.
> >>>>>
> >>>>> But purely form a kernel pov your patches are sketchy for two reasons:
> >>>>>
> >>>>> - we reinstate the amdgpu special case of not setting exclusive fences
> >>>>>
> >>>>> - you only fix the single special case of i915 display, nothing else
> >>>>>
> >>>>> That's not how a cross driver interface works. And if you'd do this
> >>>>> properly, you'd be back to all the same sync fun you've orignally had,
> >>>>> with all the same fallout.
> >>>> I think I'm starting to see what Christian is trying to do here and I
> >>>> think there likely is a real genuine problem here.  I'm not convinced
> >>>> this is 100% of a solution but there might be something real.  Let me
> >>>> see if I can convince you or if I just make a hash of things. :-)
> >>>>
> >>>> The problem, once again, comes down to memory fencing vs. execution
> >>>> fencing and the way that we've unfortunately tied them together in the
> >>>> kernel.  With the current architecture, the only way to get proper
> >>>> write-fence semantics for implicit sync is to take an exclusive fence
> >>>> on the buffer.  This implies two things:
> >>>>
> >>>>    1. You have to implicitly wait on EVERY fence on the buffer before
> >>>> you can start your write-fenced operation
> >>>>
> >>>>    2. No one else can start ANY operation which accesses that buffer
> >>>> until you're done.
> >> Yes, exactly that. You absolutely nailed it.
> >>
> >> I unfortunately also have a 3rd use case:
> >>
> >> 3. Operations which shouldn't participate in any syncing, but only
> >> affect the memory management.
> >>
> >> This is basically our heavyweight TLB flush after unmapping the BO from
> >> somebody's page tables. Nobody should ever be concerned about it for any
> >> form of synchronization, but memory management is not allowed to reuse or
> >> move the buffer before the operation is completed.
> > Isn't that just another case of 2? Or I'm not getting it.
>
> The problem in this case is not starting a new CS, but synchronizing to
> the existing ones.
>
> See a heavy TLB flush is made completely out of sync. E.g. it doesn't
> want to wait for any previous operation.
>
> In other words imagine the following example:
> 1. Both process A and B have a BO mapped.
> 2. Process A is heavily using the BO and doing all kind of rendering.
> 3. Process B is unmapping the BO.
>
> Now that process B unmaps the BO needs to trigger page table updates and
> a heavy TLB flush, but since this can take really long we want to do it
> asynchronously on the hardware.
>
> With the current approach you basically can't do that because you can't
> note that a fence should not participate in synchronization at all.
>
> E.g. we can't add a fence which doesn't wait for the exclusive one as
> shared.

Ok I think that's a real problem, and I guess it's also related to all
the ttm privatization tricks and all that. So essentially we'd need
the opposite of ttm_bo->moving, as in you can't ignore it, but
otherwise it completely ignores all the userspace implicit fence
stuff.

> >>>> Let's say that you have a buffer which is shared between two drivers A
> >>>> and B and let's say driver A has thrown a fence on it just to ensure
> >>>> that the BO doesn't get swapped out to disk until it's at a good
> >>>> stopping point.  Then driver B comes along and wants to throw a
> >>>> write-fence on it.  Suddenly, your memory fence from driver A causes
> >>>> driver B to have to stall waiting for a "good" time to throw in a
> >>>> fence.  It sounds like this is the sort of scenario that Christian is
> >>>> running into.  And, yes, with certain Vulkan drivers being a bit
> >>>> sloppy about exactly when they throw in write fences, I could see it
> >>>> being a real problem.
> >>> Yes this is a potential problem, and on the i915 side we need to do
> >>> some shuffling here most likely. Especially due to discrete, but the
> >>> problem is pre-existing. tbh I forgot about the implications here
> >>> until I pondered this again yesterday evening.
> >>>
> >>> But afaiui the amdgpu code and winsys in mesa, this isn't (yet) the
> >>> problem amd vk drivers have. The issue is that with amdgpu, all you
> >>> supply are the following bits at CS time:
> >>> - list of always mapped private buffers, which is implicit and O(1) in
> >>> the kernel fastpath
> >>> - additional list of shared buffers that are used by the current CS
> >>>
> >>> I didn't check how exactly that works wrt winsys buffer ownership, but
> >>> the thing is that on the kernel side _any_ buffer in there is treated
> >>> as a implicit sync'ed write. Which means if you render your winsys
> >>> with a bunch of command submission split over 3d and compute pipes,
> >>> you end up with horrendous amounts of oversync.
> >> What are you talking about? We have no sync at all for submissions from
> >> the same client.
> > Yes. Except when the buffer is shared with another driver, at which
> > point you sync a _lot_ and feel the pain.
>
> Yes, exactly that's the problem.
>
> We basically don't know during CS if a BO is shared or not.
>
> We do know that during importing or exporting the BO though.

No you don't. Or at least that's massively awkward, see Jason's reply.

> > Or I'm not understanding at all what your patch series does and why
> > it's improving anything, specifically the last patch for amdgpu.
> >
> > So please explain how this all adds up.
> >
> >>> The reason for this is that amdgpu decided to go with a different
> >>> implicit sync model than everyone else:
> >>> - within an drm file everything is unsynced and left to userspace to
> >>> handle, amdgpu.ko only ever sets the shared fence slots.
> >>> - this means the exclusive slot really is exclusive to memory
> >>> management issues, which side-steps the issue you point out above
> >>> - for anything cross-device they unconditionally wait for any shared
> >>> fence which is by another process
> >>>
> >>> Works, except it's incompatible with what everyone else is doing, so
> >>> had to be papered over by the current massive oversync solution.
> >> Well actually it is only i915 I care of which is working differently.
> > It's also anything that uses the atomic commit helpers. They _all_
> > expect the exclusive fence to be set for the last write. i915 is the
> > odd one out here by having its own commit helpers still, most other
> > atomic drivers moved over to the helper version. But since I wrote
> > both I think you can trust me that they work the same :-)
> >
> >> Radeon works the same way as amdgpu by waiting for everything before
> >> doing command submission or pageflip.
> > The thing is, we don't want to wait for everything. We only want to
> > wait for the last writer, not for maybe a readback job or something
> > else. And this isn't just about atomic flip, it's for any
> > cross-device/process dma-buf sharing.
>
> Well exactly that's the problem. In amdgpu we do want to wait for
> multiple fences, but not for page table updates (for example).
>
> That also one of the reasons why the approach with adding an exclusive
> fence was never an option here.

Lying to the kernel is ok. That's the entire point I'm trying to get
across. And amdgpu needs to gain some uapi to make that lying
possible.

Also it's not lying, it's how this stuff works:
- For anything you don't share, you _never_ set the write flag.
Userspace takes care of any fencing needs itself. You also tell the
kernel to _always_ ignore any exclusive fence it sets.
- For sharing you set the write flag, but _only_ at handover points.
Same when you synchronize with other access, you do that once at the
handover point, and then you tell the kernel to ignore the exclusive
fence everywhere else.

Essentially you treat implicit sync not as something magic, but as a
very screwed up IPC mechanism for explicit fences.

Again this isn't lying, it's how it works. The kernel cannot and must
not rely on userspace telling the truth (couldn't check it without a
cmd parser), so the only thing you can use the write flag respectively
exclusive fence is as an IPC slot for fences.

Use it like IPC, not like a shotgun approach of "maybe we should set a
fence and let the kernel sort out the mess".

> > There's essentially two worlds we have here:
> > - drivers which work like i915, where exclusive slot is for implicit
> > sync, and shared is just for "I'm using this"
> > - amdgpu (you claim more, I'm honestly not so sure since you only
> > fixed amdgpu and i915 display), where all access is in the shared
> > slots, and then on cross-* sync you want to sync with all of them.
>
> Well we also have radeon and nouveau which are basically lying to the
> kernel when they say that a BO is only read accessed to allow different
> submissions to the MM and graphics engine to run in parallel.

Again, lying is how this works. amdgpu needs to learn to lie too.

> > These two aren't compatible.
> >
> > Also please keep in mind that neither radeon nor nouveau have a vulkan
> > driver, so pretty sure they haven't had to solve this problem much
> > yet.
>
> Not the Vulkan problem, but the MM engine and GFX engine need to access
> the same BO with both reads and writes at the same time problem.
>
> > Also I just reviewed nouveau, nouveau_bo_fence() says your wrong with
> > your claim, it sets the exclusive fence when userspace indicates a
> > write domain.
> >
> > Also I looked at radeon, assuming I didn't get lost this seems to
> > indicate radeon also works like I think it should:
> >
> >          p->relocs[i].tv.num_shared = !r->write_domain;
> >
> > ttm_eu_fence_buffer_objects() then picks that up and sets the right
> > fence for radeon_cs.c code.
>
> Yes, your observation is correct. The problem is only that both nouveau
> and radeon are lying to the kernel (or at least they used to).
>
> We just never ran into problems because neither driver can share BOs
> containing NV12 pictures directly with other drivers.

Hm care to explain? Why is NV12 special?

> >>> First step in fixing that is (and frankly was since years) to fix the
> >>> amdgpu CS so winsys can pass along a bunch of flags about which CS
> >>> should actually set the exclusive fence, so that you stop oversyncing
> >>> so badly. Ofc old userspace needs to keep oversyncing forever, no way
> >>> to fix that.
> >> Exactly that is what we don't want to do because the winsys has no idea
> >> when to sync and when not to sync.
> > Uh ... so why exactly can anv do it? And turnip and a few others?
> > What's the precise case where the winsys can't do the sync itself,
> > because it has no idea what's going on, but somehow the kernel can?
> > Can you please explain this, because we're definitely talking past
> > each another here. I really don't see any case where the kernel has
> > additional information than the userspace drivers here. But there's
> > lots of cases where userspace definitely knows more.
>
> The kernel knows when a BO is used by a different process and can add
> the proper inter process synchronization there.

Yeah but why does your userspace not know when a bo is used?

Or very bluntly, why cant radv do what anv does (or amdvlk if you care
more about that, it's the same)? What's missing with lots of blatant
lying?

> >> The kernel on the other hand perfectly knows that.
> >>
> >>> Instead what Christian patch set here does is move amdgpu back to the
> >>> dma_resv contract it prefers, break everything else and then fix up
> >>> i915 atomic path so that the one use case that originally highlighted
> >>> the mismatch here works again. Which hrm .... no :-)
> >>>
> >>> I think the reason this wasn't ever a pressing issue is that amdgpu.ko
> >>> only does this for buffers shared across devices, so in most cases you
> >>> don't suffer from the terribly oversync. Conceptually it's still all
> >>> there.
> >>>
> >>>> The solution I *think* Christian is proposing is basically to have
> >>>> four categories of fences instead of two: exclusive, weak (shared with
> >>>> no r/w), read, and write.  (No, I didn't include r/w but that's the
> >>>> same as write-only when it comes to hazards.)  Then a bunch of flags
> >>>> and helpers to be able to handle the interactions between the three
> >>>> types of shared fences.  Honestly, this is something I've considered
> >>>> as I've wrestled with these problems in the past.  That said....
> >>>>
> >>>>    1. In GL, we can make the read/write information accurate and never
> >>>> over/under sync.
> >>>>
> >>>>    2. In the future ANV model I described earlier, this isn't a problem.
> >>>> It throws in a write-fence exactly once per frame.  It actually
> >>>> under-synchronizes but in a safe way.  I think that mostly makes the
> >>>> problem go away in practice.
> >>>>
> >>>>    3. If the core issue here really is memory vs. execution sync as I've
> >>>> said, maybe we really are papering over something by continuing to mix
> >>>> them.  Do we really want four fence types or do we want two orthogonal
> >>>> fence types?
> >>> Now once amdgpu.ko is fixed, we still have the problem of mixing up
> >>> the exclusive fence for implicit sync with the exclusive fence for
> >>> memory management. And for that we can and probably should figure out
> >>> what to do there. But that still requires that amdgpu CS first learns
> >>> what's actually going on from userspace, and secondly, that we do this
> >>> addition in a way which is compatible with current dma_resv users
> >>> (i.e. all drivers currently asking for an exclusive fence need to pick
> >>> up both types of exclusive fences if we decide to split them).
> >>>> I think I've convinced myself that the problem is real, but not that
> >>>> this solution is correct.
> >>> Yeah there's definitely some problems here, but Christian hasn't
> >>> really explained which one he's trying to solve, so we're also running
> >>> a bit in a circle trying to guess what's what :-/
> >> Well how can I help with that?
> >>
> >> Jason seems to have the perfect understanding why we have those problems.
> > See my other reply. I think aside from dma-buf p2p we don't actually
> > have a problem?
> >
> >> And as long as we are all inside amdgpu we also don't have any oversync,
> >> the issue only happens when we share dma-bufs with i915 (radeon and
> >> AFAIK nouveau does the right thing as well).
> > Yeah because then you can't use the amdgpu dma_resv model anymore and
> > have to use the one atomic helpers use. Which is also the one that
> > e.g. Jason is threatening to bake in as uapi with his dma_buf ioctl,
> > so as soon as that lands and someone starts using it, something has to
> > adapt _anytime_ you have a dma-buf hanging around. Not just when it's
> > shared with another device.
>
> Yeah, and that is exactly the reason why I will NAK this uAPI change.
>
> This doesn't works for amdgpu at all for the reasons outlined above.

Uh that's really not how uapi works. "my driver is right, everyone
else is wrong" is not how cross driver contracts are defined. If that
means a perf impact until you've fixed your rules, that's on you.

Also you're a few years too late with nacking this, it's already uapi
in the form of the dma-buf poll() support.

> > So the way I see things right now:
> > - exclusive fence slot is for implicit sync. kmd should only set it
> > when userspace indicates, otherwise you will suffer. Explicit syncing
> > userspace needs to tell the kernel with a flag in the CS ioctl when it
> > should sync against this exclusive fence and when it should ignore it,
> > otherwise you'll suffer badly once more.
>
> That is not sufficient. The explicit sync slot is for kernel internal
> memory management.

Then we need to split it. But what I discussed with Thomas Hellstrom
is that at least for anything except p2p dma-buf ttm_bo->moving should
be enough.

> E.g. every access needs to sync to it and we can't allow to ignore it by
> specifying an userspace flag.
>
> > - no funny tricks with not doing this when it's just internally in
> > your driver, because the more uapi we build on top of dma-buf fd the
> > harder this will break. amdgpu gets to keep some nasty tricks going
> > here until appropriate uapi is finally rolled out, but should stop
> > this asap.
>
> That is really not going to happen. The kernel is the only place where
> you can do proper implicit synchronization between processes.

I think you need to refute Jason's mail here with a bit more detail
than just a claim that this is so.

> > - ttm_bo->moving is the fence for stuff you're not allowed to ignore.
> > Probably need to move that to dma_resv for p2p dma-buf, not sure on
> > that yet.
>
> Well that's at least some possibility. But I would do it the other way
> around, the exclusive fence stays what it is and you add another
> implicit sync fence.

Can we please stop with the "amdgpu is right, everyone else is wrong" approach?

Like I'm pretty much going to type up the patch that does a full drm
subsytem audit of everything and whack amdgpu into compliance. Perf
hit be damned, you had a few years to fix this with better uapi. Or I
find out that there's a giant inconsistent mess, but at least we'd
gain some clarity about where exactly we are here and maybe what to do
next.
-Daniel

>
> Regards,
> Christian.
>
> >
> > After that I think we can look at what exact oversync issue remains
> > and why and solve it, but until we have this this just feels like
> > another rehash of "amdgpu insists its own dma_resv interpretation is the
> > right one and everyone else should move one over".
> >
> > Or maybe I've just become real garbage at reading random driver code,
> > wouldn't be the first time :-)
> >
> > Cheers, Daniel
> >
> >> Regards,
> >> Christian.
> >>
> >>> Cheers, Daniel
> >>>
> >>>> --Jason
> >>>>
> >>>>
> >>>>>> That's also the reason the Valve guys came up with a solution where each
> >>>>>> BO gets a flag for explicit sync, but that only works for exports and
> >>>>>> not for imports.
> >>>>>>
> >>>>>>> I915 and iirc msm has explicit flags for this, panfrost was designed to
> >>>>>>> support this correctly from the start (also with flags I think). That's at
> >>>>>>> least what I remember from all the discussions at XDC and #dri-devel, but
> >>>>>>> didn't check the code again to give you the list of uapi flags you need
> >>>>>>> for each driver.
> >>>>>>>
> >>>>>>> The other piece is making sure you're only picking up implicit fences when
> >>>>>>> you should, and not any later ones, for which Jason has a solution:
> >>>>>>>
> >>>>>>> https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/
> >>>>>> Yes, I helped with that as well. But I think that this is just another
> >>>>>> workaround without really addressing the underlying problem.
> >>>>>>
> >>>>>>> If amdgpu isn't using those, then you will suffer from
> >>>>>>> over-synchronization in vulkan and pay a price. The entire point of vulkan
> >>>>>>> is that you pick up sync points very explicitly, and we also need to have
> >>>>>>> very explicit uapi for userspace to pick up/set the implicit fences.
> >>>>>>>
> >>>>>>> Trying to paper over this with more implicit magic is imo just wrong, and
> >>>>>>> definitely not the long term explicit sync model we want.
> >>>>>> I completely disagree.
> >>>>>>
> >>>>>> In my opinion the implicit sync model we have for dma_resv currently is
> >>>>>> just not well designed at all, since it always requires cooperation from
> >>>>>> userspace.
> >>>>>>
> >>>>>> In other words you need to know when to enable implicit sync in
> >>>>>> userspace and that information is simply not present all of the time.
> >>>>>>
> >>>>>> What we have done here is just keeping the old reader/writer flags i915,
> >>>>>> radeon and nouveau once had and pushed that out to everybody else making
> >>>>>> the assumption that everybody would follow that without documenting the
> >>>>>> actual rules of engagement you need to follow here.
> >>>>>>
> >>>>>> That was a really big mistake and we should try to fix that sooner or
> >>>>>> later. The only other clean alternative I see is to use a flag on the
> >>>>>> exporter to tell the importer if it should sync to shared fences or not.
> >>>>>>
> >>>>>> Additional to that I'm perfectly fine with implicit sync. Explicit sync
> >>>>>> certainly has some use cases as well, but I don't see it as an absolute
> >>>>>> advantage over the implicit model.
> >>>>> Ok this stops making sense. Somehow you claim userspace doesn't know
> >>>>> when to sync, but somehow the kernel does? By guessing, and getting it
> >>>>> wrong mostly, except for the one case that you benchmarked?
> >>>>>
> >>>>> Aside from silly userspace which exports a buffer to a dma-buf, but
> >>>>> then never imports it anywhere else, there isn't a case I know of
> >>>>> where the kernel actually knows more than userspace. But there's lots
> >>>>> of cases where the kernel definitely knows less, especially if
> >>>>> userspace doesn't tell it about what's going on with each rendering
> >>>>> and buffer.
> >>>>>
> >>>>> So here's the 2 things you need to make this work like every other driver:
> >>>>>
> >>>>> 1. A way to set the explicit fence on a buffer. CS ioctl is perfectly
> >>>>> fine, but also can be separate. Userspace uses this only on a) shared
> >>>>> buffers b) when there's a flush/swap on that shared buffer. Not when
> >>>>> rendering any of the interim stuff, that only leads to oversync.
> >>>>> Anything non-shared is handled explicitly in userspace (at least for
> >>>>> modern-ish drivers). This is the only thing that ever sets an
> >>>>> exclusive fence (aside from ttm moving buffers around ofc).
> >>>>>
> >>>>> 2. A way to sync with the implicit fences, either all of them (for
> >>>>> upcoming write access) or just the write fence (for read access). At
> >>>>> first we thought it's good enough to do this in the CS ioctl, but
> >>>>> that's a wee bit too late, hence the patches from Jason. My
> >>>>> understanding is that vulkan converts this into an vk syncobj/fence of
> >>>>> some sorts, so really can't make this more explicit and intentional
> >>>>> than that.
> >>>>>
> >>>>> None of this is something the kernel has the slightest idea about when
> >>>>> it happens, so you have to have explicit uapi for it. Trying to fake
> >>>>> it in the kernel just doesn't work.
> >>>>> -Daniel
> >>>>> --
> >>>>> Daniel Vetter
> >>>>> Software Engineer, Intel Corporation
> >>>>> http://blog.ffwll.ch
> >>>
> >>> --
> >>> Daniel Vetter
> >>> Software Engineer, Intel Corporation
> >>> http://blog.ffwll.ch
> >
>


-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-18 17:40               ` Christian König
  2021-05-18 21:17                 ` Daniel Vetter
@ 2021-05-18 21:31                 ` Dave Airlie
  1 sibling, 0 replies; 50+ messages in thread
From: Dave Airlie @ 2021-05-18 21:31 UTC (permalink / raw)
  To: Christian König
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, Jason Ekstrand, dri-devel

>
> We basically don't know during CS if a BO is shared or not.

Who doesn't know? We should be able to track this quite easily,
userspace either imports or exports buffers,
it can surely keep track of these and flag them.

Is this a userspace-might-lie-to-us worry or do you have some really
broken userspace we don't know about?

Dave.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-18 21:17                 ` Daniel Vetter
@ 2021-05-18 22:06                   ` Jason Ekstrand
  2021-05-19 10:52                     ` Michel Dänzer
  2021-05-19 11:43                     ` Christian König
  2021-05-19 11:24                   ` Christian König
  1 sibling, 2 replies; 50+ messages in thread
From: Jason Ekstrand @ 2021-05-18 22:06 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, dri-devel

On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Tue, May 18, 2021 at 7:40 PM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
> >
> > Am 18.05.21 um 18:48 schrieb Daniel Vetter:
> > > On Tue, May 18, 2021 at 2:49 PM Christian König
> > > <ckoenig.leichtzumerken@gmail.com> wrote:
> > >> Hi Jason & Daniel,
> > >>
> > >> Am 18.05.21 um 07:59 schrieb Daniel Vetter:
> > >>> On Tue, May 18, 2021 at 12:49 AM Jason Ekstrand <jason@jlekstrand.net> wrote:
> > >>>> On Mon, May 17, 2021 at 3:15 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >>>>> On Mon, May 17, 2021 at 9:38 PM Christian König
> > >>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > >>>>>> Am 17.05.21 um 17:04 schrieb Daniel Vetter:
> > >>>>>>> On Mon, May 17, 2021 at 04:11:18PM +0200, Christian König wrote:
> > >>>>>>>> We had a long outstanding problem in amdgpu that buffers exported to
> > >>>>>>>> user drivers by DMA-buf serialize all command submissions using them.
> > >>>>>>>>
> > >>>>>>>> In other words we can't compose the buffer with different engines and
> > >>>>>>>> then send it to another driver for display further processing.
> > >>>>>>>>
> > >>>>>>>> This was added to work around the fact that i915 didn't wanted to wait
> > >>>>>>>> for shared fences in the dma_resv objects before displaying a buffer.
> > >>>>>>>>
> > >>>>>>>> Since this problem is now causing issues with Vulkan we need to find a
> > >>>>>>>> better solution for that.
> > >>>>>>>>
> > >>>>>>>> The patch set here tries to do this by adding an usage flag to the
> > >>>>>>>> shared fences noting when and how they should participate in implicit
> > >>>>>>>> synchronization.
> > >>>>>>> So the way this is fixed in every other vulkan driver is that vulkan
> > >>>>>>> userspace sets flags in the CS ioctl when it wants to synchronize with
> > >>>>>>> implicit sync. This gets you mostly there. Last time I checked amdgpu
> > >>>>>>> isn't doing this, and yes that's broken.
> > >>>>>> And exactly that is a really bad approach as far as I can see. The
> > >>>>>> Vulkan stack on top simply doesn't know when to set this flag during CS.
> > >>>>> Adding Jason for the Vulkan side of things, because this isn't how I
> > >>>>> understand this works.
> > >>>>>
> > >>>>> But purely form a kernel pov your patches are sketchy for two reasons:
> > >>>>>
> > >>>>> - we reinstate the amdgpu special case of not setting exclusive fences
> > >>>>>
> > >>>>> - you only fix the single special case of i915 display, nothing else
> > >>>>>
> > >>>>> That's not how a cross driver interface works. And if you'd do this
> > >>>>> properly, you'd be back to all the same sync fun you've orignally had,
> > >>>>> with all the same fallout.
> > >>>> I think I'm starting to see what Christian is trying to do here and I
> > >>>> think there likely is a real genuine problem here.  I'm not convinced
> > >>>> this is 100% of a solution but there might be something real.  Let me
> > >>>> see if I can convince you or if I just make a hash of things. :-)
> > >>>>
> > >>>> The problem, once again, comes down to memory fencing vs. execution
> > >>>> fencing and the way that we've unfortunately tied them together in the
> > >>>> kernel.  With the current architecture, the only way to get proper
> > >>>> write-fence semantics for implicit sync is to take an exclusive fence
> > >>>> on the buffer.  This implies two things:
> > >>>>
> > >>>>    1. You have to implicitly wait on EVERY fence on the buffer before
> > >>>> you can start your write-fenced operation
> > >>>>
> > >>>>    2. No one else can start ANY operation which accesses that buffer
> > >>>> until you're done.
> > >> Yes, exactly that. You absolutely nailed it.
> > >>
> > >> I unfortunately also have a 3rd use case:
> > >>
> > >> 3. Operations which shouldn't participate in any syncing, but only
> > >> affect the memory management.
> > >>
> > >> This is basically our heavyweight TLB flush after unmapping the BO from
> > >> somebodies page tables. Nobody should ever be concerned about it for any
> > >> form of synchronization, but memory managment is not allowed to reuse or
> > >> move the buffer before the operation is completed.
> > > Isn't that just another case of 2? Or I'm not getting it.
> >
> > The problem in this case is not starting a new CS, but synchronizing to
> > the existing ones.
> >
> > See a heavy TLB flush is made completely out of sync. E.g. it doesn't
> > want to wait for any previous operation.
> >
> > In other words imagine the following example:
> > 1. Both process A and B have a BO mapped.
> > 2. Process A is heavily using the BO and doing all kind of rendering.
> > 3. Process B is unmapping the BO.
> >
> > Now that process B unmaps the BO needs to trigger page table updates and
> > a heavy TLB flush, but since this can take really long we want to do it
> > asynchronously on the hardware.
> >
> > With the current approach you basically can't do that because you can't
> > note that a fence should not participate in synchronization at all.
> >
> > E.g. we can't add a fence which doesn't wait for the exclusive one as
> > shared.
>
> Ok I think that's a real problem, and  guess it's also related to all
> the ttm privatization tricks and all that. So essentially we'd need
> the opposite of ttm_bo->moving, as in you can't ignore it, but
> otherwise it completely ignores all the userspace implicit fence
> stuff.

Would you mind explaining it to the rest of the class?  I get the need
to do a TLB flush after a BO is removed from the processes address
space and I get that it may be super-heavy and that it has to be
delayed.  I also get that the driver needs to hold a reference to the
underlying pages until that TLB flush is done.  What I don't get is
what this has to do with the exclusive fence.  Why can't the driver
just gather up all the dma_resv fences on the current object (or,
better yet, just the ones from the current amdgpu process) and wait on
them all?  Why does it need to insert an exclusive fence that then
clogs up the whole works?

> > >>>> Let's say that you have a buffer which is shared between two drivers A
> > >>>> and B and let's say driver A has thrown a fence on it just to ensure
> > >>>> that the BO doesn't get swapped out to disk until it's at a good
> > >>>> stopping point.  Then driver B comes along and wants to throw a
> > >>>> write-fence on it.  Suddenly, your memory fence from driver A causes
> > >>>> driver B to have to stall waiting for a "good" time to throw in a
> > >>>> fence.  It sounds like this is the sort of scenario that Christian is
> > >>>> running into.  And, yes, with certain Vulkan drivers being a bit
> > >>>> sloppy about exactly when they throw in write fences, I could see it
> > >>>> being a real problem.
> > >>> Yes this is a potential problem, and on the i915 side we need to do
> > >>> some shuffling here most likely. Especially due to discrete, but the
> > >>> problem is pre-existing. tbh I forgot about the implications here
> > >>> until I pondered this again yesterday evening.
> > >>>
> > >>> But afaiui the amdgpu code and winsys in mesa, this isn't (yet) the
> > >>> problem amd vk drivers have. The issue is that with amdgpu, all you
> > >>> supply are the following bits at CS time:
> > >>> - list of always mapped private buffers, which is implicit and O(1) in
> > >>> the kernel fastpath
> > >>> - additional list of shared buffers that are used by the current CS
> > >>>
> > >>> I didn't check how exactly that works wrt winsys buffer ownership, but
> > >>> the thing is that on the kernel side _any_ buffer in there is treated
> > >>> as a implicit sync'ed write. Which means if you render your winsys
> > >>> with a bunch of command submission split over 3d and compute pipes,
> > >>> you end up with horrendous amounts of oversync.
> > >> What are you talking about? We have no sync at all for submissions from
> > >> the same client.
> > > Yes. Except when the buffer is shared with another driver, at which
> > > point you sync a _lot_ and feel the pain.
> >
> > Yes, exactly that's the problem.
> >
> > We basically don't know during CS if a BO is shared or not.
> >
> > We do know that during importing or exporting the BO thought.
>
> No you don't. Or at least that's massively awkward, see Jason's reply.

Please.  In Vulkan, we know explicitly whether or not any BO will ever
be shared and, if a BO is ever flagged as shared even though it's not,
that's the app being stupid and they can eat the perf hit.  In GL,
things are more wishy-washy but GL has so many stupid cases where we
have to throw a buffer away and re-allocate that one more isn't going
to be all that bad.  Even there, you could do something where you add
an in-fence to the BO export operation so that the driver knows when
to switch from the shared internal dma_resv to the external one
without having to create a new BO and copy.

> > > Or I'm not understanding at all what your patch series does and why
> > > it's improving anything, specifically the last patch for amdgpu.
> > >
> > > So please explain how this all adds up.
> > >
> > >>> The reason for this is that amdgpu decided to go with a different
> > >>> implicit sync model than everyone else:
> > >>> - within an drm file everything is unsynced and left to userspace to
> > >>> handle, amdgpu.ko only ever sets the shared fence slots.
> > >>> - this means the exclusive slot really is exclusive to memory manage
> > >>> issues, which side-steps the issue you point out above
> > >>> - for anything cross-device they unconditionally wait for any shared
> > >>> fence which is by another process
> > >>>
> > >>> Works, except it's incompatible with what everyone else is doing, so
> > >>> had to be papered over by the current massive oversync solution.
> > >> Well actually it is only i915 I care of which is working differently.
> > > It's also anything that uses the atomic commit helpers. They _all_
> > > expect the exclusive fence to be set for the last write. i915 is the
> > > odd one out here by having its own commit helpers still, most other
> > > atomic drivers moved over to the helper version. But since I wrote
> > > both I think you can trust me that they work the same :-)
> > >
> > >> Radeon works the same way as amdgpu by waiting for everything before
> > >> doing command submission or pageflip.
> > > The thing is, we don't want to wait for everything. We only want to
> > > wait for the last writer, not for maybe a readback job or something
> > > else. And this isn't just about atomic flip, it's for any
> > > cross-device/process dma-buf sharing.
> >
> > Well exactly that's the problem. In amdgpu we do want to wait for
> > multiple fences, but not for page table updates (for example).
> >
> > That also one of the reasons why the approach with adding an exclusive
> > fence was never an option here.
>
> Lying to the kernel is ok. That's the entire point I'm trying to get
> across. And amdgpu needs to gain some uapi to make that lying
> possible.
>
> Also it's not lying, it's how this stuff works:
> - For anything you don't share, you _never_ set the write flag.
> Userspace takes care of any fencing needs itself. You alos tell the
> kernel to _always_ ignore any exclusive fence it sets.
> - For sharing you set the write flag, but _only_ only handover points.
> Same when you synchronize with other access, you do that once at the
> handover point, and then you tell the kernel to ignore the exclusive
> fence everywhere else.
>
> Essentially you treat implicit sync not as something magic, but as a
> very screwed up IPC mechanism for explicit fences.
>
> Again this isn't lying, it's how it works. The kernel cannot and must
> not rely on userspace telling the truth (couldn't check it without a
> cmd parser), so the only thing you can use the write flag respectively
> exclusive fence is as an IPC slot for fences.
>
> Use it like IPC, not like a shotgun approach of "maybe we should set a
> fence and let the kernel sort out the mess".
>
> > > There's essentially two worlds we have here:
> > > - drivers which work like i915, where exclusive slot is for implicit
> > > sync, and shared is just for "I'm using this"
> > > - amdgpu (you claim more, I'm honestly not so sure since you only
> > > fixed amdgpu and i915 display), where all access is in the shard
> > > slots, and then on cross-* sync you want to sync with all of them.
> >
> > Well we also have radeon and nouveau which are basically lying to the
> > kernel when they say that a BO is only read accessed to allow different
> > submissions to the MM and graphics engine to run in parallel.
>
> Again, lying is how this works. amdgpu needs to learn to lie too.

If that's lying then ANV has been lying since the dawn of time.

> > > These two aren't compatible.
> > >
> > > Also please keep in mind that neither radeon nor nouveau have a vulkan
> > > driver, so pretty sure they haven't had to solve this problem much
> > > yet.
> >
> > Not the Vulkan problem, but the MM engine and GFX engine need to access
> > the same BO with both reads and writes at the same time problem.
> >
> > > Also I just reviewed nouveau, nouveau_bo_fence() says your wrong with
> > > your claim, it sets the exclusive fence when userspace indicates a
> > > write domain.
> > >
> > > Also I looked at radeon, assuming I didn't get lost this seems to
> > > indicate radeon also works like I think it should:
> > >
> > >          p->relocs[i].tv.num_shared = !r->write_domain;
> > >
> > > ttm_eu_fence_buffer_objects() then picks that up and sets the right
> > > fence for radeon_cs.c code.
> >
> > Yes, your observation is correct. The problem is only that both nouveau
> > and radeon are lying to the kernel (or at least they used to).
> >
> > We just never ran into problems because neither driver can share BOs
> > containing NV12 pictures directly with other drivers.
>
> Hm care to explain? Why is NV12 special?
>
> > >>> First step in fixing that is (and frankly was since years) to fix the
> > >>> amdgpu CS so winsys can pass along a bunch of flags about which CS
> > >>> should actually set the exclusive fence, so that you stop oversyncing
> > >>> so badly. Ofc old userspace needs to keep oversyncing forever, no way
> > >>> to fix that.
> > >> Exactly that is what we don't want to do because the winsys has no idea
> > >> when to sync and when not to sync.
> > > Uh ... so why exactly can anv do it? And turnip and a few others?
> > > What's the precise case where the winsys can't do the sync itself,
> > > because it has no idea what's going on, but somehow the kernel can?
> > > Can you please explain this, because we're definitely talking past
> > > each another here. I really don't see any case where the kernel has
> > > additional information than the userspace drivers here. But there's
> > > lots of cases where userspace definitely knows more.
> >
> > The kernel knows when a BO is used by a different process and can add
> > the proper inter process synchronization there.
>
> Yeah but why does your userspace not know when a bo is used?

We always know when a BO is exported because we're the ones doing the
export call.  Always.  Of course, we don't know if that BO is shared
with another driver or re-imported back into the same one but is that
really the case we're optimizing for?

> Or very bluntly, why cant radv do what anv does (or amdvlk if you care
> more about that, it's the same)? What's missing with lots of blantant
> lying?

I'm also not buying this.  You keep claiming that userspace doesn't
know but GL definitely does know and Vulkan knows well enough.  You
say that it's motivated by Vulkan and use RADV as an example but the
only reason why the RADV guys haven't followed the ANV design is to
work around limitations in amdgpu.  We shouldn't then use RADV to
justify why this is the right uAPI and why i915 is wrong.

> > >> The kernel on the other hand perfectly knows that.
> > >>
> > >>> Instead what Christian patch set here does is move amdgpu back to the
> > >>> dma_resv contract it prefers, break everything else and then fix up
> > >>> i915 atomic path so that the one use case that originally highlighted
> > >>> the mismatch here works again. Which hrm .... no :-)
> > >>>
> > >>> I think the reason this wasn't ever a pressing issue is that amdgpu.ko
> > >>> only does this for buffers shared across devices, so in most cases you
> > >>> don't suffer from the terribly oversync. Conceptually it's still all
> > >>> there.
> > >>>
> > >>>> The solution I *think* Christian is proposing is basically to have
> > >>>> four categories of fences instead of two: exclusive, weak (shared with
> > >>>> no r/w), read, and write.  (No, I didn't include r/w but that's the
> > >>>> same as write-only when it comes to hazards.)  Then a bunch of flags
> > >>>> and helpers to be able to handle the interactions between the three
> > >>>> types of shared fences.  Honestly, this is something I've considered
> > >>>> as I've wrestled with these problems in the past.  That said....
> > >>>>
> > >>>>    1. In GL, we can make the read/write information accurate and never
> > >>>> over/under sync.
> > >>>>
> > >>>>    2. In the future ANV model I described earlier, this isn't a problem.
> > >>>> It throws in a write-fence exactly once per frame.  It actually
> > >>>> under-synchronizes but in a safe way.  I think that mostly makes the
> > >>>> problem go away in practice.
> > >>>>
> > >>>>    3. If the core issue here really is memory vs. execution sync as I've
> > >>>> said, maybe we really are papering over something by continuing to mix
> > >>>> them.  Do we really want four fence types or do we want two orthogonal
> > >>>> fence types?
> > >>> Now once amdgpu.ko is fixed, we still have the problem of mixing up
> > >>> the exclusive fence for implicit sync with the exclusive fence for
> > >>> memory management. And for that we can and probably should figure out
> > >>> what to do there. But that still requires that amdgpu CS first learns
> > >>> what's actually going on from userspace, and secondly, that we do this
> > >>> addition in a way which is compatible with current dma_resv users
> > >>> (i.e. all drivers currently asking for an exclusive fence need to pick
> > >>> up both types of exclusive fences if we decide to split them).
> > >>>> I think I've convinced myself that the problem is real, but not that
> > >>>> this solution is correct.
> > >>> Yeah there's definitely some problems here, but Christian hasn't
> > >>> really explained which one he's trying to solve, so we're also running
> > >>> a bit in a circle trying to guess what's what :-/
> > >> Well how can I help with that?
> > >>
> > >> Jason seems to have the perfect understanding why we have those problems.
> > > See my other reply. I think aside from dma-buf p2p we don't actually
> > > have a problem?
> > >
> > >> And as long as we are all inside amdgpu we also don't have any oversync,
> > >> the issue only happens when we share dma-bufs with i915 (radeon and
> > >> AFAIK nouveau does the right thing as well).
> > > Yeah because then you can't use the amdgpu dma_resv model anymore and
> > > have to use the one atomic helpers use. Which is also the one that
> > > e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
> > > so as soon as that lands and someone starts using it, something has to
> > > adapt _anytime_ you have a dma-buf hanging around. Not just when it's
> > > shared with another device.
> >
> > Yeah, and that is exactly the reason why I will NAK this uAPI change.
> >
> > This doesn't works for amdgpu at all for the reasons outlined above.
>
> Uh that's really not how uapi works. "my driver is right, everyone
> else is wrong" is not how cross driver contracts are defined. If that
> means a perf impact until you've fixed your rules, that's on you.
>
> Also you're a few years too late with nacking this, it's already uapi
> in the form of the dma-buf poll() support.

^^  My fancy new ioctl doesn't expose anything that isn't already
there.  It just lets you take a snap-shot of a wait instead of doing
an active wait which might end up with more fences added depending on
interrupts and retries.  The dma-buf poll waits on all fences for
POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.

> > > So the way I see things right now:
> > > - exclusive fence slot is for implicit sync. kmd should only set it
> > > when userspace indicates, otherwise you will suffer. Explicit syncing
> > > userspace needs to tell the kernel with a flag in the CS ioctl when it
> > > should sync against this exclusive fence and when it should ignore it,
> > > otherwise you'll suffer badly once more.
> >
> > That is not sufficient. The explicit sync slot is for kernel internal
> > memory management.
>
> Then we need to split it. But what I discussed with Thomas Hellstrom
> is that at least for anything except p2p dma-buf ttm_bo->moving should
> be enough.

This is starting to sound like maybe roughly the right direction to me
but I'm still unclear on exactly what problem we're trying to solve
for TLB invalidates.  I'd like to understand that better before giving
strong opinions.  I'm also not super-familiar with ttm_bo->moving but
it sounds like we need some third category of fence somewhere.

--Jason


> > E.g. every access needs to sync to it and we can't allow to ignore it by
> > specifying an userspace flag.
> >
> > > - no funny tricks with not doing this when it's just internally in
> > > your driver, because the more uapi we build on top of dma-buf fd the
> > > harder this will break. amdgpu gets to keep some nasty tricks going
> > > here until appropriate uapi is finally rolled out, but should stop
> > > this asap.
> >
> > That is really not going to happen. The kernel is the only place where
> > you can do proper implicit synchronization between processes.
>
> I think you need to refute Jason's mail here with a bit more detail
> than just a claim that this is so.
>
> > > - ttm_bo->moving is the fence for stuff you're not allowed to ignore.
> > > Probably need to move that to dma_resv for p2p dma-buf, not sure on
> > > that yet.
> >
> > Well that's at least some possibility. But I would do it the other way
> > around, the exclusive fence stays what it is and you add another
> > implicit sync fence.
>
> Can we please stop with the "amdgpu is right, everyone else is wrong" approach?
>
> Like I'm pretty much going to type up the patch that does a full drm
> subsytem audit of everything and whack amdgpu into compliance. Perf
> hit be damned, you had a few years to fix this with better uapi. Or I
> find out that there's a giant inconsistent mess, but at least we'd
> gain some clarity about where exactly we are here and maybe what to do
> next.
> -Daniel
>
> >
> > Regards,
> > Christian.
> >
> > >
> > > After that I think we can look at what exact oversync issue remains
> > > and why and solve it, but until we have this this just feels like
> > > another rehash of "amgpu insist its own dma_resv interpration is the
> > > right one and everyone else should move one over".
> > >
> > > Or maybe I've just become real garbage at reading random driver code,
> > > wouldn't be the first time :-)
> > >
> > > Cheers, Daniel
> > >
> > >> Regards,
> > >> Christian.
> > >>
> > >>> Cheers, Daniel
> > >>>
> > >>>> --Jason
> > >>>>
> > >>>>
> > >>>>>> That's also the reason the Valve guys came up with a solution where each
> > >>>>>> BO gets a flag for explicit sync, but that only works for exports and
> > >>>>>> not for imports.
> > >>>>>>
> > >>>>>>> I915 and iirc msm has explicit flags for this, panfrost was designed to
> > >>>>>>> support this correctly from the start (also with flags I think). That's at
> > >>>>>>> least what I remember from all the discussions at XDC and #dri-devel, but
> > >>>>>>> didn't check the code again to give you the list of uapi flags you need
> > >>>>>>> for each driver.
> > >>>>>>>
> > >>>>>>> The other piece is making sure you're only picking up implicit fences when
> > >>>>>>> you should, and not any later ones, for which Jason has a solution:
> > >>>>>>>
> > >>>>>>> https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/
> > >>>>>> Yes, I helped with that as well. But I think that this is just another
> > >>>>>> workaround without really addressing the underlying problem.
> > >>>>>>
> > >>>>>>> If amdgpu isn't using those, then you will suffer from
> > >>>>>>> over-synchronization in vulkan and pay a price. The entire point of vulkan
> > >>>>>>> is that you pick up sync points very explicitly, and we also need to have
> > >>>>>>> very explicit uapi for userspace to pick up/set the implicit fences.
> > >>>>>>>
> > >>>>>>> Trying to paper over this with more implicit magic is imo just wrong, and
> > >>>>>>> definitely not the long term explicit sync model we want.
> > >>>>>> I completely disagree.
> > >>>>>>
> > >>>>>> In my opinion the implicit sync model we have for dma_resv currently is
> > >>>>>> just not well designed at all, since it always requires cooperation from
> > >>>>>> userspace.
> > >>>>>>
> > >>>>>> In other words you need to know when to enable implicit sync in
> > >>>>>> userspace and that information is simply not present all of the time.
> > >>>>>>
> > >>>>>> What we have done here is just keeping the old reader/writer flags i915,
> > >>>>>> radeon and nouveau once had and pushed that out to everybody else making
> > >>>>>> the assumption that everybody would follow that without documenting the
> > >>>>>> actual rules of engagement you need to follow here.
> > >>>>>>
> > >>>>>> That was a really big mistake and we should try to fix that sooner or
> > >>>>>> later. The only other clean alternative I see is to use a flag on the
> > >>>>>> exporter to tell the importer if it should sync to shared fences or not.
> > >>>>>>
> > >>>>>> Additional to that I'm perfectly fine with implicit sync. Explicit sync
> > >>>>>> certainly has some use cases as well, but I don't see it as an absolute
> > >>>>>> advantage over the implicit model.
> > >>>>> Ok this stops making sense. Somehow you claim userspace doesn't know
> > >>>>> when to sync, but somehow the kernel does? By guessing, and getting it
> > >>>>> wrong mostly, except for the one case that you benchmarked?
> > >>>>>
> > >>>>> Aside from silly userspace which exports a buffer to a dma-buf, but
> > >>>>> then never imports it anywhere else, there isn't a case I know of
> > >>>>> where the kernel actually knows more than userspace. But there's lots
> > >>>>> of cases where the kernel definitely knows less, especially if
> > >>>>> userspace doesn't tell it about what's going on with each rendering
> > >>>>> and buffer.
> > >>>>>
> > >>>>> So here's the 2 things you need to make this work like every other driver:
> > >>>>>
> > >>>>> 1. A way to set the explicit fence on a buffer. CS ioctl is perfectly
> > >>>>> fine, but also can be seperate. Userspace uses this only on a) shared
> > >>>>> buffers b) when there's a flush/swap on that shared buffer. Not when
> > >>>>> rendering any of the interim stuff, that only leads to oversync.
> > >>>>> Anything non-shared is handled explicitly in userspace (at least for
> > >>>>> modern-ish drivers). This is the only thing that ever sets an
> > >>>>> exclusive fence (aside from ttm moving buffers around ofc).
> > >>>>>
> > >>>>> 2. A way to sync with the implicit fences, either all of them (for
> > >>>>> upcoming write access) or just the write fence (for read access). At
> > >>>>> first we thought it's good enough to do this in the CS ioctl, but
> > >>>>> that's a wee bit too late, hence the patches from Jason. My
> > >>>>> understanding is that vulkan converts this into an vk syncobj/fence of
> > >>>>> some sorts, so really can't make this more explicit and intentional
> > >>>>> than that.
> > >>>>>
> > >>>>> None of this is something the kernel has the slightest idea about when
> > >>>>> it happens, so you have to have explicit uapi for it. Trying to fake
> > >>>>> it in the kernel just doesn't work.
> > >>>>> -Daniel
> > >>>>> --
> > >>>>> Daniel Vetter
> > >>>>> Software Engineer, Intel Corporation
> > >>>>> http://blog.ffwll.ch
> > >>>
> > >>> --
> > >>> Daniel Vetter
> > >>> Software Engineer, Intel Corporation
> > >>> http://blog.ffwll.ch
> > >
> >
>
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-18 22:06                   ` Jason Ekstrand
@ 2021-05-19 10:52                     ` Michel Dänzer
  2021-05-19 15:21                       ` Jason Ekstrand
  2021-05-19 11:43                     ` Christian König
  1 sibling, 1 reply; 50+ messages in thread
From: Michel Dänzer @ 2021-05-19 10:52 UTC (permalink / raw)
  To: Jason Ekstrand, Daniel Vetter
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, dri-devel

On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>>
>> On Tue, May 18, 2021 at 7:40 PM Christian König
>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>
>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>
>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
>>>>> AFAIK nouveau does the right thing as well).
>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
>>>> have to use the one atomic helpers use. Which is also the one that
>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
>>>> so as soon as that lands and someone starts using it, something has to
>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
>>>> shared with another device.
>>>
>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
>>>
>>> This doesn't works for amdgpu at all for the reasons outlined above.
>>
>> Uh that's really not how uapi works. "my driver is right, everyone
>> else is wrong" is not how cross driver contracts are defined. If that
>> means a perf impact until you've fixed your rules, that's on you.
>>
>> Also you're a few years too late with nacking this, it's already uapi
>> in the form of the dma-buf poll() support.
> 
> ^^  My fancy new ioctl doesn't expose anything that isn't already
> there.  It just lets you take a snap-shot of a wait instead of doing
> an active wait which might end up with more fences added depending on
> interrupts and retries.  The dma-buf poll waits on all fences for
> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.

Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.

Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.


-- 
Earthling Michel Dänzer               |               https://redhat.com
Libre software enthusiast             |             Mesa and X developer

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-18 21:17                 ` Daniel Vetter
  2021-05-18 22:06                   ` Jason Ekstrand
@ 2021-05-19 11:24                   ` Christian König
  2021-05-20  7:58                     ` Daniel Vetter
  1 sibling, 1 reply; 50+ messages in thread
From: Christian König @ 2021-05-19 11:24 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, dri-devel, Jason Ekstrand

Am 18.05.21 um 23:17 schrieb Daniel Vetter:
> [SNIP]
>> The problem in this case is not starting a new CS, but synchronizing to
>> the existing ones.
>>
>> See a heavy TLB flush is made completely out of sync. E.g. it doesn't
>> want to wait for any previous operation.
>>
>> In other words imagine the following example:
>> 1. Both process A and B have a BO mapped.
>> 2. Process A is heavily using the BO and doing all kind of rendering.
>> 3. Process B is unmapping the BO.
>>
>> Now that process B unmaps the BO needs to trigger page table updates and
>> a heavy TLB flush, but since this can take really long we want to do it
>> asynchronously on the hardware.
>>
>> With the current approach you basically can't do that because you can't
>> note that a fence should not participate in synchronization at all.
>>
>> E.g. we can't add a fence which doesn't wait for the exclusive one as
>> shared.
> Ok I think that's a real problem, and  guess it's also related to all
> the ttm privatization tricks and all that. So essentially we'd need
> the opposite of ttm_bo->moving, as in you can't ignore it, but
> otherwise it completely ignores all the userspace implicit fence
> stuff.

It goes in that direction, but doesn't sound like the full solution 
either.

[SNIP]
> Can we please stop with the "amdgpu is right, everyone else is wrong" approach?

Well the approach I do here is not "amdgpu is right, everyone else is 
wrong". But rather we had DRM uAPI for i915, nouveau and radeon and 
unfortunately leaked that into DMA-buf without much thinking about it.

I'm also not saying that amdgpu's approach is right. It's just what 
amdgpu needs in its CS interface.

What I'm saying is that DMA-buf is a device-driver-independent subsystem 
and we shouldn't make any assumptions, which come from just a handful of 
DRM drivers, on its implicit sync implementation.

> Like I'm pretty much going to type up the patch that does a full drm
> subsytem audit of everything and whack amdgpu into compliance. Perf
> hit be damned, you had a few years to fix this with better uapi. Or I
> find out that there's a giant inconsistent mess, but at least we'd
> gain some clarity about where exactly we are here and maybe what to do
> next.

Ok to let us move forward please take a look at the first patches of the 
set. It cleans up quite a bunch of the mess we have in there before even 
coming to adding flags to the shared slots.

I think you will agree that what we should do is clean up the use cases 
further and separate implicit sync from resource management.

In other words we forbid touching the exclusive and shared fences 
directly and have separate APIs for resource management and implicit sync.

This makes sense anyway, no matter what implicit synchronization 
framework we will install underneath.

Regards,
Christian.

> -Daniel
>
>> Regards,
>> Christian.
>>
>>> After that I think we can look at what exact oversync issue remains
>>> and why and solve it, but until we have this this just feels like
>>> another rehash of "amgpu insist its own dma_resv interpration is the
>>> right one and everyone else should move one over".
>>>
>>> Or maybe I've just become real garbage at reading random driver code,
>>> wouldn't be the first time :-)
>>>
>>> Cheers, Daniel
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Cheers, Daniel
>>>>>
>>>>>> --Jason
>>>>>>
>>>>>>
>>>>>>>> That's also the reason the Valve guys came up with a solution where each
>>>>>>>> BO gets a flag for explicit sync, but that only works for exports and
>>>>>>>> not for imports.
>>>>>>>>
>>>>>>>>> I915 and iirc msm has explicit flags for this, panfrost was designed to
>>>>>>>>> support this correctly from the start (also with flags I think). That's at
>>>>>>>>> least what I remember from all the discussions at XDC and #dri-devel, but
>>>>>>>>> didn't check the code again to give you the list of uapi flags you need
>>>>>>>>> for each driver.
>>>>>>>>>
>>>>>>>>> The other piece is making sure you're only picking up implicit fences when
>>>>>>>>> you should, and not any later ones, for which Jason has a solution:
>>>>>>>>>
>>>>>>>>> https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/
>>>>>>>> Yes, I helped with that as well. But I think that this is just another
>>>>>>>> workaround without really addressing the underlying problem.
>>>>>>>>
>>>>>>>>> If amdgpu isn't using those, then you will suffer from
>>>>>>>>> over-synchronization in vulkan and pay a price. The entire point of vulkan
>>>>>>>>> is that you pick up sync points very explicitly, and we also need to have
>>>>>>>>> very explicit uapi for userspace to pick up/set the implicit fences.
>>>>>>>>>
>>>>>>>>> Trying to paper over this with more implicit magic is imo just wrong, and
>>>>>>>>> definitely not the long term explicit sync model we want.
>>>>>>>> I completely disagree.
>>>>>>>>
>>>>>>>> In my opinion the implicit sync model we have for dma_resv currently is
>>>>>>>> just not well designed at all, since it always requires cooperation from
>>>>>>>> userspace.
>>>>>>>>
>>>>>>>> In other words you need to know when to enable implicit sync in
>>>>>>>> userspace and that information is simply not present all of the time.
>>>>>>>>
>>>>>>>> What we have done here is just keeping the old reader/writer flags i915,
>>>>>>>> radeon and nouveau once had and pushed that out to everybody else making
>>>>>>>> the assumption that everybody would follow that without documenting the
>>>>>>>> actual rules of engagement you need to follow here.
>>>>>>>>
>>>>>>>> That was a really big mistake and we should try to fix that sooner or
>>>>>>>> later. The only other clean alternative I see is to use a flag on the
>>>>>>>> exporter to tell the importer if it should sync to shared fences or not.
>>>>>>>>
>>>>>>>> Additional to that I'm perfectly fine with implicit sync. Explicit sync
>>>>>>>> certainly has some use cases as well, but I don't see it as an absolute
>>>>>>>> advantage over the implicit model.
>>>>>>> Ok this stops making sense. Somehow you claim userspace doesn't know
>>>>>>> when to sync, but somehow the kernel does? By guessing, and getting it
>>>>>>> wrong mostly, except for the one case that you benchmarked?
>>>>>>>
>>>>>>> Aside from silly userspace which exports a buffer to a dma-buf, but
>>>>>>> then never imports it anywhere else, there isn't a case I know of
>>>>>>> where the kernel actually knows more than userspace. But there's lots
>>>>>>> of cases where the kernel definitely knows less, especially if
>>>>>>> userspace doesn't tell it about what's going on with each rendering
>>>>>>> and buffer.
>>>>>>>
>>>>>>> So here's the 2 things you need to make this work like every other driver:
>>>>>>>
>>>>>>> 1. A way to set the explicit fence on a buffer. CS ioctl is perfectly
>>>>>>> fine, but also can be seperate. Userspace uses this only on a) shared
>>>>>>> buffers b) when there's a flush/swap on that shared buffer. Not when
>>>>>>> rendering any of the interim stuff, that only leads to oversync.
>>>>>>> Anything non-shared is handled explicitly in userspace (at least for
>>>>>>> modern-ish drivers). This is the only thing that ever sets an
>>>>>>> exclusive fence (aside from ttm moving buffers around ofc).
>>>>>>>
>>>>>>> 2. A way to sync with the implicit fences, either all of them (for
>>>>>>> upcoming write access) or just the write fence (for read access). At
>>>>>>> first we thought it's good enough to do this in the CS ioctl, but
>>>>>>> that's a wee bit too late, hence the patches from Jason. My
>>>>>>> understanding is that vulkan converts this into an vk syncobj/fence of
>>>>>>> some sorts, so really can't make this more explicit and intentional
>>>>>>> than that.
>>>>>>>
>>>>>>> None of this is something the kernel has the slightest idea about when
>>>>>>> it happens, so you have to have explicit uapi for it. Trying to fake
>>>>>>> it in the kernel just doesn't work.
>>>>>>> -Daniel
>>>>>>> --
>>>>>>> Daniel Vetter
>>>>>>> Software Engineer, Intel Corporation
>>>>>>> http://blog.ffwll.ch
>>>>> --
>>>>> Daniel Vetter
>>>>> Software Engineer, Intel Corporation
>>>>> http://blog.ffwll.ch
>


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-18 22:06                   ` Jason Ekstrand
  2021-05-19 10:52                     ` Michel Dänzer
@ 2021-05-19 11:43                     ` Christian König
  2021-05-19 15:35                       ` Jason Ekstrand
  1 sibling, 1 reply; 50+ messages in thread
From: Christian König @ 2021-05-19 11:43 UTC (permalink / raw)
  To: Jason Ekstrand, Daniel Vetter
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, dri-devel

Am 19.05.21 um 00:06 schrieb Jason Ekstrand:
> [SNIP]
>>> E.g. we can't add a fence which doesn't wait for the exclusive one as
>>> shared.
>> Ok I think that's a real problem, and  guess it's also related to all
>> the ttm privatization tricks and all that. So essentially we'd need
>> the opposite of ttm_bo->moving, as in you can't ignore it, but
>> otherwise it completely ignores all the userspace implicit fence
>> stuff.
> Would you mind explaining it to the rest of the class?  I get the need
> to do a TLB flush after a BO is removed from the processes address
> space and I get that it may be super-heavy and that it has to be
> delayed.  I also get that the driver needs to hold a reference to the
> underlying pages until that TLB flush is done.  What I don't get is
> what this has to do with the exclusive fence.  Why can't the driver
> just gather up all the dma_resv fences on the current object (or,
> better yet, just the ones from the current amdgpu process) and wait on
> them all?  Why does it need to insert an exclusive fence that then
> clogs up the whole works?

Because we have mixed up resource management with implicit syncing.

When I sum up all fences in (for example) a dma_fence_array container 
and add that as explicit fence to the dma_resv object, resource 
management will do what I want and wait for everything to finish before 
moving or freeing the buffer. But implicit sync will just horribly 
over-sync and wait for stuff it shouldn't wait for in the first place.

When I add the fence as shared fence I can run into the problem that the 
TLB flush might finish before the exclusive fence. Which is not allowed 
according to the DMA-buf fencing rules.

We currently have some rather crude workarounds to make use cases like 
this work as expected. E.g. by using a 
dma_fence_chain()/dma_fence_array() and/or adding the exclusive fence to 
the shared fences etc etc...

>>>>>>> Let's say that you have a buffer which is shared between two drivers A
>>>>>>> and B and let's say driver A has thrown a fence on it just to ensure
>>>>>>> that the BO doesn't get swapped out to disk until it's at a good
>>>>>>> stopping point.  Then driver B comes along and wants to throw a
>>>>>>> write-fence on it.  Suddenly, your memory fence from driver A causes
>>>>>>> driver B to have to stall waiting for a "good" time to throw in a
>>>>>>> fence.  It sounds like this is the sort of scenario that Christian is
>>>>>>> running into.  And, yes, with certain Vulkan drivers being a bit
>>>>>>> sloppy about exactly when they throw in write fences, I could see it
>>>>>>> being a real problem.
>>>>>> Yes this is a potential problem, and on the i915 side we need to do
>>>>>> some shuffling here most likely. Especially due to discrete, but the
>>>>>> problem is pre-existing. tbh I forgot about the implications here
>>>>>> until I pondered this again yesterday evening.
>>>>>>
>>>>>> But afaiui the amdgpu code and winsys in mesa, this isn't (yet) the
>>>>>> problem amd vk drivers have. The issue is that with amdgpu, all you
>>>>>> supply are the following bits at CS time:
>>>>>> - list of always mapped private buffers, which is implicit and O(1) in
>>>>>> the kernel fastpath
>>>>>> - additional list of shared buffers that are used by the current CS
>>>>>>
>>>>>> I didn't check how exactly that works wrt winsys buffer ownership, but
>>>>>> the thing is that on the kernel side _any_ buffer in there is treated
>>>>>> as a implicit sync'ed write. Which means if you render your winsys
>>>>>> with a bunch of command submission split over 3d and compute pipes,
>>>>>> you end up with horrendous amounts of oversync.
>>>>> What are you talking about? We have no sync at all for submissions from
>>>>> the same client.
>>>> Yes. Except when the buffer is shared with another driver, at which
>>>> point you sync a _lot_ and feel the pain.
>>> Yes, exactly that's the problem.
>>>
>>> We basically don't know during CS if a BO is shared or not.
>>>
>>> We do know that during importing or exporting the BO thought.
>> No you don't. Or at least that's massively awkward, see Jason's reply.
> Please.  In Vulkan, we know explicitly whether or not any BO will ever
> be shared and, if a BO is ever flagged as shared even though it's not,
> that's the app being stupid and they can eat the perf hit.

Yeah, that's not a problem at all. We already have the per BO flag in 
amdgpu for this as well.

> In GL, things are more wishy-washy but GL has so many stupid cases where we
> have to throw a buffer away and re-allocate that one more isn't going
> to be all that bad.  Even there, you could do something where you add
> an in-fence to the BO export operation so that the driver knows when
> to switch from the shared internal dma_resv to the external one
> without having to create a new BO and copy.

Hui what? What do you mean with in-fence here?

> [SNIP]
>> Yeah but why does your userspace not know when a bo is used?
> We always know when a BO is exported because we're the ones doing the
> export call.  Always.  Of course, we don't know if that BO is shared
> with another driver or re-imported back into the same one but is that
> really the case we're optimizing for?

Yes, unfortunately. Exactly that's one of the reasons we couldn't go 
with the per CS per BO flag if it should be shared or exclusive.

>> Or very bluntly, why cant radv do what anv does (or amdvlk if you care
>> more about that, it's the same)? What's missing with lots of blantant
>> lying?
> I'm also not buying this.  You keep claiming that userspace doesn't
> know but GL definitely does know and Vulkan knows well enough.  You
> say that it's motivated by Vulkan and use RADV as an example but the
> only reason why the RADV guys haven't followed the ANV design is to
> work around limitations in amdgpu.  We shouldn't then use RADV to
> justify why this is the right uAPI and why i915 is wrong.

Well, I never said that this is because of RADV. The main motivation we 
had is because of MM engines, e.g. VA-API, VDPAU and OpenMax.

And when we expose a BO with the DMA-buf functions we simply don't 
know in userspace if that is then re-imported into VA-API or sent to a 
different process.

>>> [SNIP]
>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
>>>
>>> This doesn't works for amdgpu at all for the reasons outlined above.
>> Uh that's really not how uapi works. "my driver is right, everyone
>> else is wrong" is not how cross driver contracts are defined. If that
>> means a perf impact until you've fixed your rules, that's on you.
>>
>> Also you're a few years too late with nacking this, it's already uapi
>> in the form of the dma-buf poll() support.
> ^^  My fancy new ioctl doesn't expose anything that isn't already
> there.  It just lets you take a snap-shot of a wait instead of doing
> an active wait which might end up with more fences added depending on
> interrupts and retries.  The dma-buf poll waits on all fences for
> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.

Well that's not the stuff I'm concerned about. But rather that you want 
to add that as exclusive fence from the shared ones once more.

This prevents the TLB flush case I've outlined from working correctly.

>>>> So the way I see things right now:
>>>> - exclusive fence slot is for implicit sync. kmd should only set it
>>>> when userspace indicates, otherwise you will suffer. Explicit syncing
>>>> userspace needs to tell the kernel with a flag in the CS ioctl when it
>>>> should sync against this exclusive fence and when it should ignore it,
>>>> otherwise you'll suffer badly once more.
>>> That is not sufficient. The explicit sync slot is for kernel internal
>>> memory management.
>> Then we need to split it. But what I discussed with Thomas Hellstrom
>> is that at least for anything except p2p dma-buf ttm_bo->moving should
>> be enough.
> This is starting to sound like maybe roughly the right direction to me
> but I'm still unclear on exactly what problem we're trying to solve
> for TLB invalidates.  I'd like to understand that better before giving
> strong opinions.  I'm also not super-familiar with ttm_bo->moving but
> it sounds like we need some third category of fence somewhere.

Well I would rather say that we should separate the use cases.

E.g. clear APIs for resource management vs. implicit sync.

Christian.

>
> --Jason
>


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-19 10:52                     ` Michel Dänzer
@ 2021-05-19 15:21                       ` Jason Ekstrand
  2021-05-19 15:48                         ` Michel Dänzer
  0 siblings, 1 reply; 50+ messages in thread
From: Jason Ekstrand @ 2021-05-19 15:21 UTC (permalink / raw)
  To: Michel Dänzer
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, dri-devel

On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
>
> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
> > On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >>
> >> On Tue, May 18, 2021 at 7:40 PM Christian König
> >> <ckoenig.leichtzumerken@gmail.com> wrote:
> >>>
> >>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
> >>>> On Tue, May 18, 2021 at 2:49 PM Christian König
> >>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> >>>>
> >>>>> And as long as we are all inside amdgpu we also don't have any oversync,
> >>>>> the issue only happens when we share dma-bufs with i915 (radeon and
> >>>>> AFAIK nouveau does the right thing as well).
> >>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
> >>>> have to use the one atomic helpers use. Which is also the one that
> >>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
> >>>> so as soon as that lands and someone starts using it, something has to
> >>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
> >>>> shared with another device.
> >>>
> >>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
> >>>
> >>> This doesn't works for amdgpu at all for the reasons outlined above.
> >>
> >> Uh that's really not how uapi works. "my driver is right, everyone
> >> else is wrong" is not how cross driver contracts are defined. If that
> >> means a perf impact until you've fixed your rules, that's on you.
> >>
> >> Also you're a few years too late with nacking this, it's already uapi
> >> in the form of the dma-buf poll() support.
> >
> > ^^  My fancy new ioctl doesn't expose anything that isn't already
> > there.  It just lets you take a snap-shot of a wait instead of doing
> > an active wait which might end up with more fences added depending on
> > interrupts and retries.  The dma-buf poll waits on all fences for
> > POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
>
> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
>
> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.

My new ioctl has identical semantics to poll().  It just lets you take
a snapshot in time to wait on later instead of waiting on whatever
happens to be set right now.  IMO, having identical semantics to
poll() isn't something we want to change.

--Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-19 11:43                     ` Christian König
@ 2021-05-19 15:35                       ` Jason Ekstrand
  0 siblings, 0 replies; 50+ messages in thread
From: Jason Ekstrand @ 2021-05-19 15:35 UTC (permalink / raw)
  To: Christian König
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, dri-devel

On Wed, May 19, 2021 at 6:43 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 19.05.21 um 00:06 schrieb Jason Ekstrand:
> > [SNIP]
> >>> E.g. we can't add a fence which doesn't wait for the exclusive one as
> >>> shared.
> >> Ok I think that's a real problem, and  guess it's also related to all
> >> the ttm privatization tricks and all that. So essentially we'd need
> >> the opposite of ttm_bo->moving, as in you can't ignore it, but
> >> otherwise it completely ignores all the userspace implicit fence
> >> stuff.
> > Would you mind explaining it to the rest of the class?  I get the need
> > to do a TLB flush after a BO is removed from the processes address
> > space and I get that it may be super-heavy and that it has to be
> > delayed.  I also get that the driver needs to hold a reference to the
> > underlying pages until that TLB flush is done.  What I don't get is
> > what this has to do with the exclusive fence.  Why can't the driver
> > just gather up all the dma_resv fences on the current object (or,
> > better yet, just the ones from the current amdgpu process) and wait on
> > them all?  Why does it need to insert an exclusive fence that then
> > clogs up the whole works?
>
> Because we have mixed up resource management with implicit syncing.
>
> When I sum up all fences in (for example) a dma_fence_array container
> and add that as explicit fence to the dma_resv object resource
> management will do what I want and wait for everything to finish before
> moving or freeing the buffer. But implicit sync will just horrible over
> sync and wait for stuff it shouldn't wait for in the first place.
>
> When I add the fence as shared fence I can run into the problem the the
> TLB flush might finish before the exclusive fence. Which is not allowed
> according to the DMA-buf fencing rules.

I'm starting to feel a bit dense here, sorry...  So the problem is
that the TLB flush really wants to just wait on memory management
fences and not implicit sync fences?  Or is it that you need to wait
on the exclusive fence in case it actually matters but you don't want
to if it was stuffed in there for implicit sync and doesn't have any
memory implications?  Also, how bad is it for the TLB flush to come in
late?  Is other stuff blocking on it?

> We currently have some rather crude workarounds to make use cases like
> this work as expected. E.g. by using a
> dma_fence_chain()/dma_fence_array() and/or adding the explusive fence to
> the shared fences etc etc...
>
> >>>>>>> Let's say that you have a buffer which is shared between two drivers A
> >>>>>>> and B and let's say driver A has thrown a fence on it just to ensure
> >>>>>>> that the BO doesn't get swapped out to disk until it's at a good
> >>>>>>> stopping point.  Then driver B comes along and wants to throw a
> >>>>>>> write-fence on it.  Suddenly, your memory fence from driver A causes
> >>>>>>> driver B to have to stall waiting for a "good" time to throw in a
> >>>>>>> fence.  It sounds like this is the sort of scenario that Christian is
> >>>>>>> running into.  And, yes, with certain Vulkan drivers being a bit
> >>>>>>> sloppy about exactly when they throw in write fences, I could see it
> >>>>>>> being a real problem.
> >>>>>> Yes this is a potential problem, and on the i915 side we need to do
> >>>>>> some shuffling here most likely. Especially due to discrete, but the
> >>>>>> problem is pre-existing. tbh I forgot about the implications here
> >>>>>> until I pondered this again yesterday evening.
> >>>>>>
> >>>>>> But afaiui the amdgpu code and winsys in mesa, this isn't (yet) the
> >>>>>> problem amd vk drivers have. The issue is that with amdgpu, all you
> >>>>>> supply are the following bits at CS time:
> >>>>>> - list of always mapped private buffers, which is implicit and O(1) in
> >>>>>> the kernel fastpath
> >>>>>> - additional list of shared buffers that are used by the current CS
> >>>>>>
> >>>>>> I didn't check how exactly that works wrt winsys buffer ownership, but
> >>>>>> the thing is that on the kernel side _any_ buffer in there is treated
> >>>>>> as a implicit sync'ed write. Which means if you render your winsys
> >>>>>> with a bunch of command submission split over 3d and compute pipes,
> >>>>>> you end up with horrendous amounts of oversync.
> >>>>> What are you talking about? We have no sync at all for submissions from
> >>>>> the same client.
> >>>> Yes. Except when the buffer is shared with another driver, at which
> >>>> point you sync a _lot_ and feel the pain.
> >>> Yes, exactly that's the problem.
> >>>
> >>> We basically don't know during CS if a BO is shared or not.
> >>>
> >>> We do know that during importing or exporting the BO thought.
> >> No you don't. Or at least that's massively awkward, see Jason's reply.
> > Please.  In Vulkan, we know explicitly whether or not any BO will ever
> > be shared and, if a BO is ever flagged as shared even though it's not,
> > that's the app being stupid and they can eat the perf hit.
>
> Yeah, that's not a problem at all. We already have the per BO flag in
> amdgpu for this as well.
>
> > In GL, things are more wishy-washy but GL has so many stupid cases where we
> > have to throw a buffer away and re-allocate that one more isn't going
> > to be all that bad.  Even there, you could do something where you add
> > an in-fence to the BO export operation so that the driver knows when
> > to switch from the shared internal dma_resv to the external one
> > without having to create a new BO and copy.
>
> Hui what? What do you mean with in-fence here?

I could imagine wanting to have a sort of mode-switch on BO export
rather than the userspace driver creating a new exportable buffer and
copying into it.  That would get sticky if there was any outstanding
usage of said buffer that would need to be converted to from the
internal model of usage with explicit fences to a more
dma-buf-friendly model.  I was just saying that you could add
something to BO export to give amdgpu the information it needs to do
the switch.  If you're too confused, feel free to ignore this whole
idea.

> > [SNIP]
> >> Yeah but why does your userspace not know when a bo is used?
> > We always know when a BO is exported because we're the ones doing the
> > export call.  Always.  Of course, we don't know if that BO is shared
> > with another driver or re-imported back into the same one but is that
> > really the case we're optimizing for?
>
> Yes, unfortunately. Exactly that's one of the reasons we couldn't go
> with the per CS per BO flag if it should be shared or exclusive.
>
> >> Or very bluntly, why cant radv do what anv does (or amdvlk if you care
> >> more about that, it's the same)? What's missing with lots of blantant
> >> lying?
> > I'm also not buying this.  You keep claiming that userspace doesn't
> > know but GL definitely does know and Vulkan knows well enough.  You
> > say that it's motivated by Vulkan and use RADV as an example but the
> > only reason why the RADV guys haven't followed the ANV design is to
> > work around limitations in amdgpu.  We shouldn't then use RADV to
> > justify why this is the right uAPI and why i915 is wrong.
>
> Well, I never said that this is because of RADV. The main motivation we
> had is because of MM engines, e.g. VA-API, VDPAU and OpenMax.

Sorry, RADV was mentioned many e-mails ago and this is the first time
I've seen video mentioned.  I've just been going with what I read.

If this is an internal driver over-sync issue with video, that's a
whole different card game.  The ways this is tying you in knots makes
a lot more sense to me now, I think.

> And when we expose a BO with the DMA-buf functions we simply doesn't
> know in userspace if that is then re-imported into VA-API or send to a
> different process.
>
> >>> [SNIP]
> >>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
> >>>
> >>> This doesn't works for amdgpu at all for the reasons outlined above.
> >> Uh that's really not how uapi works. "my driver is right, everyone
> >> else is wrong" is not how cross driver contracts are defined. If that
> >> means a perf impact until you've fixed your rules, that's on you.
> >>
> >> Also you're a few years too late with nacking this, it's already uapi
> >> in the form of the dma-buf poll() support.
> > ^^  My fancy new ioctl doesn't expose anything that isn't already
> > there.  It just lets you take a snap-shot of a wait instead of doing
> > an active wait which might end up with more fences added depending on
> > interrupts and retries.  The dma-buf poll waits on all fences for
> > POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
>
> Well that's not the stuff I'm concerned about. But rather that you want
> to add that as exclusive fence from the shared ones once more.
>
> This prevents the TLB flush case I've outlined from working correctly.
>
> >>>> So the way I see things right now:
> >>>> - exclusive fence slot is for implicit sync. kmd should only set it
> >>>> when userspace indicates, otherwise you will suffer. Explicit syncing
> >>>> userspace needs to tell the kernel with a flag in the CS ioctl when it
> >>>> should sync against this exclusive fence and when it should ignore it,
> >>>> otherwise you'll suffer badly once more.
> >>> That is not sufficient. The explicit sync slot is for kernel internal
> >>> memory management.
> >> Then we need to split it. But what I discussed with Thomas Hellstrom
> >> is that at least for anything except p2p dma-buf ttm_bo->moving should
> >> be enough.
> > This is starting to sound like maybe roughly the right direction to me
> > but I'm still unclear on exactly what problem we're trying to solve
> > for TLB invalidates.  I'd like to understand that better before giving
> > strong opinions.  I'm also not super-familiar with ttm_bo->moving but
> > it sounds like we need some third category of fence somewhere.
>
> Well I would rather say that we should separate the use cases.
>
> E.g. clear APIs for resource management vs. implicit sync.

No arguments there.  I'm all for separating those two use-cases.  I'm
just trying to understand the problem space.  I think I may finally be
getting there. :-)

--Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-19 15:21                       ` Jason Ekstrand
@ 2021-05-19 15:48                         ` Michel Dänzer
  2021-05-20  7:55                           ` Daniel Vetter
  0 siblings, 1 reply; 50+ messages in thread
From: Michel Dänzer @ 2021-05-19 15:48 UTC (permalink / raw)
  To: Jason Ekstrand
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, dri-devel

On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
>>
>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>
>>>> On Tue, May 18, 2021 at 7:40 PM Christian König
>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>
>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>
>>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
>>>>>>> AFAIK nouveau does the right thing as well).
>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
>>>>>> have to use the one atomic helpers use. Which is also the one that
>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
>>>>>> so as soon as that lands and someone starts using it, something has to
>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
>>>>>> shared with another device.
>>>>>
>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
>>>>>
>>>>> This doesn't works for amdgpu at all for the reasons outlined above.
>>>>
>>>> Uh that's really not how uapi works. "my driver is right, everyone
>>>> else is wrong" is not how cross driver contracts are defined. If that
>>>> means a perf impact until you've fixed your rules, that's on you.
>>>>
>>>> Also you're a few years too late with nacking this, it's already uapi
>>>> in the form of the dma-buf poll() support.
>>>
>>> ^^  My fancy new ioctl doesn't expose anything that isn't already
>>> there.  It just lets you take a snap-shot of a wait instead of doing
>>> an active wait which might end up with more fences added depending on
>>> interrupts and retries.  The dma-buf poll waits on all fences for
>>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
>>
>> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
>>
>> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
> 
> My new ioctl has identical semantics to poll().  It just lets you take
> a snapshot in time to wait on later instead of waiting on whatever
> happens to be set right now.  IMO, having identical semantics to
> poll() isn't something we want to change.

Agreed.

I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.


-- 
Earthling Michel Dänzer               |               https://redhat.com
Libre software enthusiast             |             Mesa and X developer

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-19 15:48                         ` Michel Dänzer
@ 2021-05-20  7:55                           ` Daniel Vetter
  2021-05-20  8:13                             ` Michel Dänzer
  2021-05-20 10:50                             ` Christian König
  0 siblings, 2 replies; 50+ messages in thread
From: Daniel Vetter @ 2021-05-20  7:55 UTC (permalink / raw)
  To: Michel Dänzer
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, dri-devel, Jason Ekstrand

On Wed, May 19, 2021 at 5:48 PM Michel Dänzer <michel@daenzer.net> wrote:
>
> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
> > On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
> >>
> >> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
> >>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >>>>
> >>>> On Tue, May 18, 2021 at 7:40 PM Christian König
> >>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> >>>>>
> >>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
> >>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
> >>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> >>>>>>
> >>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
> >>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
> >>>>>>> AFAIK nouveau does the right thing as well).
> >>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
> >>>>>> have to use the one atomic helpers use. Which is also the one that
> >>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
> >>>>>> so as soon as that lands and someone starts using it, something has to
> >>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
> >>>>>> shared with another device.
> >>>>>
> >>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
> >>>>>
> >>>>> This doesn't works for amdgpu at all for the reasons outlined above.
> >>>>
> >>>> Uh that's really not how uapi works. "my driver is right, everyone
> >>>> else is wrong" is not how cross driver contracts are defined. If that
> >>>> means a perf impact until you've fixed your rules, that's on you.
> >>>>
> >>>> Also you're a few years too late with nacking this, it's already uapi
> >>>> in the form of the dma-buf poll() support.
> >>>
> >>> ^^  My fancy new ioctl doesn't expose anything that isn't already
> >>> there.  It just lets you take a snap-shot of a wait instead of doing
> >>> an active wait which might end up with more fences added depending on
> >>> interrupts and retries.  The dma-buf poll waits on all fences for
> >>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
> >>
> >> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
> >>
> >> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
> >
> > My new ioctl has identical semantics to poll().  It just lets you take
> > a snapshot in time to wait on later instead of waiting on whatever
> > happens to be set right now.  IMO, having identical semantics to
> > poll() isn't something we want to change.
>
> Agreed.
>
> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.

This seems backwards, because that means useful improvements in all
other drivers are stalled until amdgpu is fixed.

I think we need agreement on what the rules are, reasonable plan to
get there, and then that should be enough to unblock work in the wider
community. Holding the community at large hostage because one driver
is different is really not great.

I've just finished the subsystem review of everything, and thus far
only found some minor bugs without practical significance. I'll fix
those and then send out a series.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-19 11:24                   ` Christian König
@ 2021-05-20  7:58                     ` Daniel Vetter
  0 siblings, 0 replies; 50+ messages in thread
From: Daniel Vetter @ 2021-05-20  7:58 UTC (permalink / raw)
  To: Christian König
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, dri-devel, Jason Ekstrand

On Wed, May 19, 2021 at 1:24 PM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 18.05.21 um 23:17 schrieb Daniel Vetter:
> > [SNIP]
> >> The problem in this case is not starting a new CS, but synchronizing to
> >> the existing ones.
> >>
> >> See a heavy TLB flush is made completely out of sync. E.g. it doesn't
> >> want to wait for any previous operation.
> >>
> >> In other words imagine the following example:
> >> 1. Both process A and B have a BO mapped.
> >> 2. Process A is heavily using the BO and doing all kind of rendering.
> >> 3. Process B is unmapping the BO.
> >>
> >> Now that process B unmaps the BO needs to trigger page table updates and
> >> a heavy TLB flush, but since this can take really long we want to do it
> >> asynchronously on the hardware.
> >>
> >> With the current approach you basically can't do that because you can't
> >> note that a fence should not participate in synchronization at all.
> >>
> >> E.g. we can't add a fence which doesn't wait for the exclusive one as
> >> shared.
> > Ok I think that's a real problem, and  guess it's also related to all
> > the ttm privatization tricks and all that. So essentially we'd need
> > the opposite of ttm_bo->moving, as in you can't ignore it, but
> > otherwise it completely ignores all the userspace implicit fence
> > stuff.
>
> It goes into that direction, but doesn't sounds like the full solution
> either.
>
> [SNIP]
> > Can we please stop with the "amdgpu is right, everyone else is wrong" approach?
>
> Well the approach I do here is not "amdgpu is right, everyone else is
> wrong". But rather we had DRM uAPI for i915, nouveau and radeon and
> unfortunately leaked that into DMA-buf without much thinking about it.
>
> I'm also not saying that the approach amdgpu is right. It's just what
> amdgpu needs in it's CS interface.
>
> What I'm saying is that DMA-buf is a device driver independent subsystem
> and we shouldn't make any assumption which come from just a handful of
> DRM driver on it's implicit sync implementation.
>
> > Like I'm pretty much going to type up the patch that does a full drm
> > subsytem audit of everything and whack amdgpu into compliance. Perf
> > hit be damned, you had a few years to fix this with better uapi. Or I
> > find out that there's a giant inconsistent mess, but at least we'd
> > gain some clarity about where exactly we are here and maybe what to do
> > next.
>
> Ok to let us move forward please take a look at the first patches of the
> set. It cleans up quite a bunch of the mess we have in there before even
> coming to adding flags to the shared slots.
>
> I think you will agree on that we should do is cleaning up the use cases
> further and separate implicit sync from resource management.

Just replying on this because I'm a bit busy with reviewing everything
we have in upstream right now.

I agree there's some useful stuff in there, but we have a fundamental
disagreement on how this works. That needs to be resolved first, and
as part of that we need to come up with a plan how to get everyone on
the same page.

Then next thing is a plan how to get the various issues you're raising
around dma_resv rules sorted out.

Once we have that, and only then, does it imo make sense to
review/merge cleanup patches. As long as we have fundamental
disagreements along the lines like we have here there's no point.

I should have a patch set maybe tomorrow or early next week with my
results of the drm subsystem review of how exactly dma_resv is used
currently. Thus far it's a few pages of code analysis, but not yet
complete. Also I found some smaller issues in a few places, so the
discussion is going to involve a few more people until we're settled
here :-/

Cheers, Daniel


> In other words we forbid touching the exclusive and shared fences
> directly and have separate APIs for resource management and implicit sync.
>
> This makes sense anyway, no matter what implicit synchronization
> framework we will install underneath.
>
> Regards,
> Christian.
>
> > -Daniel
> >
> >> Regards,
> >> Christian.
> >>
> >>> After that I think we can look at what exact oversync issue remains
> >>> and why and solve it, but until we have this this just feels like
> >>> another rehash of "amgpu insist its own dma_resv interpration is the
> >>> right one and everyone else should move one over".
> >>>
> >>> Or maybe I've just become real garbage at reading random driver code,
> >>> wouldn't be the first time :-)
> >>>
> >>> Cheers, Daniel
> >>>
> >>>> Regards,
> >>>> Christian.
> >>>>
> >>>>> Cheers, Daniel
> >>>>>
> >>>>>> --Jason
> >>>>>>
> >>>>>>
> >>>>>>>> That's also the reason the Valve guys came up with a solution where each
> >>>>>>>> BO gets a flag for explicit sync, but that only works for exports and
> >>>>>>>> not for imports.
> >>>>>>>>
> >>>>>>>>> I915 and iirc msm has explicit flags for this, panfrost was designed to
> >>>>>>>>> support this correctly from the start (also with flags I think). That's at
> >>>>>>>>> least what I remember from all the discussions at XDC and #dri-devel, but
> >>>>>>>>> didn't check the code again to give you the list of uapi flags you need
> >>>>>>>>> for each driver.
> >>>>>>>>>
> >>>>>>>>> The other piece is making sure you're only picking up implicit fences when
> >>>>>>>>> you should, and not any later ones, for which Jason has a solution:
> >>>>>>>>>
> >>>>>>>>> https://lore.kernel.org/dri-devel/20210317221940.2146688-1-jason@jlekstrand.net/
> >>>>>>>> Yes, I helped with that as well. But I think that this is just another
> >>>>>>>> workaround without really addressing the underlying problem.
> >>>>>>>>
> >>>>>>>>> If amdgpu isn't using those, then you will suffer from
> >>>>>>>>> over-synchronization in vulkan and pay a price. The entire point of vulkan
> >>>>>>>>> is that you pick up sync points very explicitly, and we also need to have
> >>>>>>>>> very explicit uapi for userspace to pick up/set the implicit fences.
> >>>>>>>>>
> >>>>>>>>> Trying to paper over this with more implicit magic is imo just wrong, and
> >>>>>>>>> definitely not the long term explicit sync model we want.
> >>>>>>>> I completely disagree.
> >>>>>>>>
> >>>>>>>> In my opinion the implicit sync model we have for dma_resv currently is
> >>>>>>>> just not well designed at all, since it always requires cooperation from
> >>>>>>>> userspace.
> >>>>>>>>
> >>>>>>>> In other words you need to know when to enable implicit sync in
> >>>>>>>> userspace and that information is simply not present all of the time.
> >>>>>>>>
> >>>>>>>> What we have done here is just keeping the old reader/writer flags i915,
> >>>>>>>> radeon and nouveau once had and pushed that out to everybody else making
> >>>>>>>> the assumption that everybody would follow that without documenting the
> >>>>>>>> actual rules of engagement you need to follow here.
> >>>>>>>>
> >>>>>>>> That was a really big mistake and we should try to fix that sooner or
> >>>>>>>> later. The only other clean alternative I see is to use a flag on the
> >>>>>>>> exporter to tell the importer if it should sync to shared fences or not.
> >>>>>>>>
> >>>>>>>> Additional to that I'm perfectly fine with implicit sync. Explicit sync
> >>>>>>>> certainly has some use cases as well, but I don't see it as an absolute
> >>>>>>>> advantage over the implicit model.
> >>>>>>> Ok this stops making sense. Somehow you claim userspace doesn't know
> >>>>>>> when to sync, but somehow the kernel does? By guessing, and getting it
> >>>>>>> wrong mostly, except for the one case that you benchmarked?
> >>>>>>>
> >>>>>>> Aside from silly userspace which exports a buffer to a dma-buf, but
> >>>>>>> then never imports it anywhere else, there isn't a case I know of
> >>>>>>> where the kernel actually knows more than userspace. But there's lots
> >>>>>>> of cases where the kernel definitely knows less, especially if
> >>>>>>> userspace doesn't tell it about what's going on with each rendering
> >>>>>>> and buffer.
> >>>>>>>
> >>>>>>> So here's the 2 things you need to make this work like every other driver:
> >>>>>>>
> >>>>>>> 1. A way to set the explicit fence on a buffer. CS ioctl is perfectly
> >>>>>>> fine, but also can be seperate. Userspace uses this only on a) shared
> >>>>>>> buffers b) when there's a flush/swap on that shared buffer. Not when
> >>>>>>> rendering any of the interim stuff, that only leads to oversync.
> >>>>>>> Anything non-shared is handled explicitly in userspace (at least for
> >>>>>>> modern-ish drivers). This is the only thing that ever sets an
> >>>>>>> exclusive fence (aside from ttm moving buffers around ofc).
> >>>>>>>
> >>>>>>> 2. A way to sync with the implicit fences, either all of them (for
> >>>>>>> upcoming write access) or just the write fence (for read access). At
> >>>>>>> first we thought it's good enough to do this in the CS ioctl, but
> >>>>>>> that's a wee bit too late, hence the patches from Jason. My
> >>>>>>> understanding is that vulkan converts this into an vk syncobj/fence of
> >>>>>>> some sorts, so really can't make this more explicit and intentional
> >>>>>>> than that.
> >>>>>>>
> >>>>>>> None of this is something the kernel has the slightest idea about when
> >>>>>>> it happens, so you have to have explicit uapi for it. Trying to fake
> >>>>>>> it in the kernel just doesn't work.
> >>>>>>> -Daniel
> >>>>>>> --
> >>>>>>> Daniel Vetter
> >>>>>>> Software Engineer, Intel Corporation
> >>>>>>> http://blog.ffwll.ch
> >>>>> --
> >>>>> Daniel Vetter
> >>>>> Software Engineer, Intel Corporation
> >>>>> http://blog.ffwll.ch
> >
>


-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-20  7:55                           ` Daniel Vetter
@ 2021-05-20  8:13                             ` Michel Dänzer
  2021-05-20 10:00                               ` Christian König
  2021-05-20 14:18                               ` Daniel Vetter
  2021-05-20 10:50                             ` Christian König
  1 sibling, 2 replies; 50+ messages in thread
From: Michel Dänzer @ 2021-05-20  8:13 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, Jason Ekstrand, dri-devel

On 2021-05-20 9:55 a.m., Daniel Vetter wrote:
> On Wed, May 19, 2021 at 5:48 PM Michel Dänzer <michel@daenzer.net> wrote:
>>
>> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
>>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
>>>>
>>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
>>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>>>
>>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König
>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>>
>>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
>>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
>>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>>>
>>>>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
>>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
>>>>>>>>> AFAIK nouveau does the right thing as well).
>>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
>>>>>>>> have to use the one atomic helpers use. Which is also the one that
>>>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
>>>>>>>> so as soon as that lands and someone starts using it, something has to
>>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
>>>>>>>> shared with another device.
>>>>>>>
>>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
>>>>>>>
>>>>>>> This doesn't works for amdgpu at all for the reasons outlined above.
>>>>>>
>>>>>> Uh that's really not how uapi works. "my driver is right, everyone
>>>>>> else is wrong" is not how cross driver contracts are defined. If that
>>>>>> means a perf impact until you've fixed your rules, that's on you.
>>>>>>
>>>>>> Also you're a few years too late with nacking this, it's already uapi
>>>>>> in the form of the dma-buf poll() support.
>>>>>
>>>>> ^^  My fancy new ioctl doesn't expose anything that isn't already
>>>>> there.  It just lets you take a snap-shot of a wait instead of doing
>>>>> an active wait which might end up with more fences added depending on
>>>>> interrupts and retries.  The dma-buf poll waits on all fences for
>>>>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
>>>>
>>>> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
>>>>
>>>> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
>>>
>>> My new ioctl has identical semantics to poll().  It just lets you take
>>> a snapshot in time to wait on later instead of waiting on whatever
>>> happens to be set right now.  IMO, having identical semantics to
>>> poll() isn't something we want to change.
>>
>> Agreed.
>>
>> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.
> 
> This seems backwards, because that means useful improvements in all
> other drivers are stalled until amdgpu is fixed.
> 
> I think we need agreement on what the rules are, reasonable plan to
> get there, and then that should be enough to unblock work in the wider
> community. Holding the community at large hostage because one driver
> is different is really not great.

I think we're in violent agreement. :) The point I was trying to make is that amdgpu really needs to be fixed to be consistent with other drivers ASAP.


-- 
Earthling Michel Dänzer               |               https://redhat.com
Libre software enthusiast             |             Mesa and X developer

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-20  8:13                             ` Michel Dänzer
@ 2021-05-20 10:00                               ` Christian König
  2021-05-20 14:18                               ` Daniel Vetter
  1 sibling, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-20 10:00 UTC (permalink / raw)
  To: Michel Dänzer, Daniel Vetter
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, Jason Ekstrand, dri-devel

Am 20.05.21 um 10:13 schrieb Michel Dänzer:
> On 2021-05-20 9:55 a.m., Daniel Vetter wrote:
>> On Wed, May 19, 2021 at 5:48 PM Michel Dänzer <michel@daenzer.net> wrote:
>>> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
>>>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
>>>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
>>>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König
>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
>>>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
>>>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>>>>
>>>>>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
>>>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
>>>>>>>>>> AFAIK nouveau does the right thing as well).
>>>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
>>>>>>>>> have to use the one atomic helpers use. Which is also the one that
>>>>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
>>>>>>>>> so as soon as that lands and someone starts using it, something has to
>>>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
>>>>>>>>> shared with another device.
>>>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
>>>>>>>>
>>>>>>>> This doesn't works for amdgpu at all for the reasons outlined above.
>>>>>>> Uh that's really not how uapi works. "my driver is right, everyone
>>>>>>> else is wrong" is not how cross driver contracts are defined. If that
>>>>>>> means a perf impact until you've fixed your rules, that's on you.
>>>>>>>
>>>>>>> Also you're a few years too late with nacking this, it's already uapi
>>>>>>> in the form of the dma-buf poll() support.
>>>>>> ^^  My fancy new ioctl doesn't expose anything that isn't already
>>>>>> there.  It just lets you take a snap-shot of a wait instead of doing
>>>>>> an active wait which might end up with more fences added depending on
>>>>>> interrupts and retries.  The dma-buf poll waits on all fences for
>>>>>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
>>>>> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
>>>>>
>>>>> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
>>>> My new ioctl has identical semantics to poll().  It just lets you take
>>>> a snapshot in time to wait on later instead of waiting on whatever
>>>> happens to be set right now.  IMO, having identical semantics to
>>>> poll() isn't something we want to change.
>>> Agreed.
>>>
>>> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.
>> This seems backwards, because that means useful improvements in all
>> other drivers are stalled until amdgpu is fixed.
>>
>> I think we need agreement on what the rules are, reasonable plan to
>> get there, and then that should be enough to unblock work in the wider
>> community. Holding the community at large hostage because one driver
>> is different is really not great.
> I think we're in violent agreement. :) The point I was trying to make is that amdgpu really needs to be fixed to be consistent with other drivers ASAP.

Well from my point of view I rather think that the rules of DMA-buf 
implicit sync should be fixed, because those are based on an ancient DRM 
approach.

And I'm seriously not accepting any changes to amdgpu involving per BO 
flags for CS.

Regards,
Christian.



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-20  7:55                           ` Daniel Vetter
  2021-05-20  8:13                             ` Michel Dänzer
@ 2021-05-20 10:50                             ` Christian König
  2021-05-20 17:23                               ` Jason Ekstrand
  1 sibling, 1 reply; 50+ messages in thread
From: Christian König @ 2021-05-20 10:50 UTC (permalink / raw)
  To: Daniel Vetter, Michel Dänzer
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, dri-devel, Jason Ekstrand

Am 20.05.21 um 09:55 schrieb Daniel Vetter:
> On Wed, May 19, 2021 at 5:48 PM Michel Dänzer <michel@daenzer.net> wrote:
>> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
>>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
>>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
>>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König
>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
>>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
>>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>>>
>>>>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
>>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
>>>>>>>>> AFAIK nouveau does the right thing as well).
>>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
>>>>>>>> have to use the one atomic helpers use. Which is also the one that
>>>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
>>>>>>>> so as soon as that lands and someone starts using it, something has to
>>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
>>>>>>>> shared with another device.
>>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
>>>>>>>
>>>>>>> This doesn't works for amdgpu at all for the reasons outlined above.
>>>>>> Uh that's really not how uapi works. "my driver is right, everyone
>>>>>> else is wrong" is not how cross driver contracts are defined. If that
>>>>>> means a perf impact until you've fixed your rules, that's on you.
>>>>>>
>>>>>> Also you're a few years too late with nacking this, it's already uapi
>>>>>> in the form of the dma-buf poll() support.
>>>>> ^^  My fancy new ioctl doesn't expose anything that isn't already
>>>>> there.  It just lets you take a snap-shot of a wait instead of doing
>>>>> an active wait which might end up with more fences added depending on
>>>>> interrupts and retries.  The dma-buf poll waits on all fences for
>>>>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
>>>> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
>>>>
>>>> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
>>> My new ioctl has identical semantics to poll().  It just lets you take
>>> a snapshot in time to wait on later instead of waiting on whatever
>>> happens to be set right now.  IMO, having identical semantics to
>>> poll() isn't something we want to change.
>> Agreed.
>>
>> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.
> This seems backwards, because that means useful improvements in all
> other drivers are stalled until amdgpu is fixed.

Well there is nothing to fix in amdgpu; what we need to do is come up 
with a DMA-buf implicit syncing model which works for everyone.

I've pointed this problem out at FOSDEM roughly 6 years ago, before 
DMA-buf was even merged upstream and way before amdgpu even existed. And 
the response was yeah, maybe we need to look at this as well.

Over the years I've mentioned now at least 5 times that this isn't going 
to work in some situations and came up with different approaches how to 
fix it.

And you still have the nerve to tell me that this isn't a problem and 
we should fix amdgpu instead? Sorry, but I'm really running out of ideas 
for how to explain why this isn't working for everybody.

That amdgpu wants to be special is true, but it is a fundamental problem 
that we have designed the implicit sync in DMA-buf only around the needs 
of DRM drivers at that time instead of going a step back and saying hey 
what would be an approach which works for everyone.

You just need to apply my example from FOSDEM with ring buffers in a 
single BO to the DMA-buf implicit sync model and immediately see how it 
falls apart.

> I think we need agreement on what the rules are, reasonable plan to
> get there, and then that should be enough to unblock work in the wider
> community. Holding the community at large hostage because one driver
> is different is really not great.

Well forcing drivers into a synchronization model not ideal for their 
hardware isn't great either.

The patches I provided at least clean up the naming convention and 
provide clean interfaces for iterating over the shared fence container. 
On top of that use case driven APIs can be implemented.

And yes I'm perfectly aware that this means that we need to touch all 
drivers and memory management handlers, but I'm pretty sure that 
untangling implicit synchronization from resource management is worth 
the effort no matter what changes to the sync model we are going to do.

Regards,
Christian.


> I've just finished the subsystem review of everything, and thus far
> only found some minor bugs without practical significance. I'll fix
> those and then send out a series.
> -Daniel


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-20  8:13                             ` Michel Dänzer
  2021-05-20 10:00                               ` Christian König
@ 2021-05-20 14:18                               ` Daniel Vetter
  2021-05-20 14:30                                 ` Michel Dänzer
                                                   ` (2 more replies)
  1 sibling, 3 replies; 50+ messages in thread
From: Daniel Vetter @ 2021-05-20 14:18 UTC (permalink / raw)
  To: Michel Dänzer
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, dri-devel, Jason Ekstrand

On Thu, May 20, 2021 at 10:13:38AM +0200, Michel Dänzer wrote:
> On 2021-05-20 9:55 a.m., Daniel Vetter wrote:
> > On Wed, May 19, 2021 at 5:48 PM Michel Dänzer <michel@daenzer.net> wrote:
> >>
> >> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
> >>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
> >>>>
> >>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
> >>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >>>>>>
> >>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König
> >>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> >>>>>>>
> >>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
> >>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
> >>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> >>>>>>>>
> >>>>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
> >>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
> >>>>>>>>> AFAIK nouveau does the right thing as well).
> >>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
> >>>>>>>> have to use the one atomic helpers use. Which is also the one that
> >>>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
> >>>>>>>> so as soon as that lands and someone starts using it, something has to
> >>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
> >>>>>>>> shared with another device.
> >>>>>>>
> >>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
> >>>>>>>
> >>>>>>> This doesn't works for amdgpu at all for the reasons outlined above.
> >>>>>>
> >>>>>> Uh that's really not how uapi works. "my driver is right, everyone
> >>>>>> else is wrong" is not how cross driver contracts are defined. If that
> >>>>>> means a perf impact until you've fixed your rules, that's on you.
> >>>>>>
> >>>>>> Also you're a few years too late with nacking this, it's already uapi
> >>>>>> in the form of the dma-buf poll() support.
> >>>>>
> >>>>> ^^  My fancy new ioctl doesn't expose anything that isn't already
> >>>>> there.  It just lets you take a snap-shot of a wait instead of doing
> >>>>> an active wait which might end up with more fences added depending on
> >>>>> interrupts and retries.  The dma-buf poll waits on all fences for
> >>>>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
> >>>>
> >>>> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
> >>>>
> >>>> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
> >>>
> >>> My new ioctl has identical semantics to poll().  It just lets you take
> >>> a snapshot in time to wait on later instead of waiting on whatever
> >>> happens to be set right now.  IMO, having identical semantics to
> >>> poll() isn't something we want to change.
> >>
> >> Agreed.
> >>
> >> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.
> > 
> > This seems backwards, because that means useful improvements in all
> > other drivers are stalled until amdgpu is fixed.
> > 
> > I think we need agreement on what the rules are, reasonable plan to
> > get there, and then that should be enough to unblock work in the wider
> > community. Holding the community at large hostage because one driver
> > is different is really not great.
> 
> I think we're in violent agreement. :) The point I was trying to make is
> that amdgpu really needs to be fixed to be consistent with other drivers
> ASAP.

It's not that easy at all. I think best case we're looking at about a one
year plan to get this into shape, taking into account usual release/distro
update latencies.

Best case.

But also it's not a really big issue, since this shouldn't stop
compositors from using poll on dma-buf fd or the sync_file stuff from
Jason: The use-case for this in compositors is to avoid a single client
stalling the entire desktop. If a driver lies by not setting the exclusive
fence when expected, you simply don't get this stall avoidance benefit in
the face of misbehaving clients. But also this needs a gpu scheduler and higher
priority for the compositor (or a lot of hw planes so you can composite
with them alone), so it's all a fairly academic issue.

IOW, amdgpu being different on these won't cause any actual issues I think.

The only case that does break is when the compositor does an mmap on the
dma-buf fd and relies on poll to indicate when the rendering is done. Not
even sure amdgpu supports mmap on dma-buf or not. That's the only case I
could think of which would result in actual corruption anywhere, and not
just stalls when no one expects them.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-20 14:18                               ` Daniel Vetter
@ 2021-05-20 14:30                                 ` Michel Dänzer
  2021-05-20 17:08                                 ` Jason Ekstrand
  2021-05-31 12:49                                 ` Michel Dänzer
  2 siblings, 0 replies; 50+ messages in thread
From: Michel Dänzer @ 2021-05-20 14:30 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, Jason Ekstrand, dri-devel

On 2021-05-20 4:18 p.m., Daniel Vetter wrote:
> On Thu, May 20, 2021 at 10:13:38AM +0200, Michel Dänzer wrote:
>> On 2021-05-20 9:55 a.m., Daniel Vetter wrote:
>>> On Wed, May 19, 2021 at 5:48 PM Michel Dänzer <michel@daenzer.net> wrote:
>>>>
>>>> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
>>>>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
>>>>>>
>>>>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
>>>>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>>>>>
>>>>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König
>>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>>>>
>>>>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
>>>>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>>>>>
>>>>>>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
>>>>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
>>>>>>>>>>> AFAIK nouveau does the right thing as well).
>>>>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
>>>>>>>>>> have to use the one atomic helpers use. Which is also the one that
>>>>>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
>>>>>>>>>> so as soon as that lands and someone starts using it, something has to
>>>>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
>>>>>>>>>> shared with another device.
>>>>>>>>>
>>>>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
>>>>>>>>>
>>>>>>>>> This doesn't works for amdgpu at all for the reasons outlined above.
>>>>>>>>
>>>>>>>> Uh that's really not how uapi works. "my driver is right, everyone
>>>>>>>> else is wrong" is not how cross driver contracts are defined. If that
>>>>>>>> means a perf impact until you've fixed your rules, that's on you.
>>>>>>>>
>>>>>>>> Also you're a few years too late with nacking this, it's already uapi
>>>>>>>> in the form of the dma-buf poll() support.
>>>>>>>
>>>>>>> ^^  My fancy new ioctl doesn't expose anything that isn't already
>>>>>>> there.  It just lets you take a snap-shot of a wait instead of doing
>>>>>>> an active wait which might end up with more fences added depending on
>>>>>>> interrupts and retries.  The dma-buf poll waits on all fences for
>>>>>>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
>>>>>>
>>>>>> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
>>>>>>
>>>>>> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
>>>>>
>>>>> My new ioctl has identical semantics to poll().  It just lets you take
>>>>> a snapshot in time to wait on later instead of waiting on whatever
>>>>> happens to be set right now.  IMO, having identical semantics to
>>>>> poll() isn't something we want to change.
>>>>
>>>> Agreed.
>>>>
>>>> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.
>>>
>>> This seems backwards, because that means useful improvements in all
>>> other drivers are stalled until amdgpu is fixed.
>>>
>>> I think we need agreement on what the rules are, reasonable plan to
>>> get there, and then that should be enough to unblock work in the wider
>>> community. Holding the community at large hostage because one driver
>>> is different is really not great.
>>
>> I think we're in violent agreement. :) The point I was trying to make is
>> that amdgpu really needs to be fixed to be consistent with other drivers
>> ASAP.
> 
> It's not that easy at all. I think best case we're looking at about a one
> year plan to get this into shape, taking into account usual release/distro
> update latencies.
> 
> Best case.
> 
> But also it's not a really big issue, since this shouldn't stop
> compositors from using poll on dma-buf fd or the sync_file stuff from
> Jason: The use-case for this in compositors is to avoid a single client
> stalling the entire desktop. If a driver lies by not setting the exclusive
> fence when expected, you simply don't get this stall avoidance benefit of
> misbehaving clients. But also this needs a gpu scheduler and higher
> priority for the compositor (or a lot of hw planes so you can composite
> with them alone), so it's all fairly academic issue.

AFAIK current AMD GPUs have everything needed to make this work, it would just need to be hooked up for high priority EGL contexts (which at least mutter already uses when available) in amdgpu & radeonsi.


-- 
Earthling Michel Dänzer               |               https://redhat.com
Libre software enthusiast             |             Mesa and X developer

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-20 14:18                               ` Daniel Vetter
  2021-05-20 14:30                                 ` Michel Dänzer
@ 2021-05-20 17:08                                 ` Jason Ekstrand
  2021-05-31 12:49                                 ` Michel Dänzer
  2 siblings, 0 replies; 50+ messages in thread
From: Jason Ekstrand @ 2021-05-20 17:08 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, Michel Dänzer, dri-devel

On Thu, May 20, 2021 at 9:18 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Thu, May 20, 2021 at 10:13:38AM +0200, Michel Dänzer wrote:
> > On 2021-05-20 9:55 a.m., Daniel Vetter wrote:
> > > On Wed, May 19, 2021 at 5:48 PM Michel Dänzer <michel@daenzer.net> wrote:
> > >>
> > >> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
> > >>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
> > >>>>
> > >>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
> > >>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >>>>>>
> > >>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König
> > >>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > >>>>>>>
> > >>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
> > >>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
> > >>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > >>>>>>>>
> > >>>>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
> > >>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
> > >>>>>>>>> AFAIK nouveau does the right thing as well).
> > >>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
> > >>>>>>>> have to use the one atomic helpers use. Which is also the one that
> > >>>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
> > >>>>>>>> so as soon as that lands and someone starts using it, something has to
> > >>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
> > >>>>>>>> shared with another device.
> > >>>>>>>
> > >>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
> > >>>>>>>
> > >>>>>>> This doesn't works for amdgpu at all for the reasons outlined above.
> > >>>>>>
> > >>>>>> Uh that's really not how uapi works. "my driver is right, everyone
> > >>>>>> else is wrong" is not how cross driver contracts are defined. If that
> > >>>>>> means a perf impact until you've fixed your rules, that's on you.
> > >>>>>>
> > >>>>>> Also you're a few years too late with nacking this, it's already uapi
> > >>>>>> in the form of the dma-buf poll() support.
> > >>>>>
> > >>>>> ^^  My fancy new ioctl doesn't expose anything that isn't already
> > >>>>> there.  It just lets you take a snap-shot of a wait instead of doing
> > >>>>> an active wait which might end up with more fences added depending on
> > >>>>> interrupts and retries.  The dma-buf poll waits on all fences for
> > >>>>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
> > >>>>
> > >>>> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
> > >>>>
> > >>>> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
> > >>>
> > >>> My new ioctl has identical semantics to poll().  It just lets you take
> > >>> a snapshot in time to wait on later instead of waiting on whatever
> > >>> happens to be set right now.  IMO, having identical semantics to
> > >>> poll() isn't something we want to change.
> > >>
> > >> Agreed.
> > >>
> > >> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.
> > >
> > > This seems backwards, because that means useful improvements in all
> > > other drivers are stalled until amdgpu is fixed.
> > >
> > > I think we need agreement on what the rules are, reasonable plan to
> > > get there, and then that should be enough to unblock work in the wider
> > > community. Holding the community at large hostage because one driver
> > > is different is really not great.
> >
> > I think we're in violent agreement. :) The point I was trying to make is
> > that amdgpu really needs to be fixed to be consistent with other drivers
> > ASAP.
>
> It's not that easy at all. I think best case we're looking at about a one
> year plan to get this into shape, taking into account usual release/distro
> update latencies.
>
> Best case.
>
> But also it's not a really big issue, since this shouldn't stop
> compositors from using poll on dma-buf fd or the sync_file stuff from
> Jason: The use-case for this in compositors is to avoid a single client
> stalling the entire desktop. If a driver lies by not setting the exclusive
> fence when expected, you simply don't get this stall avoidance benefit of
> misbehaving clients. But also this needs a gpu scheduler and higher
> priority for the compositor (or a lot of hw planes so you can composite
> with them alone), so it's all fairly academic issue.

That's not really the use-case.... I mean, that is one potential
use-case.  But the real intention is to provide a mechanism for
allowing explicit sync apps to live in an implicit sync world.  For
instance, with that ioctl, you could write an entirely explicit sync
compositor and just snag sync_files from any dma-bufs you get from
clients that don't support whatever your window system's explicit sync
protocol is.  It only works in the one direction, sadly, but I don't
see a good safe way to make the other direction work without snagging
a fence from the final submit which draws to the image.

--Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-20 10:50                             ` Christian König
@ 2021-05-20 17:23                               ` Jason Ekstrand
  2021-05-20 19:04                                 ` Jason Ekstrand
  2021-05-21  7:24                                 ` Christian König
  0 siblings, 2 replies; 50+ messages in thread
From: Jason Ekstrand @ 2021-05-20 17:23 UTC (permalink / raw)
  To: Christian König
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, Michel Dänzer,
	dri-devel

On Thu, May 20, 2021 at 5:50 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 20.05.21 um 09:55 schrieb Daniel Vetter:
> > On Wed, May 19, 2021 at 5:48 PM Michel Dänzer <michel@daenzer.net> wrote:
> >> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
> >>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
> >>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
> >>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König
> >>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> >>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
> >>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
> >>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> >>>>>>>>
> >>>>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
> >>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
> >>>>>>>>> AFAIK nouveau does the right thing as well).
> >>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
> >>>>>>>> have to use the one atomic helpers use. Which is also the one that
> >>>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
> >>>>>>>> so as soon as that lands and someone starts using it, something has to
> >>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
> >>>>>>>> shared with another device.
> >>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
> >>>>>>>
> >>>>>>> This doesn't works for amdgpu at all for the reasons outlined above.
> >>>>>> Uh that's really not how uapi works. "my driver is right, everyone
> >>>>>> else is wrong" is not how cross driver contracts are defined. If that
> >>>>>> means a perf impact until you've fixed your rules, that's on you.
> >>>>>>
> >>>>>> Also you're a few years too late with nacking this, it's already uapi
> >>>>>> in the form of the dma-buf poll() support.
> >>>>> ^^  My fancy new ioctl doesn't expose anything that isn't already
> >>>>> there.  It just lets you take a snap-shot of a wait instead of doing
> >>>>> an active wait which might end up with more fences added depending on
> >>>>> interrupts and retries.  The dma-buf poll waits on all fences for
> >>>>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
> >>>> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
> >>>>
> >>>> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
> >>> My new ioctl has identical semantics to poll().  It just lets you take
> >>> a snapshot in time to wait on later instead of waiting on whatever
> >>> happens to be set right now.  IMO, having identical semantics to
> >>> poll() isn't something we want to change.
> >> Agreed.
> >>
> >> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.
> > This seems backwards, because that means useful improvements in all
> > other drivers are stalled until amdgpu is fixed.
>
> Well there is nothing to fix in amdgpu, what we need to is to come up
> with an DMA-buf implicit syncing model which works for everyone.
>
> I've pointed this problem out at FOSDEM roughly 6 years ago, before
> DMA-buf was even merged upstream and way before amdgpu even existed. And
> the response was yeah, maybe we need to look at this as well.
>
> Over the years I've mentioned now at least 5 times that this isn't going
> to work in some situations and came up with different approaches how to
> fix it.
>
> And you still have the nerves to tell me that this isn't a problem and
> we should fix amdgpu instead? Sorry, but I'm really running out of ideas
> how to explain why this isn't working for everybody.

I'm trying really hard to not fuel a flame war here but I tend to lean
Daniel's direction on this.  Stepping back from the individual needs
of amdgpu and looking at things from the PoV of Linux as a whole, AMD
being a special snowflake here is bad.  I think we have two problems:
amdgpu doesn't play by the established rules, and the rules don't work
well for amdgpu.  We need to solve BOTH problems.  Does that mean we
need to smash something into amdgpu to force it into the dma-buf model
today?  Maybe not; stuff's working well enough, I guess.  But we can't
just rewrite all the rules and break everyone else either.

> That amdgpu wants to be special is true, but it is a fundamental problem
> that we have designed the implicit sync in DMA-buf only around the needs
> of DRM drivers at that time instead of going a step back and saying hey
> what would be an approach which works for everyone.

How else was it supposed to be designed?  Based on the needs of
non-existent future drivers?  That's just not fair.  We (Intel) are
being burned by various aspects of dma-buf these days too.  It does no
good to blame past developers or our past selves for not knowing the
future.  It sucks but it's what we have.  And, to move forward, we
need to fix it.  Let's do that.

My concern with the flags approach as I'm beginning to digest it is
that it's a bit too much of an attempt to rewrite history for my
liking.  What do I mean by that?  I mean that any solution we come up
with needs to ensure that legacy drivers and modern drivers can play
nicely together.  Either that or we need to modernize all the users of
dma-buf implicit sync.  I really don't like the "as long as AMD+Intel
works, we're good" approach.

> You just need to apply my example from FOSDEM with ring buffers in a
> single BO to the DMA-buf implicit sync model and immediately see how it
> falls apart.
>
> > I think we need agreement on what the rules are, reasonable plan to
> > get there, and then that should be enough to unblock work in the wider
> > community. Holding the community at large hostage because one driver
> > is different is really not great.
>
> Well forcing a drivers into a synchronization model not ideal for their
> hardware isn't great either.

As I said above, we're feeling the pain too.

--Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-20 17:23                               ` Jason Ekstrand
@ 2021-05-20 19:04                                 ` Jason Ekstrand
  2021-05-20 19:14                                   ` Daniel Vetter
  2021-05-21  7:24                                 ` Christian König
  1 sibling, 1 reply; 50+ messages in thread
From: Jason Ekstrand @ 2021-05-20 19:04 UTC (permalink / raw)
  To: Christian König
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, Michel Dänzer,
	dri-devel

On Thu, May 20, 2021 at 12:23 PM Jason Ekstrand <jason@jlekstrand.net> wrote:
>
> On Thu, May 20, 2021 at 5:50 AM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
> >
> > Am 20.05.21 um 09:55 schrieb Daniel Vetter:
> > > On Wed, May 19, 2021 at 5:48 PM Michel Dänzer <michel@daenzer.net> wrote:
> > >> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
> > >>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
> > >>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
> > >>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König
> > >>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > >>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
> > >>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
> > >>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > >>>>>>>>
> > >>>>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
> > >>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
> > >>>>>>>>> AFAIK nouveau does the right thing as well).
> > >>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
> > >>>>>>>> have to use the one atomic helpers use. Which is also the one that
> > >>>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
> > >>>>>>>> so as soon as that lands and someone starts using it, something has to
> > >>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
> > >>>>>>>> shared with another device.
> > >>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.

I just re-sent my dma-buf sync_file import/export series.  Assuming we
can sort out what implicit sync looks like on the inside of dma-buf,
would that alleviate some of your uAPI fears?  The idea would be that
radeonsi and RADV would use amdgpu explicit sync primitives for
everything and then, at the very end, fetch a sync_file and stuff it
in the dma-buf's implicit sync container.  No nasty new uAPI for you.
We still get implicit sync.  Everyone wins?

Of course, that still leaves the question of what read and write
fences are, what they mean, and where they go in the dma_resv.  But
I'm trying to separate problems here.

--Jason


> > >>>>>>> This doesn't works for amdgpu at all for the reasons outlined above.
> > >>>>>> Uh that's really not how uapi works. "my driver is right, everyone
> > >>>>>> else is wrong" is not how cross driver contracts are defined. If that
> > >>>>>> means a perf impact until you've fixed your rules, that's on you.
> > >>>>>>
> > >>>>>> Also you're a few years too late with nacking this, it's already uapi
> > >>>>>> in the form of the dma-buf poll() support.
> > >>>>> ^^  My fancy new ioctl doesn't expose anything that isn't already
> > >>>>> there.  It just lets you take a snap-shot of a wait instead of doing
> > >>>>> an active wait which might end up with more fences added depending on
> > >>>>> interrupts and retries.  The dma-buf poll waits on all fences for
> > >>>>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
> > >>>> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
> > >>>>
> > >>>> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
> > >>> My new ioctl has identical semantics to poll().  It just lets you take
> > >>> a snapshot in time to wait on later instead of waiting on whatever
> > >>> happens to be set right now.  IMO, having identical semantics to
> > >>> poll() isn't something we want to change.
> > >> Agreed.
> > >>
> > >> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.
> > > This seems backwards, because that means useful improvements in all
> > > other drivers are stalled until amdgpu is fixed.
> >
> > Well there is nothing to fix in amdgpu, what we need to is to come up
> > with an DMA-buf implicit syncing model which works for everyone.
> >
> > I've pointed this problem out at FOSDEM roughly 6 years ago, before
> > DMA-buf was even merged upstream and way before amdgpu even existed. And
> > the response was yeah, maybe we need to look at this as well.
> >
> > Over the years I've mentioned now at least 5 times that this isn't going
> > to work in some situations and came up with different approaches how to
> > fix it.
> >
> > And you still have the nerves to tell me that this isn't a problem and
> > we should fix amdgpu instead? Sorry, but I'm really running out of ideas
> > how to explain why this isn't working for everybody.
>
> I'm trying really hard to not fuel a flame war here but I tend to lean
> Daniel's direction on this.  Stepping back from the individual needs
> of amdgpu and looking at things from the PoV of Linux as a whole, AMD
> being a special snowflake here is bad.  I think we have two problems:
> amdgpu doesn't play by the established rules, and the rules don't work
> well for amdgpu.  We need to solve BOTH problems.  Does that mean we
> need to smash something into amdgpu to force it into the dma-buf model
> today?  Maybe not; stuff's working well enough, I guess.  But we can't
> just rewrite all the rules and break everyone else either.
>
> > That amdgpu wants to be special is true, but it is a fundamental problem
> > that we have designed the implicit sync in DMA-buf only around the needs
> > of DRM drivers at that time instead of going a step back and saying hey
> > what would be an approach which works for everyone.
>
> How else was it supposed to be designed?  Based on the needs of
> non-existent future drivers?  That's just not fair.  We (Intel) are
> being burned by various aspects of dma-buf these days too.  It does no
> good to blame past developers or our past selves for not knowing the
> future.  It sucks but it's what we have.  And, to move forward, we
> need to fix it.  Let's do that.
>
> My concern with the flags approach as I'm beginning to digest it is
> that it's a bit too much of an attempt to rewrite history for my
> liking.  What do I mean by that?  I mean that any solution we come up
> with needs ensure that legacy drivers and modern drivers can play
> nicely together.  Either that or we need to modernize all the users of
> dma-buf implicit sync.  I really don't like the "as long as AMD+Intel
> works, we're good" approach.
>
> > You just need to apply my example from FOSDEM with ring buffers in a
> > single BO to the DMA-buf implicit sync model and immediately see how it
> > falls apart.
> >
> > > I think we need agreement on what the rules are, reasonable plan to
> > > get there, and then that should be enough to unblock work in the wider
> > > community. Holding the community at large hostage because one driver
> > > is different is really not great.
> >
> > Well forcing a drivers into a synchronization model not ideal for their
> > hardware isn't great either.
>
> As I said above, we're feeling the pain too.
>
> --Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-20 19:04                                 ` Jason Ekstrand
@ 2021-05-20 19:14                                   ` Daniel Vetter
  2021-05-21  7:27                                     ` Christian König
  2021-05-21  9:36                                     ` Bas Nieuwenhuizen
  0 siblings, 2 replies; 50+ messages in thread
From: Daniel Vetter @ 2021-05-20 19:14 UTC (permalink / raw)
  To: Jason Ekstrand
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, Michel Dänzer, dri-devel

On Thu, May 20, 2021 at 9:04 PM Jason Ekstrand <jason@jlekstrand.net> wrote:
>
> On Thu, May 20, 2021 at 12:23 PM Jason Ekstrand <jason@jlekstrand.net> wrote:
> >
> > On Thu, May 20, 2021 at 5:50 AM Christian König
> > <ckoenig.leichtzumerken@gmail.com> wrote:
> > >
> > > Am 20.05.21 um 09:55 schrieb Daniel Vetter:
> > > > On Wed, May 19, 2021 at 5:48 PM Michel Dänzer <michel@daenzer.net> wrote:
> > > >> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
> > > >>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
> > > >>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
> > > >>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > >>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König
> > > >>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > > >>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
> > > >>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
> > > >>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > > >>>>>>>>
> > > >>>>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
> > > >>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
> > > >>>>>>>>> AFAIK nouveau does the right thing as well).
> > > >>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
> > > >>>>>>>> have to use the one atomic helpers use. Which is also the one that
> > > >>>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
> > > >>>>>>>> so as soon as that lands and someone starts using it, something has to
> > > >>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
> > > >>>>>>>> shared with another device.
> > > >>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
>
> I just re-sent my dma-buf sync_file import/export series.  Assuming we
> can sort out what implicit sync looks like on the inside of dma-buf,
> would that alleviate some of your uAPI fears?  The idea would be that
> radeonsi and RADV would use amdgpu explicit sync primitives for
> everything and then, at the very end, fetch a sync_file and stuff it
> in the dma-buf's implicit sync container.  No nasty new uAPI for you.
> We still get implicit sync.  Everyone wins?

You still need the implicit fencing opt-out, which currently amdgpu
lacks completely.

But I also thought through the security implications of the patch set
(including the exclusive injection patch 4), and I think even with
current amdgpu that's perfectly fine. Not very useful since the fences
you get out aren't reflecting status accurately, but that's not a
correctness/security issue. You'll simply hit stalls when you don't
expect, because the kernel is allowed to throw random other exclusive
fences in whenever it feels like.

> Of course, that still leaves the question of what read and write
> fences are, what they mean, and where they go in the dma_resv.  But
> I'm trying to separate problems here.

Yeah I'll dump my patch set for clarifying status quo tomorrow for that.
-Daniel

>
> --Jason
>
>
> > > >>>>>>> This doesn't works for amdgpu at all for the reasons outlined above.
> > > >>>>>> Uh that's really not how uapi works. "my driver is right, everyone
> > > >>>>>> else is wrong" is not how cross driver contracts are defined. If that
> > > >>>>>> means a perf impact until you've fixed your rules, that's on you.
> > > >>>>>>
> > > >>>>>> Also you're a few years too late with nacking this, it's already uapi
> > > >>>>>> in the form of the dma-buf poll() support.
> > > >>>>> ^^  My fancy new ioctl doesn't expose anything that isn't already
> > > >>>>> there.  It just lets you take a snap-shot of a wait instead of doing
> > > >>>>> an active wait which might end up with more fences added depending on
> > > >>>>> interrupts and retries.  The dma-buf poll waits on all fences for
> > > >>>>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
> > > >>>> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
> > > >>>>
> > > >>>> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
> > > >>> My new ioctl has identical semantics to poll().  It just lets you take
> > > >>> a snapshot in time to wait on later instead of waiting on whatever
> > > >>> happens to be set right now.  IMO, having identical semantics to
> > > >>> poll() isn't something we want to change.
> > > >> Agreed.
> > > >>
> > > >> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.
> > > > This seems backwards, because that means useful improvements in all
> > > > other drivers are stalled until amdgpu is fixed.
> > >
> > > Well there is nothing to fix in amdgpu, what we need to is to come up
> > > with an DMA-buf implicit syncing model which works for everyone.
> > >
> > > I've pointed this problem out at FOSDEM roughly 6 years ago, before
> > > DMA-buf was even merged upstream and way before amdgpu even existed. And
> > > the response was yeah, maybe we need to look at this as well.
> > >
> > > Over the years I've mentioned now at least 5 times that this isn't going
> > > to work in some situations and came up with different approaches how to
> > > fix it.
> > >
> > > And you still have the nerves to tell me that this isn't a problem and
> > > we should fix amdgpu instead? Sorry, but I'm really running out of ideas
> > > how to explain why this isn't working for everybody.
> >
> > I'm trying really hard to not fuel a flame war here but I tend to lean
> > Daniel's direction on this.  Stepping back from the individual needs
> > of amdgpu and looking at things from the PoV of Linux as a whole, AMD
> > being a special snowflake here is bad.  I think we have two problems:
> > amdgpu doesn't play by the established rules, and the rules don't work
> > well for amdgpu.  We need to solve BOTH problems.  Does that mean we
> > need to smash something into amdgpu to force it into the dma-buf model
> > today?  Maybe not; stuff's working well enough, I guess.  But we can't
> > just rewrite all the rules and break everyone else either.
> >
> > > That amdgpu wants to be special is true, but it is a fundamental problem
> > > that we have designed the implicit sync in DMA-buf only around the needs
> > > of DRM drivers at that time instead of going a step back and saying hey
> > > what would be an approach which works for everyone.
> >
> > How else was it supposed to be designed?  Based on the needs of
> > non-existent future drivers?  That's just not fair.  We (Intel) are
> > being burned by various aspects of dma-buf these days too.  It does no
> > good to blame past developers or our past selves for not knowing the
> > future.  It sucks but it's what we have.  And, to move forward, we
> > need to fix it.  Let's do that.
> >
> > My concern with the flags approach as I'm beginning to digest it is
> > that it's a bit too much of an attempt to rewrite history for my
> > liking.  What do I mean by that?  I mean that any solution we come up
> > with needs ensure that legacy drivers and modern drivers can play
> > nicely together.  Either that or we need to modernize all the users of
> > dma-buf implicit sync.  I really don't like the "as long as AMD+Intel
> > works, we're good" approach.
> >
> > > You just need to apply my example from FOSDEM with ring buffers in a
> > > single BO to the DMA-buf implicit sync model and immediately see how it
> > > falls apart.
> > >
> > > > I think we need agreement on what the rules are, reasonable plan to
> > > > get there, and then that should be enough to unblock work in the wider
> > > > community. Holding the community at large hostage because one driver
> > > > is different is really not great.
> > >
> > > Well forcing a drivers into a synchronization model not ideal for their
> > > hardware isn't great either.
> >
> > As I said above, we're feeling the pain too.
> >
> > --Jason



-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-20 17:23                               ` Jason Ekstrand
  2021-05-20 19:04                                 ` Jason Ekstrand
@ 2021-05-21  7:24                                 ` Christian König
  1 sibling, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-21  7:24 UTC (permalink / raw)
  To: Jason Ekstrand
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, Michel Dänzer,
	dri-devel


Am 20.05.21 um 19:23 schrieb Jason Ekstrand:
> [SNIP]
>>>> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.
>>> This seems backwards, because that means useful improvements in all
>>> other drivers are stalled until amdgpu is fixed.
>> Well there is nothing to fix in amdgpu, what we need to is to come up
>> with an DMA-buf implicit syncing model which works for everyone.
>>
>> I've pointed this problem out at FOSDEM roughly 6 years ago, before
>> DMA-buf was even merged upstream and way before amdgpu even existed. And
>> the response was yeah, maybe we need to look at this as well.
>>
>> Over the years I've mentioned now at least 5 times that this isn't going
>> to work in some situations and came up with different approaches how to
>> fix it.
>>
>> And you still have the nerves to tell me that this isn't a problem and
>> we should fix amdgpu instead? Sorry, but I'm really running out of ideas
>> how to explain why this isn't working for everybody.
> I'm trying really hard to not fuel a flame war here but I tend to lean
> Daniel's direction on this.  Stepping back from the individual needs
> of amdgpu and looking at things from the PoV of Linux as a whole, AMD
> being a special snowflake here is bad.  I think we have two problems:
> amdgpu doesn't play by the established rules, and the rules don't work
> well for amdgpu.  We need to solve BOTH problems.  Does that mean we
> need to smash something into amdgpu to force it into the dma-buf model
> today?  Maybe not; stuff's working well enough, I guess.  But we can't
> just rewrite all the rules and break everyone else either.

Totally agree. Key point is I think I really expressed why some of the 
rules need some changes, and that at least requires an audit of 
everything currently using the dma_resv object.

>> That amdgpu wants to be special is true, but it is a fundamental problem
>> that we have designed the implicit sync in DMA-buf only around the needs
>> of DRM drivers at that time instead of going a step back and saying hey
>> what would be an approach which works for everyone.
> How else was it supposed to be designed?  Based on the needs of
> non-existent future drivers?  That's just not fair.  We (Intel) are
> being burned by various aspects of dma-buf these days too.  It does no
> good to blame past developers or our past selves for not knowing the
> future.  It sucks but it's what we have.  And, to move forward, we
> need to fix it.  Let's do that.

Yeah, coming up with a design which also works for future needs is 
always hard.

But what annoys me is that I've noted those problems way before DMA-buf 
was merged or amdgpu even existed. I could really kick my own ass for 
not having pushed back on this harder.

> My concern with the flags approach as I'm beginning to digest it is
> that it's a bit too much of an attempt to rewrite history for my
> liking.  What do I mean by that?  I mean that any solution we come up
> with needs ensure that legacy drivers and modern drivers can play
> nicely together.  Either that or we need to modernize all the users of
> dma-buf implicit sync.  I really don't like the "as long as AMD+Intel
> works, we're good" approach.

Seconded. That's why I'm saying that we need to take a step back and 
look at what would be a good design for drivers in general.

After sleeping a night over it, I think what Daniel suggested — having 
something similar to TTM's moving fence inside the dma_resv object — is 
a really good step in the right direction.

When we combine that with an ability to add fences which never 
participate in implicit sync, but only in resource management, I think 
we could solve this.

This essentially untangles resource management from implicit sync and 
results in the following four categories:

1. A moving fence used by resource management only. Userspace can't in 
any way mess with that one.
2. The existing exclusive fence which is set by CS and/or your new IOCTL.
3. The existing shared fences which can be added by CS.
4. A new group of fences which participate in resource management, but 
not in implicit sync.

Number 1 requires an audit of all places which currently do CS or page flip.

Number 4 requires an audit of all places which do resource management.

I can tackle those and I'm perfectly aware that it might take some time.

Regards,
Christian.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-20 19:14                                   ` Daniel Vetter
@ 2021-05-21  7:27                                     ` Christian König
  2021-05-21  9:36                                     ` Bas Nieuwenhuizen
  1 sibling, 0 replies; 50+ messages in thread
From: Christian König @ 2021-05-21  7:27 UTC (permalink / raw)
  To: Daniel Vetter, Jason Ekstrand
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK, Michel Dänzer,
	dri-devel

Am 20.05.21 um 21:14 schrieb Daniel Vetter:
> On Thu, May 20, 2021 at 9:04 PM Jason Ekstrand <jason@jlekstrand.net> wrote:
>> On Thu, May 20, 2021 at 12:23 PM Jason Ekstrand <jason@jlekstrand.net> wrote:
>>> On Thu, May 20, 2021 at 5:50 AM Christian König
>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>> Am 20.05.21 um 09:55 schrieb Daniel Vetter:
>>>>> On Wed, May 19, 2021 at 5:48 PM Michel Dänzer <michel@daenzer.net> wrote:
>>>>>> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
>>>>>>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
>>>>>>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
>>>>>>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>>>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König
>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
>>>>>>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
>>>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
>>>>>>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
>>>>>>>>>>>>> AFAIK nouveau does the right thing as well).
>>>>>>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
>>>>>>>>>>>> have to use the one atomic helpers use. Which is also the one that
>>>>>>>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
>>>>>>>>>>>> so as soon as that lands and someone starts using it, something has to
>>>>>>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
>>>>>>>>>>>> shared with another device.
>>>>>>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
>> I just re-sent my dma-buf sync_file import/export series.  Assuming we
>> can sort out what implicit sync looks like on the inside of dma-buf,
>> would that alleviate some of your uAPI fears?  The idea would be that
>> radeonsi and RADV would use amdgpu explicit sync primitives for
>> everything and then, at the very end, fetch a sync_file and stuff it
>> in the dma-buf's implicit sync container.  No nasty new uAPI for you.
>> We still get implicit sync.  Everyone wins?
> You still need the implicit fencing opt-out, which currently amdgpu
> lacks completely.

Well we do have a per BO flag for this! We just don't do this on command 
submission, but rather on BO creation.

> But I also thought through the security implications of the patch set
> (including the exclusive injection patch 4), and I think even with
> current amdgpu that's perfectly fine. Not very useful since the fences
> you get out aren't reflecting status accurately, but that's not a
> correctness/security issue. You'll simply hit stalls when you don't
> expect, because the kernel is allowed to throw random other exclusive
> fences in whenever it feels like.

Yes, exactly that was my concern. I think what you noted with the moving 
fence from TTM would solve that.

Regards,
Christian.

>
>> Of course, that still leaves the question of what read and write
>> fences are, what they mean, and where they go in the dma_resv.  But
>> I'm trying to separate problems here.
> Yeah I'll dump my patch set for clarifying status quo tomorrow for that.
> -Daniel
>
>> --Jason
>>
>>
>>>>>>>>>>> This doesn't works for amdgpu at all for the reasons outlined above.
>>>>>>>>>> Uh that's really not how uapi works. "my driver is right, everyone
>>>>>>>>>> else is wrong" is not how cross driver contracts are defined. If that
>>>>>>>>>> means a perf impact until you've fixed your rules, that's on you.
>>>>>>>>>>
>>>>>>>>>> Also you're a few years too late with nacking this, it's already uapi
>>>>>>>>>> in the form of the dma-buf poll() support.
>>>>>>>>> ^^  My fancy new ioctl doesn't expose anything that isn't already
>>>>>>>>> there.  It just lets you take a snap-shot of a wait instead of doing
>>>>>>>>> an active wait which might end up with more fences added depending on
>>>>>>>>> interrupts and retries.  The dma-buf poll waits on all fences for
>>>>>>>>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
>>>>>>>> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
>>>>>>>>
>>>>>>>> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
>>>>>>> My new ioctl has identical semantics to poll().  It just lets you take
>>>>>>> a snapshot in time to wait on later instead of waiting on whatever
>>>>>>> happens to be set right now.  IMO, having identical semantics to
>>>>>>> poll() isn't something we want to change.
>>>>>> Agreed.
>>>>>>
>>>>>> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.
>>>>> This seems backwards, because that means useful improvements in all
>>>>> other drivers are stalled until amdgpu is fixed.
>>>> Well there is nothing to fix in amdgpu, what we need to is to come up
>>>> with an DMA-buf implicit syncing model which works for everyone.
>>>>
>>>> I've pointed this problem out at FOSDEM roughly 6 years ago, before
>>>> DMA-buf was even merged upstream and way before amdgpu even existed. And
>>>> the response was yeah, maybe we need to look at this as well.
>>>>
>>>> Over the years I've mentioned now at least 5 times that this isn't going
>>>> to work in some situations and came up with different approaches how to
>>>> fix it.
>>>>
>>>> And you still have the nerves to tell me that this isn't a problem and
>>>> we should fix amdgpu instead? Sorry, but I'm really running out of ideas
>>>> how to explain why this isn't working for everybody.
>>> I'm trying really hard to not fuel a flame war here but I tend to lean
>>> Daniel's direction on this.  Stepping back from the individual needs
>>> of amdgpu and looking at things from the PoV of Linux as a whole, AMD
>>> being a special snowflake here is bad.  I think we have two problems:
>>> amdgpu doesn't play by the established rules, and the rules don't work
>>> well for amdgpu.  We need to solve BOTH problems.  Does that mean we
>>> need to smash something into amdgpu to force it into the dma-buf model
>>> today?  Maybe not; stuff's working well enough, I guess.  But we can't
>>> just rewrite all the rules and break everyone else either.
>>>
>>>> That amdgpu wants to be special is true, but it is a fundamental problem
>>>> that we have designed the implicit sync in DMA-buf only around the needs
>>>> of DRM drivers at that time instead of going a step back and saying hey
>>>> what would be an approach which works for everyone.
>>> How else was it supposed to be designed?  Based on the needs of
>>> non-existent future drivers?  That's just not fair.  We (Intel) are
>>> being burned by various aspects of dma-buf these days too.  It does no
>>> good to blame past developers or our past selves for not knowing the
>>> future.  It sucks but it's what we have.  And, to move forward, we
>>> need to fix it.  Let's do that.
>>>
>>> My concern with the flags approach as I'm beginning to digest it is
>>> that it's a bit too much of an attempt to rewrite history for my
>>> liking.  What do I mean by that?  I mean that any solution we come up
>>> with needs ensure that legacy drivers and modern drivers can play
>>> nicely together.  Either that or we need to modernize all the users of
>>> dma-buf implicit sync.  I really don't like the "as long as AMD+Intel
>>> works, we're good" approach.
>>>
>>>> You just need to apply my example from FOSDEM with ring buffers in a
>>>> single BO to the DMA-buf implicit sync model and immediately see how it
>>>> falls apart.
>>>>
>>>>> I think we need agreement on what the rules are, reasonable plan to
>>>>> get there, and then that should be enough to unblock work in the wider
>>>>> community. Holding the community at large hostage because one driver
>>>>> is different is really not great.
>>>> Well forcing a drivers into a synchronization model not ideal for their
>>>> hardware isn't great either.
>>> As I said above, we're feeling the pain too.
>>>
>>> --Jason
>
>


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-20 19:14                                   ` Daniel Vetter
  2021-05-21  7:27                                     ` Christian König
@ 2021-05-21  9:36                                     ` Bas Nieuwenhuizen
  1 sibling, 0 replies; 50+ messages in thread
From: Bas Nieuwenhuizen @ 2021-05-21  9:36 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, Michel Dänzer, dri-devel,
	Jason Ekstrand

On Thu, May 20, 2021 at 9:15 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Thu, May 20, 2021 at 9:04 PM Jason Ekstrand <jason@jlekstrand.net> wrote:
> >
> > On Thu, May 20, 2021 at 12:23 PM Jason Ekstrand <jason@jlekstrand.net> wrote:
> > >
> > > On Thu, May 20, 2021 at 5:50 AM Christian König
> > > <ckoenig.leichtzumerken@gmail.com> wrote:
> > > >
> > > > Am 20.05.21 um 09:55 schrieb Daniel Vetter:
> > > > > On Wed, May 19, 2021 at 5:48 PM Michel Dänzer <michel@daenzer.net> wrote:
> > > > >> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
> > > > >>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
> > > > >>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
> > > > >>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > >>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König
> > > > >>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > > > >>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
> > > > >>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
> > > > >>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > > > >>>>>>>>
> > > > >>>>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
> > > > >>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
> > > > >>>>>>>>> AFAIK nouveau does the right thing as well).
> > > > >>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
> > > > >>>>>>>> have to use the one atomic helpers use. Which is also the one that
> > > > >>>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl,
> > > > >>>>>>>> so as soon as that lands and someone starts using it, something has to
> > > > >>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
> > > > >>>>>>>> shared with another device.
> > > > >>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
> >
> > I just re-sent my dma-buf sync_file import/export series.  Assuming we
> > can sort out what implicit sync looks like on the inside of dma-buf,
> > would that alleviate some of your uAPI fears?  The idea would be that
> > radeonsi and RADV would use amdgpu explicit sync primitives for
> > everything and then, at the very end, fetch a sync_file and stuff it
> > in the dma-buf's implicit sync container.  No nasty new uAPI for you.
> > We still get implicit sync.  Everyone wins?
>
> You still need the implicit fencing opt-out, which currently amdgpu
> lacks completely.

I think one of my big questions is to what extent we need another
opt-out in amdgpu (I certainly want the submission-level opt-out, but
I don't think it is blocking). Currently amdgpu has two opt-outs at
buffer creation time:

1) AMDGPU_GEM_CREATE_EXPLICIT_SYNC
2) AMDGPU_GEM_CREATE_VM_ALWAYS_VALID

Together implicit sync is mostly disabled for Vulkan outside of the
WSI, allowing multiple engines simultaneously if we just use shared
fences for these (or for the latter not really any fences at the BO
level, but they are not shareable). On GL I think radeonsi now
disabled essentially all SDMA usage meaning it is pretty much
restricted to one engine at a time. Leaves video, where I'm not sure.
Maybe reference frames? On the other hand, video often involves a
process/drm-file boundary in which case the shared fences already
worked as exclusive and hence prohibit cross-engine usage.





>
> But I also thought through the security implications of the patch set
> (including the exclusive injection patch 4), and I think even with
> current amdgpu that's perfectly fine. Not very useful since the fences
> you get out aren't reflecting status accurately, but that's not a
> correctness/security issue. You'll simply hit stalls when you don't
> expect, because the kernel is allowed to throw random other exclusive
> fences in whenever it feels like.
>
> > Of course, that still leaves the question of what read and write
> > fences are, what they mean, and where they go in the dma_resv.  But
> > I'm trying to separate problems here.
>
> Yeah I'll dump my patch set for clarifying status quo tomorrow for that.
> -Daniel
>
> >
> > --Jason
> >
> >
> > > > >>>>>>> This doesn't works for amdgpu at all for the reasons outlined above.
> > > > >>>>>> Uh that's really not how uapi works. "my driver is right, everyone
> > > > >>>>>> else is wrong" is not how cross driver contracts are defined. If that
> > > > >>>>>> means a perf impact until you've fixed your rules, that's on you.
> > > > >>>>>>
> > > > >>>>>> Also you're a few years too late with nacking this, it's already uapi
> > > > >>>>>> in the form of the dma-buf poll() support.
> > > > >>>>> ^^  My fancy new ioctl doesn't expose anything that isn't already
> > > > >>>>> there.  It just lets you take a snap-shot of a wait instead of doing
> > > > >>>>> an active wait which might end up with more fences added depending on
> > > > >>>>> interrupts and retries.  The dma-buf poll waits on all fences for
> > > > >>>>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
> > > > >>>> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
> > > > >>>>
> > > > >>>> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
> > > > >>> My new ioctl has identical semantics to poll().  It just lets you take
> > > > >>> a snapshot in time to wait on later instead of waiting on whatever
> > > > >>> happens to be set right now.  IMO, having identical semantics to
> > > > >>> poll() isn't something we want to change.
> > > > >> Agreed.
> > > > >>
> > > > >> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.
> > > > > This seems backwards, because that means useful improvements in all
> > > > > other drivers are stalled until amdgpu is fixed.
> > > >
> > > > Well there is nothing to fix in amdgpu, what we need to is to come up
> > > > with an DMA-buf implicit syncing model which works for everyone.
> > > >
> > > > I've pointed this problem out at FOSDEM roughly 6 years ago, before
> > > > DMA-buf was even merged upstream and way before amdgpu even existed. And
> > > > the response was yeah, maybe we need to look at this as well.
> > > >
> > > > Over the years I've mentioned now at least 5 times that this isn't going
> > > > to work in some situations and came up with different approaches how to
> > > > fix it.
> > > >
> > > > And you still have the nerves to tell me that this isn't a problem and
> > > > we should fix amdgpu instead? Sorry, but I'm really running out of ideas
> > > > how to explain why this isn't working for everybody.
> > >
> > > I'm trying really hard to not fuel a flame war here but I tend to lean
> > > Daniel's direction on this.  Stepping back from the individual needs
> > > of amdgpu and looking at things from the PoV of Linux as a whole, AMD
> > > being a special snowflake here is bad.  I think we have two problems:
> > > amdgpu doesn't play by the established rules, and the rules don't work
> > > well for amdgpu.  We need to solve BOTH problems.  Does that mean we
> > > need to smash something into amdgpu to force it into the dma-buf model
> > > today?  Maybe not; stuff's working well enough, I guess.  But we can't
> > > just rewrite all the rules and break everyone else either.
> > >
> > > > That amdgpu wants to be special is true, but it is a fundamental problem
> > > > that we have designed the implicit sync in DMA-buf only around the needs
> > > > of DRM drivers at that time instead of going a step back and saying hey
> > > > what would be an approach which works for everyone.
> > >
> > > How else was it supposed to be designed?  Based on the needs of
> > > non-existent future drivers?  That's just not fair.  We (Intel) are
> > > being burned by various aspects of dma-buf these days too.  It does no
> > > good to blame past developers or our past selves for not knowing the
> > > future.  It sucks but it's what we have.  And, to move forward, we
> > > need to fix it.  Let's do that.
> > >
> > > My concern with the flags approach as I'm beginning to digest it is
> > > that it's a bit too much of an attempt to rewrite history for my
> > > liking.  What do I mean by that?  I mean that any solution we come up
> > > with needs ensure that legacy drivers and modern drivers can play
> > > nicely together.  Either that or we need to modernize all the users of
> > > dma-buf implicit sync.  I really don't like the "as long as AMD+Intel
> > > works, we're good" approach.
> > >
> > > > You just need to apply my example from FOSDEM with ring buffers in a
> > > > single BO to the DMA-buf implicit sync model and immediately see how it
> > > > falls apart.
> > > >
> > > > > I think we need agreement on what the rules are, reasonable plan to
> > > > > get there, and then that should be enough to unblock work in the wider
> > > > > community. Holding the community at large hostage because one driver
> > > > > is different is really not great.
> > > >
> > > > Well forcing a drivers into a synchronization model not ideal for their
> > > > hardware isn't great either.
> > >
> > > As I said above, we're feeling the pain too.
> > >
> > > --Jason
>
>
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] Add DMA_RESV_USAGE flags
  2021-05-20 14:18                               ` Daniel Vetter
  2021-05-20 14:30                                 ` Michel Dänzer
  2021-05-20 17:08                                 ` Jason Ekstrand
@ 2021-05-31 12:49                                 ` Michel Dänzer
  2 siblings, 0 replies; 50+ messages in thread
From: Michel Dänzer @ 2021-05-31 12:49 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: moderated list:DMA BUFFER SHARING FRAMEWORK,
	Christian König, Jason Ekstrand, dri-devel

On 2021-05-20 4:18 p.m., Daniel Vetter wrote:
> On Thu, May 20, 2021 at 10:13:38AM +0200, Michel Dänzer wrote:
>> On 2021-05-20 9:55 a.m., Daniel Vetter wrote:
>>> On Wed, May 19, 2021 at 5:48 PM Michel Dänzer <michel@daenzer.net> wrote:
>>>>
>>>> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote:
>>>>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer <michel@daenzer.net> wrote:
>>>>>>
>>>>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote:
>>>>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>>>>>
>>>>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König
>>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>>>>
>>>>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter:
>>>>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König
>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>>>>>
>>>>>>>>>>> And as long as we are all inside amdgpu we also don't have any oversync,
>>>>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and
>>>>>>>>>>> AFAIK nouveau does the right thing as well).
>>>>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and
>>>>>>>>>> have to use the one atomic helpers use. Which is also the one that
>>>>>>>>>> e.g. Jason is threatening to bake in as uapi with his dma_buf ioctl,
>>>>>>>>>> so as soon as that lands and someone starts using it, something has to
>>>>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's
>>>>>>>>>> shared with another device.
>>>>>>>>>
>>>>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change.
>>>>>>>>>
>>>>>>>>> This doesn't work for amdgpu at all for the reasons outlined above.
>>>>>>>>
>>>>>>>> Uh that's really not how uapi works. "my driver is right, everyone
>>>>>>>> else is wrong" is not how cross driver contracts are defined. If that
>>>>>>>> means a perf impact until you've fixed your rules, that's on you.
>>>>>>>>
>>>>>>>> Also you're a few years too late with nacking this, it's already uapi
>>>>>>>> in the form of the dma-buf poll() support.
>>>>>>>
>>>>>>> ^^  My fancy new ioctl doesn't expose anything that isn't already
>>>>>>> there.  It just lets you take a snap-shot of a wait instead of doing
>>>>>>> an active wait which might end up with more fences added depending on
>>>>>>> interrupts and retries.  The dma-buf poll waits on all fences for
>>>>>>> POLLOUT and only the exclusive fence for POLLIN.  It's already uAPI.
>>>>>>
>>>>>> Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu.
>>>>>>
>>>>>> Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors.
>>>>>
>>>>> My new ioctl has identical semantics to poll().  It just lets you take
>>>>> a snapshot in time to wait on later instead of waiting on whatever
>>>>> happens to be set right now.  IMO, having identical semantics to
>>>>> poll() isn't something we want to change.
>>>>
>>>> Agreed.
>>>>
>>>> I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted.
>>>
>>> This seems backwards, because that means useful improvements in all
>>> other drivers are stalled until amdgpu is fixed.
>>>
>>> I think we need agreement on what the rules are, reasonable plan to
>>> get there, and then that should be enough to unblock work in the wider
>>> community. Holding the community at large hostage because one driver
>>> is different is really not great.
>>
>> I think we're in violent agreement. :) The point I was trying to make is
>> that amdgpu really needs to be fixed to be consistent with other drivers
>> ASAP.
> 
> It's not that easy at all. I think best case we're looking at about a one
> year plan to get this into shape, taking into account usual release/distro
> update latencies.
> 
> Best case.
> 
> But also it's not a really big issue, since this shouldn't stop
> compositors from using poll on dma-buf fd or the sync_file stuff from
> Jason: The use-case for this in compositors is to avoid a single client
> stalling the entire desktop. If a driver lies by not setting the exclusive
> fence when expected, you simply don't get this stall avoidance benefit of
> misbehaving clients.

That's a good point; I was coming to the same realization.


> But also this needs a gpu scheduler and higher priority for the
> compositor (or a lot of hw planes so you can composite
> with them alone), so it's all fairly academic issue.

I went ahead and implemented this for mutter: https://gitlab.gnome.org/GNOME/mutter/-/merge_requests/1880

Works as intended on my work laptop with Intel GPU, so it's not just academic. :)

I hope this can serve as motivation for providing the same poll semantics (and a higher priority GFX queue exposed via EGL_IMG_context_priority) in amdgpu as well.


-- 
Earthling Michel Dänzer               |               https://redhat.com
Libre software enthusiast             |             Mesa and X developer

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2021-05-31 12:49 UTC | newest]

Thread overview: 50+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-17 14:11 [RFC] Add DMA_RESV_USAGE flags Christian König
2021-05-17 14:11 ` [PATCH 01/11] dma-buf: fix invalid debug print Christian König
2021-05-17 14:11 ` [PATCH 02/11] dma-buf: add SPDX header and fix style in dma-resv.c Christian König
2021-05-17 14:11 ` [PATCH 03/11] dma-buf: cleanup dma-resv shared fence debugging a bit Christian König
2021-05-17 14:11 ` [PATCH 04/11] dma-buf: rename and cleanup dma_resv_get_excl Christian König
2021-05-17 14:11 ` [PATCH 05/11] dma-buf: rename and cleanup dma_resv_get_list Christian König
2021-05-17 14:11 ` [PATCH 06/11] dma-buf: add dma_resv_list_fence helper Christian König
2021-05-17 14:11 ` [PATCH 07/11] dma-buf: add dma_resv_replace_shared Christian König
2021-05-17 14:11 ` [PATCH 08/11] dma-buf: improve shared fence abstraction Christian König
2021-05-17 14:11 ` [PATCH 09/11] dma-buf: add shared fence usage flags Christian König
2021-05-17 20:36   ` Daniel Vetter
2021-05-18 12:54     ` Christian König
2021-05-17 14:11 ` [PATCH 10/11] drm/i915: also wait for shared dmabuf fences before flip Christian König
2021-05-17 14:11 ` [PATCH 11/11] drm/amdgpu: fix shared access to exported DMA-bufs Christian König
2021-05-17 15:04 ` [RFC] Add DMA_RESV_USAGE flags Daniel Vetter
2021-05-17 19:38   ` Christian König
2021-05-17 20:15     ` Jason Ekstrand
2021-05-17 20:15     ` Daniel Vetter
2021-05-17 22:49       ` Jason Ekstrand
2021-05-18  5:59         ` Daniel Vetter
2021-05-18 10:29           ` Daniel Vetter
2021-05-18 12:49           ` Christian König
2021-05-18 13:26             ` Daniel Stone
2021-05-18 13:51               ` Christian König
2021-05-18 16:48             ` Daniel Vetter
2021-05-18 17:40               ` Christian König
2021-05-18 21:17                 ` Daniel Vetter
2021-05-18 22:06                   ` Jason Ekstrand
2021-05-19 10:52                     ` Michel Dänzer
2021-05-19 15:21                       ` Jason Ekstrand
2021-05-19 15:48                         ` Michel Dänzer
2021-05-20  7:55                           ` Daniel Vetter
2021-05-20  8:13                             ` Michel Dänzer
2021-05-20 10:00                               ` Christian König
2021-05-20 14:18                               ` Daniel Vetter
2021-05-20 14:30                                 ` Michel Dänzer
2021-05-20 17:08                                 ` Jason Ekstrand
2021-05-31 12:49                                 ` Michel Dänzer
2021-05-20 10:50                             ` Christian König
2021-05-20 17:23                               ` Jason Ekstrand
2021-05-20 19:04                                 ` Jason Ekstrand
2021-05-20 19:14                                   ` Daniel Vetter
2021-05-21  7:27                                     ` Christian König
2021-05-21  9:36                                     ` Bas Nieuwenhuizen
2021-05-21  7:24                                 ` Christian König
2021-05-19 11:43                     ` Christian König
2021-05-19 15:35                       ` Jason Ekstrand
2021-05-19 11:24                   ` Christian König
2021-05-20  7:58                     ` Daniel Vetter
2021-05-18 21:31                 ` Dave Airlie

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.