* RFC: Add write flag to reservation object fences
From: Christian König @ 2018-08-09 11:37 UTC
  To: intel-gfx, amd-gfx, dri-devel

Hi everyone,

This set of patches tries to improve read-after-write hazard handling for reservation objects.

It allows us to specify for each shared fence whether it represents a write operation.

Based on this, the i915 driver is modified to always wait for all writes before a pageflip, and the previously used workaround is removed from amdgpu.
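
For illustration, a driver's command submission path would then tag its
fences roughly like this (a minimal sketch against the interfaces touched
by this series; the job_* variables are hypothetical):

	reservation_object_lock(bo->resv, NULL);
	if (job_needs_exclusive_access)
		reservation_object_add_excl_fence(bo->resv, fence);
	else if (reservation_object_reserve_shared(bo->resv) == 0)
		/* flag shared write fences so that e.g. the pageflip
		 * path can wait for them
		 */
		reservation_object_add_shared_fence(bo->resv, fence,
						    job_writes_bo);
	reservation_object_unlock(bo->resv);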

Please comment,
Christian.


* [PATCH 1/6] dma-buf: remove shared fence staging in reservation object
From: Christian König @ 2018-08-09 11:37 UTC
  To: intel-gfx, amd-gfx, dri-devel

The shared fence staging is no longer needed. Just replace the whole list
when there isn't enough room for at least one additional fence.
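
The calling convention for drivers stays the same: reserve a slot first,
then add the fence while holding the reservation lock. A minimal sketch
(error handling trimmed, variable names illustrative):

	reservation_object_lock(resv, NULL);
	r = reservation_object_reserve_shared(resv); /* may replace the list */
	if (!r)
		reservation_object_add_shared_fence(resv, fence); /* in place */
	reservation_object_unlock(resv);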

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/reservation.c | 180 ++++++++++++++----------------------------
 include/linux/reservation.h   |   4 -
 2 files changed, 60 insertions(+), 124 deletions(-)

diff --git a/drivers/dma-buf/reservation.c b/drivers/dma-buf/reservation.c
index 314eb1071cce..1f0c61b540ba 100644
--- a/drivers/dma-buf/reservation.c
+++ b/drivers/dma-buf/reservation.c
@@ -68,104 +68,23 @@ EXPORT_SYMBOL(reservation_seqcount_string);
  */
 int reservation_object_reserve_shared(struct reservation_object *obj)
 {
-	struct reservation_object_list *fobj, *old;
-	u32 max;
+	struct reservation_object_list *old, *new;
+	unsigned int i, j, k, max;
 
 	old = reservation_object_get_list(obj);
 
 	if (old && old->shared_max) {
-		if (old->shared_count < old->shared_max) {
-			/* perform an in-place update */
-			kfree(obj->staged);
-			obj->staged = NULL;
+		if (old->shared_count < old->shared_max)
 			return 0;
-		} else
+		else
 			max = old->shared_max * 2;
-	} else
-		max = 4;
-
-	/*
-	 * resize obj->staged or allocate if it doesn't exist,
-	 * noop if already correct size
-	 */
-	fobj = krealloc(obj->staged, offsetof(typeof(*fobj), shared[max]),
-			GFP_KERNEL);
-	if (!fobj)
-		return -ENOMEM;
-
-	obj->staged = fobj;
-	fobj->shared_max = max;
-	return 0;
-}
-EXPORT_SYMBOL(reservation_object_reserve_shared);
-
-static void
-reservation_object_add_shared_inplace(struct reservation_object *obj,
-				      struct reservation_object_list *fobj,
-				      struct dma_fence *fence)
-{
-	struct dma_fence *signaled = NULL;
-	u32 i, signaled_idx;
-
-	dma_fence_get(fence);
-
-	preempt_disable();
-	write_seqcount_begin(&obj->seq);
-
-	for (i = 0; i < fobj->shared_count; ++i) {
-		struct dma_fence *old_fence;
-
-		old_fence = rcu_dereference_protected(fobj->shared[i],
-						reservation_object_held(obj));
-
-		if (old_fence->context == fence->context) {
-			/* memory barrier is added by write_seqcount_begin */
-			RCU_INIT_POINTER(fobj->shared[i], fence);
-			write_seqcount_end(&obj->seq);
-			preempt_enable();
-
-			dma_fence_put(old_fence);
-			return;
-		}
-
-		if (!signaled && dma_fence_is_signaled(old_fence)) {
-			signaled = old_fence;
-			signaled_idx = i;
-		}
-	}
-
-	/*
-	 * memory barrier is added by write_seqcount_begin,
-	 * fobj->shared_count is protected by this lock too
-	 */
-	if (signaled) {
-		RCU_INIT_POINTER(fobj->shared[signaled_idx], fence);
 	} else {
-		RCU_INIT_POINTER(fobj->shared[fobj->shared_count], fence);
-		fobj->shared_count++;
+		max = 4;
 	}
 
-	write_seqcount_end(&obj->seq);
-	preempt_enable();
-
-	dma_fence_put(signaled);
-}
-
-static void
-reservation_object_add_shared_replace(struct reservation_object *obj,
-				      struct reservation_object_list *old,
-				      struct reservation_object_list *fobj,
-				      struct dma_fence *fence)
-{
-	unsigned i, j, k;
-
-	dma_fence_get(fence);
-
-	if (!old) {
-		RCU_INIT_POINTER(fobj->shared[0], fence);
-		fobj->shared_count = 1;
-		goto done;
-	}
+	new = kmalloc(offsetof(typeof(*new), shared[max]), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
 
 	/*
 	 * no need to bump fence refcounts, rcu_read access
@@ -173,46 +92,45 @@ reservation_object_add_shared_replace(struct reservation_object *obj,
 	 * references from the old struct are carried over to
 	 * the new.
 	 */
-	for (i = 0, j = 0, k = fobj->shared_max; i < old->shared_count; ++i) {
-		struct dma_fence *check;
-
-		check = rcu_dereference_protected(old->shared[i],
-						reservation_object_held(obj));
+	for (i = 0, j = 0, k = max; i < (old ? old->shared_count : 0); ++i) {
+		struct dma_fence *fence;
 
-		if (check->context == fence->context ||
-		    dma_fence_is_signaled(check))
-			RCU_INIT_POINTER(fobj->shared[--k], check);
+		fence = rcu_dereference_protected(old->shared[i],
+						  reservation_object_held(obj));
+		if (dma_fence_is_signaled(fence))
+			RCU_INIT_POINTER(new->shared[--k], fence);
 		else
-			RCU_INIT_POINTER(fobj->shared[j++], check);
+			RCU_INIT_POINTER(new->shared[j++], fence);
 	}
-	fobj->shared_count = j;
-	RCU_INIT_POINTER(fobj->shared[fobj->shared_count], fence);
-	fobj->shared_count++;
+	new->shared_count = j;
+	new->shared_max = max;
 
-done:
 	preempt_disable();
 	write_seqcount_begin(&obj->seq);
 	/*
 	 * RCU_INIT_POINTER can be used here,
 	 * seqcount provides the necessary barriers
 	 */
-	RCU_INIT_POINTER(obj->fence, fobj);
+	RCU_INIT_POINTER(obj->fence, new);
 	write_seqcount_end(&obj->seq);
 	preempt_enable();
 
 	if (!old)
-		return;
+		return 0;
 
 	/* Drop the references to the signaled fences */
-	for (i = k; i < fobj->shared_max; ++i) {
-		struct dma_fence *f;
+	for (i = k; i < new->shared_max; ++i) {
+		struct dma_fence *fence;
 
-		f = rcu_dereference_protected(fobj->shared[i],
-					      reservation_object_held(obj));
-		dma_fence_put(f);
+		fence = rcu_dereference_protected(new->shared[i],
+						  reservation_object_held(obj));
+		dma_fence_put(fence);
 	}
 	kfree_rcu(old, rcu);
+
+	return 0;
 }
+EXPORT_SYMBOL(reservation_object_reserve_shared);
 
 /**
  * reservation_object_add_shared_fence - Add a fence to a shared slot
@@ -225,16 +143,41 @@ reservation_object_add_shared_replace(struct reservation_object *obj,
 void reservation_object_add_shared_fence(struct reservation_object *obj,
 					 struct dma_fence *fence)
 {
-	struct reservation_object_list *old, *fobj = obj->staged;
+	struct reservation_object_list *fobj;
+	unsigned int i;
 
-	old = reservation_object_get_list(obj);
-	obj->staged = NULL;
+	dma_fence_get(fence);
+
+	fobj = reservation_object_get_list(obj);
 
-	if (!fobj) {
-		BUG_ON(old->shared_count >= old->shared_max);
-		reservation_object_add_shared_inplace(obj, old, fence);
-	} else
-		reservation_object_add_shared_replace(obj, old, fobj, fence);
+	preempt_disable();
+	write_seqcount_begin(&obj->seq);
+
+	for (i = 0; i < fobj->shared_count; ++i) {
+		struct dma_fence *old_fence;
+
+		old_fence = rcu_dereference_protected(fobj->shared[i],
+						      reservation_object_held(obj));
+		if (old_fence->context == fence->context ||
+		    dma_fence_is_signaled(old_fence)) {
+			/* memory barrier is added by write_seqcount_begin */
+			RCU_INIT_POINTER(fobj->shared[i], fence);
+			write_seqcount_end(&obj->seq);
+			preempt_enable();
+			dma_fence_put(old_fence);
+			return;
+		}
+	}
+
+	/*
+	 * memory barrier is added by write_seqcount_begin,
+	 * fobj->shared_count is protected by this lock too
+	 */
+	RCU_INIT_POINTER(fobj->shared[fobj->shared_count], fence);
+	fobj->shared_count++;
+
+	write_seqcount_end(&obj->seq);
+	preempt_enable();
 }
 EXPORT_SYMBOL(reservation_object_add_shared_fence);
 
@@ -343,9 +286,6 @@ int reservation_object_copy_fences(struct reservation_object *dst,
 	new = dma_fence_get_rcu_safe(&src->fence_excl);
 	rcu_read_unlock();
 
-	kfree(dst->staged);
-	dst->staged = NULL;
-
 	src_list = reservation_object_get_list(dst);
 	old = reservation_object_get_excl(dst);
 
diff --git a/include/linux/reservation.h b/include/linux/reservation.h
index 02166e815afb..54cf6773a14c 100644
--- a/include/linux/reservation.h
+++ b/include/linux/reservation.h
@@ -68,7 +68,6 @@ struct reservation_object_list {
  * @seq: sequence count for managing RCU read-side synchronization
  * @fence_excl: the exclusive fence, if there is one currently
  * @fence: list of current shared fences
- * @staged: staged copy of shared fences for RCU updates
  */
 struct reservation_object {
 	struct ww_mutex lock;
@@ -76,7 +75,6 @@ struct reservation_object {
 
 	struct dma_fence __rcu *fence_excl;
 	struct reservation_object_list __rcu *fence;
-	struct reservation_object_list *staged;
 };
 
 #define reservation_object_held(obj) lockdep_is_held(&(obj)->lock.base)
@@ -95,7 +93,6 @@ reservation_object_init(struct reservation_object *obj)
 	__seqcount_init(&obj->seq, reservation_seqcount_string, &reservation_seqcount_class);
 	RCU_INIT_POINTER(obj->fence, NULL);
 	RCU_INIT_POINTER(obj->fence_excl, NULL);
-	obj->staged = NULL;
 }
 
 /**
@@ -124,7 +121,6 @@ reservation_object_fini(struct reservation_object *obj)
 
 		kfree(fobj);
 	}
-	kfree(obj->staged);
 
 	ww_mutex_destroy(&obj->lock);
 }
-- 
2.14.1


* [PATCH 2/6] dma-buf: add reservation object shared fence accessor
From: Christian König @ 2018-08-09 11:37 UTC
  To: intel-gfx, amd-gfx, dri-devel

Add a helper to access the shared fences in a reservation object.
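
Callers holding the reservation lock can then replace the open-coded
rcu_dereference_protected() with the helper, e.g. (illustrative sketch):

	flist = reservation_object_get_list(resv);
	for (i = 0; flist && i < flist->shared_count; ++i) {
		struct dma_fence *f =
			reservation_object_get_shared_fence(resv, flist, i);

		/* f stays valid while resv is locked; no reference taken */
	}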

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  7 ++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c         |  3 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c          |  4 ++--
 drivers/gpu/drm/msm/msm_gem.c                    |  4 ++--
 drivers/gpu/drm/nouveau/nouveau_fence.c          |  3 +--
 drivers/gpu/drm/radeon/radeon_sync.c             |  3 +--
 drivers/gpu/drm/ttm/ttm_bo.c                     |  4 +---
 include/linux/reservation.h                      | 19 +++++++++++++++++++
 8 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index fa38a960ce00..989932234160 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -238,9 +238,7 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
 	for (i = 0; i < shared_count; ++i) {
 		struct dma_fence *f;
 
-		f = rcu_dereference_protected(fobj->shared[i],
-					      reservation_object_held(resv));
-
+		f = reservation_object_get_shared_fence(resv, fobj, i);
 		if (ef) {
 			if (f->context == ef->base.context) {
 				dma_fence_put(f);
@@ -273,8 +271,7 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
 		struct dma_fence *f;
 		struct amdgpu_amdkfd_fence *efence;
 
-		f = rcu_dereference_protected(fobj->shared[i],
-			reservation_object_held(resv));
+		f = reservation_object_get_shared_fence(resv, fobj, i);
 
 		efence = to_amdgpu_amdkfd_fence(f);
 		if (efence) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
index 2d6f5ec77a68..dbfd62ab67e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -212,8 +212,7 @@ int amdgpu_sync_resv(struct amdgpu_device *adev,
 		return r;
 
 	for (i = 0; i < flist->shared_count; ++i) {
-		f = rcu_dereference_protected(flist->shared[i],
-					      reservation_object_held(resv));
+		f = reservation_object_get_shared_fence(resv, flist, i);
 		/* We only want to trigger KFD eviction fences on
 		 * evict or move jobs. Skip KFD fences otherwise.
 		 */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index c6611cff64c8..22896a398eab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1482,8 +1482,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
 	flist = reservation_object_get_list(bo->resv);
 	if (flist) {
 		for (i = 0; i < flist->shared_count; ++i) {
-			f = rcu_dereference_protected(flist->shared[i],
-				reservation_object_held(bo->resv));
+			f = reservation_object_get_shared_fence(bo->resv,
+								flist, i);
 			if (amdkfd_fence_check_mm(f, current->mm))
 				return false;
 		}
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index f583bb4222f9..95d25dbfde2b 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -651,8 +651,8 @@ int msm_gem_sync_object(struct drm_gem_object *obj,
 		return 0;
 
 	for (i = 0; i < fobj->shared_count; i++) {
-		fence = rcu_dereference_protected(fobj->shared[i],
-						reservation_object_held(msm_obj->resv));
+		fence = reservation_object_get_shared_fence(msm_obj->resv,
+							    fobj, i);
 		if (fence->context != fctx->context) {
 			ret = dma_fence_wait(fence, true);
 			if (ret)
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 412d49bc6e56..3ce921c276c1 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -376,8 +376,7 @@ nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool e
 		struct nouveau_channel *prev = NULL;
 		bool must_wait = true;
 
-		fence = rcu_dereference_protected(fobj->shared[i],
-						reservation_object_held(resv));
+		fence = reservation_object_get_shared_fence(resv, fobj, i);
 
 		f = nouveau_local_fence(fence, chan->drm);
 		if (f) {
diff --git a/drivers/gpu/drm/radeon/radeon_sync.c b/drivers/gpu/drm/radeon/radeon_sync.c
index be5d7a38d3aa..bf7f9a648838 100644
--- a/drivers/gpu/drm/radeon/radeon_sync.c
+++ b/drivers/gpu/drm/radeon/radeon_sync.c
@@ -110,8 +110,7 @@ int radeon_sync_resv(struct radeon_device *rdev,
 		return r;
 
 	for (i = 0; i < flist->shared_count; ++i) {
-		f = rcu_dereference_protected(flist->shared[i],
-					      reservation_object_held(resv));
+		f = reservation_object_get_shared_fence(resv, flist, i);
 		fence = to_radeon_fence(f);
 		if (fence && fence->rdev == rdev)
 			radeon_sync_fence(sync, fence);
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 7c484729f9b2..820d97d3e8b9 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -370,9 +370,7 @@ static void ttm_bo_flush_all_fences(struct ttm_buffer_object *bo)
 		dma_fence_enable_sw_signaling(fence);
 
 	for (i = 0; fobj && i < fobj->shared_count; ++i) {
-		fence = rcu_dereference_protected(fobj->shared[i],
-					reservation_object_held(bo->resv));
-
+		fence = reservation_object_get_shared_fence(bo->resv, fobj, i);
 		if (!fence->ops->signaled)
 			dma_fence_enable_sw_signaling(fence);
 	}
diff --git a/include/linux/reservation.h b/include/linux/reservation.h
index 54cf6773a14c..8a3298574bf5 100644
--- a/include/linux/reservation.h
+++ b/include/linux/reservation.h
@@ -140,6 +140,25 @@ reservation_object_get_list(struct reservation_object *obj)
 					 reservation_object_held(obj));
 }
 
+/**
+ * reservation_object_get_shared_fence - get a fence from a reservation object's
+ * shared fence list.
+ * @obj: the reservation object
+ * @list: the list to get the fence from
+ * @idx: the index in the list to get
+ *
+ * Returns the fence from the shared fence list. Does NOT take a reference to
+ * the fence. The caller must hold obj->lock.
+ */
+static inline struct dma_fence *
+reservation_object_get_shared_fence(struct reservation_object *obj,
+				    struct reservation_object_list *list,
+				    unsigned int idx)
+{
+	return rcu_dereference_protected(list->shared[idx],
+					 reservation_object_held(obj));
+}
+
 /**
  * reservation_object_lock - lock the reservation object
  * @obj: the reservation object
-- 
2.14.1


* [PATCH 3/6] dma-buf: add is_write to reservation_object_add_shared_fence
From: Christian König @ 2018-08-09 11:37 UTC
  To: intel-gfx, amd-gfx, dri-devel

Note whether the added fence is a write by using the least significant bit
of the fence pointer.
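
This works because dma_fence allocations are at least pointer aligned, so
bit 0 of the pointer is always free. Roughly, using the macros this patch
adds to reservation.h (sketch only):

	void *e = (void *)(is_write | (long)fence);		/* tag */
	fence = reservation_object_shared_fence(e);		/* strip tag */
	is_write = reservation_object_shared_is_write(e);	/* test tag */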

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-buf.c                    |  8 +++-
 drivers/dma-buf/reservation.c                | 59 +++++++++++++++++-----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  2 +-
 drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c |  3 +-
 drivers/gpu/drm/i915/i915_gem.c              |  6 ++-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c   |  2 +-
 drivers/gpu/drm/msm/msm_gem.c                |  3 +-
 drivers/gpu/drm/nouveau/nouveau_bo.c         |  2 +-
 drivers/gpu/drm/qxl/qxl_release.c            |  3 +-
 drivers/gpu/drm/radeon/radeon_object.c       |  2 +-
 drivers/gpu/drm/ttm/ttm_bo.c                 |  2 +-
 drivers/gpu/drm/ttm/ttm_execbuf_util.c       |  3 +-
 drivers/gpu/drm/vc4/vc4_gem.c                |  3 +-
 drivers/gpu/drm/vgem/vgem_fence.c            |  2 +-
 include/linux/reservation.h                  | 20 +++++++---
 15 files changed, 76 insertions(+), 44 deletions(-)

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 13884474d158..6b816cd505d6 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -244,7 +244,10 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
 			goto out;
 
 		for (i = 0; i < shared_count; ++i) {
-			struct dma_fence *fence = rcu_dereference(fobj->shared[i]);
+			struct dma_fence *fence;
+
+			fence = reservation_object_shared_fence(
+				rcu_dereference(fobj->shared[i]));
 
 			if (!dma_fence_get_rcu(fence)) {
 				/*
@@ -1062,7 +1065,8 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 				   fence->ops->get_timeline_name(fence),
 				   dma_fence_is_signaled(fence) ? "" : "un");
 		for (i = 0; i < shared_count; i++) {
-			fence = rcu_dereference(fobj->shared[i]);
+			fence = reservation_object_shared_fence(
+				rcu_dereference(fobj->shared[i]));
 			if (!dma_fence_get_rcu(fence))
 				continue;
 			seq_printf(s, "\tShared fence: %s %s %ssignalled\n",
diff --git a/drivers/dma-buf/reservation.c b/drivers/dma-buf/reservation.c
index 1f0c61b540ba..0f98384b86d4 100644
--- a/drivers/dma-buf/reservation.c
+++ b/drivers/dma-buf/reservation.c
@@ -93,14 +93,14 @@ int reservation_object_reserve_shared(struct reservation_object *obj)
 	 * the new.
 	 */
 	for (i = 0, j = 0, k = max; i < (old ? old->shared_count : 0); ++i) {
-		struct dma_fence *fence;
+		void *e;
 
-		fence = rcu_dereference_protected(old->shared[i],
-						  reservation_object_held(obj));
-		if (dma_fence_is_signaled(fence))
-			RCU_INIT_POINTER(new->shared[--k], fence);
+		e = rcu_dereference_protected(old->shared[i],
+					      reservation_object_held(obj));
+		if (dma_fence_is_signaled(reservation_object_shared_fence(e)))
+			RCU_INIT_POINTER(new->shared[--k], e);
 		else
-			RCU_INIT_POINTER(new->shared[j++], fence);
+			RCU_INIT_POINTER(new->shared[j++], e);
 	}
 	new->shared_count = j;
 	new->shared_max = max;
@@ -120,11 +120,11 @@ int reservation_object_reserve_shared(struct reservation_object *obj)
 
 	/* Drop the references to the signaled fences */
 	for (i = k; i < new->shared_max; ++i) {
-		struct dma_fence *fence;
+		void *e;
 
-		fence = rcu_dereference_protected(new->shared[i],
-						  reservation_object_held(obj));
-		dma_fence_put(fence);
+		e = rcu_dereference_protected(new->shared[i],
+					      reservation_object_held(obj));
+		dma_fence_put(reservation_object_shared_fence(e));
 	}
 	kfree_rcu(old, rcu);
 
@@ -141,7 +141,8 @@ EXPORT_SYMBOL(reservation_object_reserve_shared);
  * reservation_object_reserve_shared() has been called.
  */
 void reservation_object_add_shared_fence(struct reservation_object *obj,
-					 struct dma_fence *fence)
+					 struct dma_fence *fence,
+					 bool is_write)
 {
 	struct reservation_object_list *fobj;
 	unsigned int i;
@@ -155,13 +156,17 @@ void reservation_object_add_shared_fence(struct reservation_object *obj,
 
 	for (i = 0; i < fobj->shared_count; ++i) {
 		struct dma_fence *old_fence;
+		void *e;
 
-		old_fence = rcu_dereference_protected(fobj->shared[i],
-						      reservation_object_held(obj));
-		if (old_fence->context == fence->context ||
+		e = rcu_dereference_protected(fobj->shared[i],
+					      reservation_object_held(obj));
+		old_fence = reservation_object_shared_fence(e);
+		if ((old_fence->context == fence->context &&
+		    reservation_object_shared_is_write(e) == is_write) ||
 		    dma_fence_is_signaled(old_fence)) {
 			/* memory barrier is added by write_seqcount_begin */
-			RCU_INIT_POINTER(fobj->shared[i], fence);
+			RCU_INIT_POINTER(fobj->shared[i],
+					 (void *)(is_write | (long)fence));
 			write_seqcount_end(&obj->seq);
 			preempt_enable();
 			dma_fence_put(old_fence);
@@ -173,7 +178,8 @@ void reservation_object_add_shared_fence(struct reservation_object *obj,
 	 * memory barrier is added by write_seqcount_begin,
 	 * fobj->shared_count is protected by this lock too
 	 */
-	RCU_INIT_POINTER(fobj->shared[fobj->shared_count], fence);
+	RCU_INIT_POINTER(fobj->shared[fobj->shared_count],
+			 (void *)(is_write | (long)fence));
 	fobj->shared_count++;
 
 	write_seqcount_end(&obj->seq);
@@ -213,8 +219,7 @@ void reservation_object_add_excl_fence(struct reservation_object *obj,
 
 	/* inplace update, no shared fences */
 	while (i--)
-		dma_fence_put(rcu_dereference_protected(old->shared[i],
-						reservation_object_held(obj)));
+		dma_fence_put(reservation_object_get_shared_fence(obj, old, i));
 
 	dma_fence_put(old_fence);
 }
@@ -260,8 +265,10 @@ int reservation_object_copy_fences(struct reservation_object *dst,
 		dst_list->shared_max = shared_count;
 		for (i = 0; i < src_list->shared_count; ++i) {
 			struct dma_fence *fence;
+			void *e;
 
-			fence = rcu_dereference(src_list->shared[i]);
+			e = rcu_dereference(src_list->shared[i]);
+			fence = reservation_object_shared_fence(e);
 			if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 				     &fence->flags))
 				continue;
@@ -277,7 +284,7 @@ int reservation_object_copy_fences(struct reservation_object *dst,
 				continue;
 			}
 
-			rcu_assign_pointer(dst_list->shared[dst_list->shared_count++], fence);
+			rcu_assign_pointer(dst_list->shared[dst_list->shared_count++], e);
 		}
 	} else {
 		dst_list = NULL;
@@ -368,7 +375,9 @@ int reservation_object_get_fences_rcu(struct reservation_object *obj,
 			shared = nshared;
 			shared_count = fobj ? fobj->shared_count : 0;
 			for (i = 0; i < shared_count; ++i) {
-				shared[i] = rcu_dereference(fobj->shared[i]);
+				void *e = rcu_dereference(fobj->shared[i]);
+
+				shared[i] = reservation_object_shared_fence(e);
 				if (!dma_fence_get_rcu(shared[i]))
 					break;
 			}
@@ -456,8 +465,10 @@ long reservation_object_wait_timeout_rcu(struct reservation_object *obj,
 			shared_count = fobj->shared_count;
 
 		for (i = 0; !fence && i < shared_count; ++i) {
-			struct dma_fence *lfence = rcu_dereference(fobj->shared[i]);
+			void *e = rcu_dereference(fobj->shared[i]);
+			struct dma_fence *lfence;
 
+			lfence = reservation_object_shared_fence(e);
 			if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 				     &lfence->flags))
 				continue;
@@ -545,8 +556,10 @@ bool reservation_object_test_signaled_rcu(struct reservation_object *obj,
 			shared_count = fobj->shared_count;
 
 		for (i = 0; i < shared_count; ++i) {
-			struct dma_fence *fence = rcu_dereference(fobj->shared[i]);
+			void *e = rcu_dereference(fobj->shared[i]);
+			struct dma_fence *fence;
 
+			fence = reservation_object_shared_fence(e);
 			ret = reservation_object_test_signaled_single(fence);
 			if (ret < 0)
 				goto retry;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index b0e14a3d54ef..303143b89275 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1342,7 +1342,7 @@ void amdgpu_bo_fence(struct amdgpu_bo *bo, struct dma_fence *fence,
 	struct reservation_object *resv = bo->tbo.resv;
 
 	if (shared)
-		reservation_object_add_shared_fence(resv, fence);
+		reservation_object_add_shared_fence(resv, fence, true);
 	else
 		reservation_object_add_excl_fence(resv, fence);
 }
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
index 46ecd3e66ac9..5f4a872a88dd 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
@@ -214,7 +214,8 @@ static void submit_attach_object_fences(struct etnaviv_gem_submit *submit)
 							  submit->out_fence);
 		else
 			reservation_object_add_shared_fence(etnaviv_obj->resv,
-							    submit->out_fence);
+							    submit->out_fence,
+							    false);
 
 		submit_unlock_object(submit, i);
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 86f1f9aaa119..0415420e1cfd 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4552,8 +4552,10 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data,
 		unsigned int shared_count = list->shared_count, i;
 
 		for (i = 0; i < shared_count; ++i) {
-			struct dma_fence *fence =
-				rcu_dereference(list->shared[i]);
+			struct dma_fence *fence;
+
+			fence = reservation_object_shared_fence(
+				rcu_dereference(list->shared[i]));
 
 			args->busy |= busy_check_reader(fence);
 		}
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index eefd449502e2..85e3f92e87f8 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1771,7 +1771,7 @@ static void eb_export_fence(struct i915_vma *vma,
 	if (flags & EXEC_OBJECT_WRITE)
 		reservation_object_add_excl_fence(resv, &rq->fence);
 	else if (reservation_object_reserve_shared(resv) == 0)
-		reservation_object_add_shared_fence(resv, &rq->fence);
+		reservation_object_add_shared_fence(resv, &rq->fence, false);
 	reservation_object_unlock(resv);
 }
 
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index 95d25dbfde2b..fdee77627a0f 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -672,7 +672,8 @@ void msm_gem_move_to_active(struct drm_gem_object *obj,
 	if (exclusive)
 		reservation_object_add_excl_fence(msm_obj->resv, fence);
 	else
-		reservation_object_add_shared_fence(msm_obj->resv, fence);
+		reservation_object_add_shared_fence(msm_obj->resv, fence,
+						    false);
 	list_del_init(&msm_obj->mm_list);
 	list_add_tail(&msm_obj->mm_list, &gpu->active_list);
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 7214022dfb91..741faead4c7f 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -1657,7 +1657,7 @@ nouveau_bo_fence(struct nouveau_bo *nvbo, struct nouveau_fence *fence, bool excl
 	if (exclusive)
 		reservation_object_add_excl_fence(resv, &fence->base);
 	else if (fence)
-		reservation_object_add_shared_fence(resv, &fence->base);
+		reservation_object_add_shared_fence(resv, &fence->base, false);
 }
 
 struct ttm_bo_driver nouveau_bo_driver = {
diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c
index 7cb214577275..e68ab1efd809 100644
--- a/drivers/gpu/drm/qxl/qxl_release.c
+++ b/drivers/gpu/drm/qxl/qxl_release.c
@@ -466,7 +466,8 @@ void qxl_release_fence_buffer_objects(struct qxl_release *release)
 		bo = entry->bo;
 		qbo = to_qxl_bo(bo);
 
-		reservation_object_add_shared_fence(bo->resv, &release->base);
+		reservation_object_add_shared_fence(bo->resv, &release->base,
+						    false);
 		ttm_bo_add_to_lru(bo);
 		reservation_object_unlock(bo->resv);
 	}
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
index ba2fd295697f..11114ffb7495 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -870,7 +870,7 @@ void radeon_bo_fence(struct radeon_bo *bo, struct radeon_fence *fence,
 	struct reservation_object *resv = bo->tbo.resv;
 
 	if (shared)
-		reservation_object_add_shared_fence(resv, &fence->base);
+		reservation_object_add_shared_fence(resv, &fence->base, false);
 	else
 		reservation_object_add_excl_fence(resv, &fence->base);
 }
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 820d97d3e8b9..9e98d8977cd0 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -794,7 +794,7 @@ static int ttm_bo_add_move_fence(struct ttm_buffer_object *bo,
 	spin_unlock(&man->move_lock);
 
 	if (fence) {
-		reservation_object_add_shared_fence(bo->resv, fence);
+		reservation_object_add_shared_fence(bo->resv, fence, true);
 
 		ret = reservation_object_reserve_shared(bo->resv);
 		if (unlikely(ret))
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
index e73ae0d22897..f82a460d106b 100644
--- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
+++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
@@ -202,7 +202,8 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket,
 	list_for_each_entry(entry, list, head) {
 		bo = entry->bo;
 		if (entry->shared)
-			reservation_object_add_shared_fence(bo->resv, fence);
+			reservation_object_add_shared_fence(bo->resv, fence,
+							    true);
 		else
 			reservation_object_add_excl_fence(bo->resv, fence);
 		ttm_bo_add_to_lru(bo);
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 7910b9acedd6..9db15a76c25f 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -536,7 +536,8 @@ vc4_update_bo_seqnos(struct vc4_exec_info *exec, uint64_t seqno)
 		bo = to_vc4_bo(&exec->bo[i]->base);
 		bo->seqno = seqno;
 
-		reservation_object_add_shared_fence(bo->resv, exec->fence);
+		reservation_object_add_shared_fence(bo->resv, exec->fence,
+						    false);
 	}
 
 	list_for_each_entry(bo, &exec->unref_list, unref_head) {
diff --git a/drivers/gpu/drm/vgem/vgem_fence.c b/drivers/gpu/drm/vgem/vgem_fence.c
index b28876c222b4..5d8b47056be6 100644
--- a/drivers/gpu/drm/vgem/vgem_fence.c
+++ b/drivers/gpu/drm/vgem/vgem_fence.c
@@ -194,7 +194,7 @@ int vgem_fence_attach_ioctl(struct drm_device *dev,
 	if (arg->flags & VGEM_FENCE_WRITE)
 		reservation_object_add_excl_fence(resv, fence);
 	else if ((ret = reservation_object_reserve_shared(resv)) == 0)
-		reservation_object_add_shared_fence(resv, fence);
+		reservation_object_add_shared_fence(resv, fence, false);
 	reservation_object_unlock(resv);
 
 	/* Record the fence in our idr for later signaling */
diff --git a/include/linux/reservation.h b/include/linux/reservation.h
index 8a3298574bf5..d73bf025df4b 100644
--- a/include/linux/reservation.h
+++ b/include/linux/reservation.h
@@ -59,7 +59,7 @@ extern const char reservation_seqcount_string[];
 struct reservation_object_list {
 	struct rcu_head rcu;
 	u32 shared_count, shared_max;
-	struct dma_fence __rcu *shared[];
+	void * __rcu *shared[];
 };
 
 /**
@@ -80,6 +80,8 @@ struct reservation_object {
 #define reservation_object_held(obj) lockdep_is_held(&(obj)->lock.base)
 #define reservation_object_assert_held(obj) \
 	lockdep_assert_held(&(obj)->lock.base)
+#define reservation_object_shared_fence(e) ((struct dma_fence *)((unsigned long)(e) & ~1ul))
+#define reservation_object_shared_is_write(e) ((unsigned long)(e) & 1)
 
 /**
  * reservation_object_init - initialize a reservation object
@@ -116,8 +118,11 @@ reservation_object_fini(struct reservation_object *obj)
 
 	fobj = rcu_dereference_protected(obj->fence, 1);
 	if (fobj) {
-		for (i = 0; i < fobj->shared_count; ++i)
-			dma_fence_put(rcu_dereference_protected(fobj->shared[i], 1));
+		for (i = 0; i < fobj->shared_count; ++i) {
+			void *e = rcu_dereference_protected(fobj->shared[i], 1);
+
+			dma_fence_put(reservation_object_shared_fence(e));
+		}
 
 		kfree(fobj);
 	}
@@ -155,8 +160,10 @@ reservation_object_get_shared_fence(struct reservation_object *obj,
 				    struct reservation_object_list *list,
 				    unsigned int idx)
 {
-	return rcu_dereference_protected(list->shared[idx],
-					 reservation_object_held(obj));
+	void *e = rcu_dereference_protected(list->shared[idx],
+					    reservation_object_held(obj));
+
+	return reservation_object_shared_fence(e);
 }
 
 /**
@@ -282,7 +289,8 @@ reservation_object_get_excl_rcu(struct reservation_object *obj)
 
 int reservation_object_reserve_shared(struct reservation_object *obj);
 void reservation_object_add_shared_fence(struct reservation_object *obj,
-					 struct dma_fence *fence);
+					 struct dma_fence *fence,
+					 bool as_write);
 
 void reservation_object_add_excl_fence(struct reservation_object *obj,
 				       struct dma_fence *fence);
-- 
2.14.1


* [PATCH 4/6] dma-buf: add writes_only flag to reservation_object_get_fences_rcu
From: Christian König @ 2018-08-09 11:37 UTC
  To: intel-gfx, amd-gfx, dri-devel

This allows us to retrieve only the fences of write operations.
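
A consumer that only cares about writers, like the pageflip path in patch
5, would then do something like this (sketch, error handling trimmed):

	struct dma_fence **fences;
	unsigned int count;
	int r;

	r = reservation_object_get_fences_rcu(resv, true /* writes_only */,
					      NULL, &count, &fences);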

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/reservation.c                | 19 +++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_display.c  |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c      |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c       |  2 +-
 drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c |  3 ++-
 drivers/gpu/drm/i915/i915_gem.c              |  4 ++--
 drivers/gpu/drm/i915/i915_request.c          |  2 +-
 drivers/gpu/drm/i915/i915_sw_fence.c         |  2 +-
 include/linux/reservation.h                  |  1 +
 9 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/drivers/dma-buf/reservation.c b/drivers/dma-buf/reservation.c
index 0f98384b86d4..f5dc17aa5efb 100644
--- a/drivers/dma-buf/reservation.c
+++ b/drivers/dma-buf/reservation.c
@@ -316,6 +316,7 @@ EXPORT_SYMBOL(reservation_object_copy_fences);
  * reservation_object_get_fences_rcu - Get an object's shared and exclusive
  * fences without update side lock held
  * @obj: the reservation object
+ * @writes_only: if true, only fences of write operations are returned
  * @pfence_excl: the returned exclusive fence (or NULL)
  * @pshared_count: the number of shared fences returned
  * @pshared: the array of shared fence ptrs returned (array is krealloc'd to
@@ -326,6 +327,7 @@ EXPORT_SYMBOL(reservation_object_copy_fences);
  * shared fences as well. Returns either zero or -ENOMEM.
  */
 int reservation_object_get_fences_rcu(struct reservation_object *obj,
+				      bool writes_only,
 				      struct dma_fence **pfence_excl,
 				      unsigned *pshared_count,
 				      struct dma_fence ***pshared)
@@ -358,6 +360,7 @@ int reservation_object_get_fences_rcu(struct reservation_object *obj,
 
 		if (sz) {
 			struct dma_fence **nshared;
+			unsigned int j;
 
 			nshared = krealloc(shared, sz,
 					   GFP_NOWAIT | __GFP_NOWARN);
@@ -374,13 +377,20 @@ int reservation_object_get_fences_rcu(struct reservation_object *obj,
 			}
 			shared = nshared;
 			shared_count = fobj ? fobj->shared_count : 0;
-			for (i = 0; i < shared_count; ++i) {
-				void *e = rcu_dereference(fobj->shared[i]);
+			for (i = 0, j = 0; j < shared_count; ++j) {
+				void *e = rcu_dereference(fobj->shared[j]);
+
+				if (writes_only &&
+				    !reservation_object_shared_is_write(e))
+					continue;
 
 				shared[i] = reservation_object_shared_fence(e);
 				if (!dma_fence_get_rcu(shared[i]))
-					break;
+					goto drop_references;
+
+				i++;
 			}
+			shared_count = i;
 
 			if (!pfence_excl && fence_excl) {
 				shared[i] = fence_excl;
@@ -390,7 +400,8 @@ int reservation_object_get_fences_rcu(struct reservation_object *obj,
 			}
 		}
 
-		if (i != shared_count || read_seqcount_retry(&obj->seq, seq)) {
+		if (read_seqcount_retry(&obj->seq, seq)) {
+drop_references:
 			while (i--)
 				dma_fence_put(shared[i]);
 			dma_fence_put(fence_excl);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
index 7d6a36bca9dd..a6d2ba4b4d2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
@@ -200,7 +200,8 @@ int amdgpu_display_crtc_page_flip_target(struct drm_crtc *crtc,
 		goto unpin;
 	}
 
-	r = reservation_object_get_fences_rcu(new_abo->tbo.resv, &work->excl,
+	r = reservation_object_get_fences_rcu(new_abo->tbo.resv, false,
+					      &work->excl,
 					      &work->shared_count,
 					      &work->shared);
 	if (unlikely(r != 0)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
index 3a072a7a39f0..82de4eb38dff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
@@ -112,7 +112,8 @@ void amdgpu_pasid_free_delayed(struct reservation_object *resv,
 	unsigned count;
 	int r;
 
-	r = reservation_object_get_fences_rcu(resv, NULL, &count, &fences);
+	r = reservation_object_get_fences_rcu(resv, false, NULL,
+					      &count, &fences);
 	if (r)
 		goto fallback;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 015613b4f98b..e1bb6f13de41 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1841,7 +1841,7 @@ static void amdgpu_vm_prt_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 	unsigned i, shared_count;
 	int r;
 
-	r = reservation_object_get_fences_rcu(resv, &excl,
+	r = reservation_object_get_fences_rcu(resv, false, &excl,
 					      &shared_count, &shared);
 	if (r) {
 		/* Not enough memory to grab the fence list, as last resort
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
index 5f4a872a88dd..fda8be5be574 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
@@ -188,7 +188,8 @@ static int submit_fence_sync(struct etnaviv_gem_submit *submit)
 			continue;
 
 		if (bo->flags & ETNA_SUBMIT_BO_WRITE) {
-			ret = reservation_object_get_fences_rcu(robj, &bo->excl,
+			ret = reservation_object_get_fences_rcu(robj, false,
+								&bo->excl,
 								&bo->nr_shared,
 								&bo->shared);
 			if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 0415420e1cfd..85013933e827 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -517,7 +517,7 @@ i915_gem_object_wait_reservation(struct reservation_object *resv,
 		unsigned int count, i;
 		int ret;
 
-		ret = reservation_object_get_fences_rcu(resv,
+		ret = reservation_object_get_fences_rcu(resv, false,
 							&excl, &count, &shared);
 		if (ret)
 			return ret;
@@ -619,7 +619,7 @@ i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
 		unsigned int count, i;
 		int ret;
 
-		ret = reservation_object_get_fences_rcu(obj->resv,
+		ret = reservation_object_get_fences_rcu(obj->resv, false,
 							&excl, &count, &shared);
 		if (ret)
 			return ret;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index f187250e60c6..8b5d87353f54 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -981,7 +981,7 @@ i915_request_await_object(struct i915_request *to,
 		struct dma_fence **shared;
 		unsigned int count, i;
 
-		ret = reservation_object_get_fences_rcu(obj->resv,
+		ret = reservation_object_get_fences_rcu(obj->resv, false,
 							&excl, &count, &shared);
 		if (ret)
 			return ret;
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 1de5173e53a2..1da2c5a99e47 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -499,7 +499,7 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
 		struct dma_fence **shared;
 		unsigned int count, i;
 
-		ret = reservation_object_get_fences_rcu(resv,
+		ret = reservation_object_get_fences_rcu(resv, false,
 							&excl, &count, &shared);
 		if (ret)
 			return ret;
diff --git a/include/linux/reservation.h b/include/linux/reservation.h
index d73bf025df4b..c6defd955aa3 100644
--- a/include/linux/reservation.h
+++ b/include/linux/reservation.h
@@ -296,6 +296,7 @@ void reservation_object_add_excl_fence(struct reservation_object *obj,
 				       struct dma_fence *fence);
 
 int reservation_object_get_fences_rcu(struct reservation_object *obj,
+				      bool writes_only,
 				      struct dma_fence **pfence_excl,
 				      unsigned *pshared_count,
 				      struct dma_fence ***pshared);
-- 
2.14.1


* [PATCH 5/6] drm/i915: wait for write fences before pflip
From: Christian König @ 2018-08-09 11:37 UTC
  To: intel-gfx, amd-gfx, dri-devel

Wait for all write fences, not just the exclusive one, before page flipping.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/i915/intel_display.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 2c16c3a3cdea..154dc86062a3 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -48,6 +48,7 @@
 #include <drm/drm_rect.h>
 #include <linux/dma_remapping.h>
 #include <linux/reservation.h>
+#include <linux/dma-fence-array.h>
 
 /* Primary plane formats for gen <= 3 */
 static const uint32_t i8xx_primary_formats[] = {
@@ -13022,7 +13023,8 @@ intel_prepare_plane_fb(struct drm_plane *plane,
 	intel_fb_obj_flush(obj, ORIGIN_DIRTYFB);
 
 	if (!new_state->fence) { /* implicit fencing */
-		struct dma_fence *fence;
+		struct dma_fence *fence, **fences;
+		unsigned count;
 
 		ret = i915_sw_fence_await_reservation(&intel_state->commit_ready,
 						      obj->resv, NULL,
@@ -13031,7 +13033,29 @@ intel_prepare_plane_fb(struct drm_plane *plane,
 		if (ret < 0)
 			return ret;
 
-		fence = reservation_object_get_excl_rcu(obj->resv);
+		ret = reservation_object_get_fences_rcu(obj->resv, true, NULL,
+							&count, &fences);
+		if (ret)
+			return ret;
+
+		if (count == 0) {
+			fence = NULL;
+		} else if (count == 1) {
+			fence = fences[0];
+			kfree(fences);
+		} else {
+			uint64_t context = dma_fence_context_alloc(1);
+			struct dma_fence_array *array;
+
+			array = dma_fence_array_create(count, fences, context,
+						       1, false);
+			if (!array) {
+				kfree(fences);
+				return -ENOMEM;
+			}
+			fence = &array->base;
+		}
+
 		if (fence) {
 			add_rps_boost_after_vblank(new_state->crtc, fence);
 			dma_fence_put(fence);
-- 
2.14.1


* [PATCH 6/6] drm/amdgpu: remove exclusive fence workaround
From: Christian König @ 2018-08-09 11:37 UTC
  To: intel-gfx, amd-gfx, dri-devel

All amdgpu fences are now flagged as potential writers in the reservation
object, so the exclusive fence workaround for prime-shared BOs is no longer
needed.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c     |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c   | 24 ------------------------
 5 files changed, 4 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
index d472a2c8399f..eee17ea7cf8b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
@@ -115,7 +115,7 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp,
 		entry->priority = min(info[i].bo_priority,
 				      AMDGPU_BO_LIST_MAX_PRIORITY);
 		entry->tv.bo = &entry->robj->tbo;
-		entry->tv.shared = !entry->robj->prime_shared_count;
+		entry->tv.shared = true;
 
 		if (entry->robj->preferred_domains == AMDGPU_GEM_DOMAIN_GDS)
 			list->gds_obj = entry->robj;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 71792d820ae0..b52626754598 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -716,7 +716,8 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void *data,
 		break;
 	}
 	case AMDGPU_GEM_OP_SET_PLACEMENT:
-		if (robj->prime_shared_count && (args->value & AMDGPU_GEM_DOMAIN_VRAM)) {
+		if (robj->gem_base.import_attach &&
+		    args->value & AMDGPU_GEM_DOMAIN_VRAM) {
 			r = -EINVAL;
 			amdgpu_bo_unreserve(robj);
 			break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 303143b89275..8f2346da111f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -883,7 +883,7 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
 		return -EINVAL;
 
 	/* A shared bo cannot be migrated to VRAM */
-	if (bo->prime_shared_count) {
+	if (bo->gem_base.import_attach) {
 		if (domain & AMDGPU_GEM_DOMAIN_GTT)
 			domain = AMDGPU_GEM_DOMAIN_GTT;
 		else
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index 18945dd6982d..0a3635c0def3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -88,7 +88,6 @@ struct amdgpu_bo {
 	u64				metadata_flags;
 	void				*metadata;
 	u32				metadata_size;
-	unsigned			prime_shared_count;
 	/* list of all virtual address to which this bo is associated to */
 	struct list_head		va;
 	/* Constant after initialization */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
index 2686297e34e0..bb8de0566bfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
@@ -178,8 +178,6 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev,
 	bo->tbo.ttm->sg = sg;
 	bo->allowed_domains = AMDGPU_GEM_DOMAIN_GTT;
 	bo->preferred_domains = AMDGPU_GEM_DOMAIN_GTT;
-	if (attach->dmabuf->ops != &amdgpu_dmabuf_ops)
-		bo->prime_shared_count = 1;
 
 	ww_mutex_unlock(&resv->lock);
 	return &bo->gem_base;
@@ -206,7 +204,6 @@ static int amdgpu_gem_map_attach(struct dma_buf *dma_buf,
 {
 	struct drm_gem_object *obj = dma_buf->priv;
 	struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
-	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
 	long r;
 
 	r = drm_gem_map_attach(dma_buf, attach);
@@ -217,29 +214,11 @@ static int amdgpu_gem_map_attach(struct dma_buf *dma_buf,
 	if (unlikely(r != 0))
 		goto error_detach;
 
-
-	if (attach->dev->driver != adev->dev->driver) {
-		/*
-		 * Wait for all shared fences to complete before we switch to future
-		 * use of exclusive fence on this prime shared bo.
-		 */
-		r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
-							true, false,
-							MAX_SCHEDULE_TIMEOUT);
-		if (unlikely(r < 0)) {
-			DRM_DEBUG_PRIME("Fence wait failed: %li\n", r);
-			goto error_unreserve;
-		}
-	}
-
 	/* pin buffer into GTT */
 	r = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
 	if (r)
 		goto error_unreserve;
 
-	if (attach->dev->driver != adev->dev->driver)
-		bo->prime_shared_count++;
-
 error_unreserve:
 	amdgpu_bo_unreserve(bo);
 
@@ -262,7 +241,6 @@ static void amdgpu_gem_map_detach(struct dma_buf *dma_buf,
 {
 	struct drm_gem_object *obj = dma_buf->priv;
 	struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
-	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
 	int ret = 0;
 
 	ret = amdgpu_bo_reserve(bo, true);
@@ -270,8 +248,6 @@ static void amdgpu_gem_map_detach(struct dma_buf *dma_buf,
 		goto error;
 
 	amdgpu_bo_unpin(bo);
-	if (attach->dev->driver != adev->dev->driver && bo->prime_shared_count)
-		bo->prime_shared_count--;
 	amdgpu_bo_unreserve(bo);
 
 error:
-- 
2.14.1


* ✗ Fi.CI.BAT: failure for series starting with [1/6] dma-buf: remove shared fence staging in reservation object
From: Patchwork @ 2018-08-09 11:53 UTC
  To: Christian König; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/6] dma-buf: remove shared fence staging in reservation object
URL   : https://patchwork.freedesktop.org/series/47948/
State : failure

== Summary ==

Applying: dma-buf: remove shared fence staging in reservation object
Using index info to reconstruct a base tree...
M	drivers/dma-buf/reservation.c
Falling back to patching base and 3-way merge...
Auto-merging drivers/dma-buf/reservation.c
CONFLICT (content): Merge conflict in drivers/dma-buf/reservation.c
error: Failed to merge in the changes.
Patch failed at 0001 dma-buf: remove shared fence staging in reservation object
Use 'git am --show-current-patch' to see the failed patch
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".


* Re: [PATCH 1/6] dma-buf: remove shared fence staging in reservation object
From: Chris Wilson @ 2018-08-09 12:08 UTC
  To: Christian König, amd-gfx, dri-devel, intel-gfx

Quoting Christian König (2018-08-09 12:37:08)
>  void reservation_object_add_shared_fence(struct reservation_object *obj,
>                                          struct dma_fence *fence)
>  {
> -       struct reservation_object_list *old, *fobj = obj->staged;
> +       struct reservation_object_list *fobj;
> +       unsigned int i;
>  
> -       old = reservation_object_get_list(obj);
> -       obj->staged = NULL;
> +       dma_fence_get(fence);
> +
> +       fobj = reservation_object_get_list(obj);
>  
> -       if (!fobj) {
> -               BUG_ON(old->shared_count >= old->shared_max);
> -               reservation_object_add_shared_inplace(obj, old, fence);
> -       } else
> -               reservation_object_add_shared_replace(obj, old, fobj, fence);
> +       preempt_disable();
> +       write_seqcount_begin(&obj->seq);
> +
> +       for (i = 0; i < fobj->shared_count; ++i) {
> +               struct dma_fence *old_fence;
> +
> +               old_fence = rcu_dereference_protected(fobj->shared[i],
> +                                                     reservation_object_held(obj));
> +               if (old_fence->context == fence->context ||
> +                   dma_fence_is_signaled(old_fence)) {

Are you happy with the possibility that the shared[] may contain two
fences belonging to the same context? That was a sticking point earlier.

> +                       /* memory barrier is added by write_seqcount_begin */
> +                       RCU_INIT_POINTER(fobj->shared[i], fence);
> +                       write_seqcount_end(&obj->seq);
> +                       preempt_enable();
> +                       dma_fence_put(old_fence);

You can rearrange this to have a single exit.

	for (i = 0; i < fobj->shared_count; ++i) {
		struct dma_fence *old_fence;

		old_fence = rcu_dereference_protected(fobj->shared[i],
						      reservation_object_held(obj));
		if (old_fence->context == fence->context ||
		    dma_fence_is_signaled(old_fence)) {
			dma_fence_put(old_fence);
			goto replace;
		}
	}
	fobj->shared_count++;
replace:
	/*
	 * memory barrier is added by write_seqcount_begin,
	 * fobj->shared_count is protected by this lock too
	 */
	RCU_INIT_POINTER(fobj->shared[i], fence);
	write_seqcount_end(&obj->seq);
	preempt_enable();
}
-Chris

* Re: [PATCH 1/6] dma-buf: remove shared fence staging in reservation object
  2018-08-09 12:08     ` Chris Wilson
@ 2018-08-09 12:22       ` Christian König
  0 siblings, 0 replies; 25+ messages in thread
From: Christian König @ 2018-08-09 12:22 UTC (permalink / raw)
  To: Chris Wilson, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	intel-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 09.08.2018 at 14:08, Chris Wilson wrote:
> Quoting Christian König (2018-08-09 12:37:08)
>>   void reservation_object_add_shared_fence(struct reservation_object *obj,
>>                                           struct dma_fence *fence)
>>   {
>> -       struct reservation_object_list *old, *fobj = obj->staged;
>> +       struct reservation_object_list *fobj;
>> +       unsigned int i;
>>   
>> -       old = reservation_object_get_list(obj);
>> -       obj->staged = NULL;
>> +       dma_fence_get(fence);
>> +
>> +       fobj = reservation_object_get_list(obj);
>>   
>> -       if (!fobj) {
>> -               BUG_ON(old->shared_count >= old->shared_max);
>> -               reservation_object_add_shared_inplace(obj, old, fence);
>> -       } else
>> -               reservation_object_add_shared_replace(obj, old, fobj, fence);
>> +       preempt_disable();
>> +       write_seqcount_begin(&obj->seq);
>> +
>> +       for (i = 0; i < fobj->shared_count; ++i) {
>> +               struct dma_fence *old_fence;
>> +
>> +               old_fence = rcu_dereference_protected(fobj->shared[i],
>> +                                                     reservation_object_held(obj));
>> +               if (old_fence->context == fence->context ||
>> +                   dma_fence_is_signaled(old_fence)) {
> Are you happy with the possibility that the shared[] may contain two
> fences belonging to the same context? That was a sticking point earlier.

Yeah, that is fixed by now. I removed the dependency on this in our
VM handling code quite a while ago.

>
>> +                       /* memory barrier is added by write_seqcount_begin */
>> +                       RCU_INIT_POINTER(fobj->shared[i], fence);
>> +                       write_seqcount_end(&obj->seq);
>> +                       preempt_enable();
>> +                       dma_fence_put(old_fence);
> You can rearrange this to have a single exit.

Good point, going to rearrange the code.

Christian.

>
>         for (i = 0; i < fobj->shared_count; ++i) {
>                 struct dma_fence *old_fence;
>
>                 old_fence = rcu_dereference_protected(fobj->shared[i],
>                                                       reservation_object_held(obj));
>                 if (old_fence->context == fence->context ||
>                     dma_fence_is_signaled(old_fence)) {
> 		   	dma_fence_put(old_fence);
> 			goto replace;
> 		}
> 	}
> 	fobj->shared_count++;
> replace:
>         /*
>          * memory barrier is added by write_seqcount_begin,
>          * fobj->shared_count is protected by this lock too
>          */
> 	RCU_INIT_POINTER(fobj->shared[i], fence);
> 	write_seqcount_end(&obj->seq);
> 	preempt_enable();
> }
> -Chris


* Re: RFC: Add write flag to reservation object fences
  2018-08-09 11:37 RFC: Add write flag to reservation object fences Christian König
                   ` (3 preceding siblings ...)
  2018-08-09 11:53 ` ✗ Fi.CI.BAT: failure for series starting with [1/6] dma-buf: remove shared fence staging in reservation object Patchwork
@ 2018-08-09 13:38 ` Daniel Vetter
  2018-08-09 13:58   ` Christian König
  4 siblings, 1 reply; 25+ messages in thread
From: Daniel Vetter @ 2018-08-09 13:38 UTC (permalink / raw)
  To: Christian König; +Cc: intel-gfx, dri-devel, amd-gfx

On Thu, Aug 09, 2018 at 01:37:07PM +0200, Christian König wrote:
> Hi everyone,
> 
> This set of patches tries to improve read after write hazard handling
> for reservation objects.
> 
> It allows us to specify for each shared fence if it represents a write
> operation.
> 
> Based on this the i915 driver is modified to always wait for all writes
> before pageflip and the previously used workaround is removed from
> amdgpu.

Hm, I thought after the entire discussion we agreed again that it's _not_
the write hazard we want to track, but whether there's an exclusive fence
that must be observed for implicit buffer sync. That's why it's called the
exclusive fence, not the write fence!

If you want multiple of those, I guess we could add those, but that
doesn't really make sense - how exactly did you end up with multiple
exclusive fences in the first place?

i915 (and fwiw, any other driver) does _not_ want to observe all write
fences attached to a dma-buf. We want to _only_ observe the single
exclusive fence used for implicit buffer sync, which might or might not
exist. Otherwise the entire point of having explicit sync and explicit
fences in the atomic ioctl goes out of the window, and the use case of 2
draw/flip loops using a single buffer is defeated.
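
To make that contract concrete, here is a minimal sketch of the
consumer side. It is only an illustration of the rule above, not
i915's actual code, and it assumes the existing helpers
reservation_object_get_excl_rcu() and drm_atomic_set_fence_for_plane():

#include <drm/drm_atomic.h>
#include <linux/reservation.h>

static void plane_attach_implicit_fence(struct drm_plane_state *state,
					struct reservation_object *resv)
{
	struct dma_fence *fence;

	/* An explicit in-fence from the atomic ioctl takes precedence. */
	if (state->fence)
		return;

	/* May be NULL if nobody attached an exclusive fence. */
	fence = reservation_object_get_excl_rcu(resv);

	/* Takes over the reference (and drops it if unused). */
	drm_atomic_set_fence_for_plane(state, fence);
}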

Again: How exactly you construct that exclusive fence, and how exactly
the kernel and userspace cooperate to figure out when to set the exclusive
fences, is 100% up to amdgpu. If you do explicit sync by default, and only
switch to implicit sync (and setting the exclusive fence) as needed,
that's perfectly fine.  No need at all to leak that into core code and
confuse everyone that there's multiple exclusive fences they need to
somehow observe.

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: RFC: Add write flag to reservation object fences
  2018-08-09 13:38 ` RFC: Add write flag to reservation object fences Daniel Vetter
@ 2018-08-09 13:58   ` Christian König
       [not found]     ` <154cc05d-f2e1-695a-5bd3-a2fd5d40a548-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 25+ messages in thread
From: Christian König @ 2018-08-09 13:58 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx, dri-devel, amd-gfx

On 09.08.2018 at 15:38, Daniel Vetter wrote:
> On Thu, Aug 09, 2018 at 01:37:07PM +0200, Christian König wrote:
>> Hi everyone,
>>
>> This set of patches tries to improve read after write hazard handling
>> for reservation objects.
>>
>> It allows us to specify for each shared fence if it represents a write
>> operation.
>>
>> Based on this the i915 driver is modified to always wait for all writes
>> before pageflip and the previously used workaround is removed from
>> amdgpu.
> Hm, I thought after the entire discussions we agreed again that it's _not_
> the write hazard we want to track, but whether there's an exclusive fence
> that must be observed for implicit buffer sync. That's why it's called the
> exclusive fence, not the write fence!
>
> If you want multiple of those, I guess we could add those, but that
> doesn't really make sense - how exactly did you end up with multiple
> exclusive fences in the first place?

Maybe you misunderstood me: we don't have multiple exclusive fences.

What we have are multiple writers which write to the BO. In other words,
multiple engines which compose the content of the BO at the same time.

For page flipping we need to wait for all of them to complete.

> i915 (and fwiw, any other driver) does _not_ want to observe all write
> fences attached to a dma-buf. We want to _only_ observe the single
> exclusive fence used for implicit buffer sync, which might or might not
> exist. Otherwise the entire point of having explicit sync and explicit
> fences in the atomic ioctl is out of the window and the use case of 2
> draw/flip loops using a single buffer is defeated.

What do you mean by that?

Even for the atomic IOCTL with implicit fencing I strongly suspect that 
we can wait for multiple fences before doing the flip. Otherwise it 
would not really be useful to us.
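
Waiting for everything is already expressible with the existing helper;
a sketch (wait_all = true also waits for all the shared fences):

#include <linux/reservation.h>
#include <linux/sched.h>

/* Sketch: block until every fence on the reservation object signaled. */
static long wait_for_all_fences(struct reservation_object *resv)
{
	return reservation_object_wait_timeout_rcu(resv, true /* wait_all */,
						   true /* interruptible */,
						   MAX_SCHEDULE_TIMEOUT);
}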

> Again: How exactly you construct that exclusive fences, and how exactly
> the kernel and userspace cooperate to figure out when to set the exclusive
> fences, is 100% up to amdgpu. If you do explicit sync by default, and only
> switch to implicit sync (and setting the exclusive fence) as needed,
> that's perfectly fine.  No need at all to leak that into core code and
> confuse everyone that there's multiple exclusive fences they need to
> somehow observe.

I simply never have a single exclusive fence provided by userspace.

I always have multiple command submissions accessing the buffer at the 
same time.

See, to me the exclusive fence in the reservation object is not even
remotely related to implicit or explicit synchronization.

Regards,
Christian.

>
> Cheers, Daniel


* Re: RFC: Add write flag to reservation object fences
       [not found]     ` <154cc05d-f2e1-695a-5bd3-a2fd5d40a548-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2018-08-09 14:22       ` Daniel Vetter
       [not found]         ` <CAKMK7uHgwFr1P3s=TcZ_NMuexCwoO1FkEn9q3CaYaGyBKviB-g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 25+ messages in thread
From: Daniel Vetter @ 2018-08-09 14:22 UTC (permalink / raw)
  To: Christian König; +Cc: intel-gfx, dri-devel, amd-gfx list

On Thu, Aug 9, 2018 at 3:58 PM, Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
> Am 09.08.2018 um 15:38 schrieb Daniel Vetter:
>>
>> On Thu, Aug 09, 2018 at 01:37:07PM +0200, Christian König wrote:
>>>
>>> Hi everyone,
>>>
>>> This set of patches tries to improve read after write hazard handling
>>> for reservation objects.
>>>
>>> It allows us to specify for each shared fence if it represents a write
>>> operation.
>>>
>>> Based on this the i915 driver is modified to always wait for all writes
>>> before pageflip and the previously used workaround is removed from
>>> amdgpu.
>>
>> Hm, I thought after the entire discussions we agreed again that it's _not_
>> the write hazard we want to track, but whether there's an exclusive fence
>> that must be observed for implicit buffer sync. That's why it's called the
>> exclusive fence, not the write fence!
>>
>> If you want multiple of those, I guess we could add those, but that
>> doesn't really make sense - how exactly did you end up with multiple
>> exclusive fences in the first place?
>
>
> Maybe you misunderstood me, we don't have multiple exclusive fences.
>
> What we have are multiple writers which write to the BO. In other words
> multiple engines which compose the content of the BO at the same time.
>
> For page flipping we need to wait for all of them to completed.
>
>> i915 (and fwiw, any other driver) does _not_ want to observe all write
>> fences attached to a dma-buf. We want to _only_ observe the single
>> exclusive fence used for implicit buffer sync, which might or might not
>> exist. Otherwise the entire point of having explicit sync and explicit
>> fences in the atomic ioctl is out of the window and the use case of 2
>> draw/flip loops using a single buffer is defeated.
>
>
> What do you mean with that?
>
> Even for the atomic IOCTL with implicit fencing I strongly suspect that we
> can wait for multiple fences before doing the flip. Otherwise it would not
> really be useful to us.
>
>> Again: How exactly you construct that exclusive fences, and how exactly
>> the kernel and userspace cooperate to figure out when to set the exclusive
>> fences, is 100% up to amdgpu. If you do explicit sync by default, and only
>> switch to implicit sync (and setting the exclusive fence) as needed,
>> that's perfectly fine.  No need at all to leak that into core code and
>> confuse everyone that there's multiple exclusive fences they need to
>> somehow observe.
>
>
> I simply never have a single exclusive fence provided by userspace.
>
> I always have multiple command submissions accessing the buffer at the same
> time.
>
> See to me the explicit fence in the reservation object is not even remotely
> related to implicit or explicit synchronization.

Hm, I guess that's the confusion then. The only reason we have the
exclusive fence is to implement cross-driver implicit syncing. What
else you do internally in your driver doesn't matter, as long as you
keep up that contract.

And it's intentionally not called write_fence or anything like that,
because that's not what it tracks.

Of course any buffer moves the kernel does also must be tracked in the
exclusive fence, because userspace cannot know about these. So you
might have an exclusive fence set and also an explicit fence passed in
through the atomic ioctl. Aside: Right now all drivers only observe
one or the other, not both, so will break as soon as we start moving
shared buffers around. At least on Android or anything else using
explicit fencing.

So here's my summary, as I understanding things right now:
- for non-shared buffers at least, amdgpu uses explicit fencing, and
hence all fences caused by userspace end up as shared fences, whether
that's writes or reads. This means you end up with possibly multiple
write fences, but never any exclusive fences.
- for non-shared buffers the only exclusive fences amdgpu sets are for
buffer moves done by the kernel.
- amdgpu (kernel + userspace combo here) does not seem to have a
concept/tracking for when a buffer is used with implicit or explicit
fencing. It does however track all writes.
- as a consequence, amdgpu needs to pessimistically assume that all
writes to shared buffer need to obey implicit fencing rules.
- for shared buffers (across process or drivers) implicit fencing does
_not_ allow concurrent writers. That limitation is why people want to
do explicit fencing, and it's the reason why there's only 1 slot for
an exclusive fence. Note I really mean concurrent here: a queue of in-flight
writes by different batches is perfectly fine. But it's a fully
ordered queue of writes.
- but as a consequence of amdgpu's lack of implicit fencing and hence
need to pessimistically assume there's multiple write fences amdgpu
needs to put multiple fences behind the single exclusive slot. This is
a limitation imposed by the amdgpu stack, not something inherent to
how implicit fencing works.
- Chris Wilson's patch implements all this (and afaics with a bit more
coffee, correctly).

If you want to be less pessimistic in amdgpu for shared buffers, you
need to start tracking which shared buffer access need implicit and
which explicit sync. What you can't do is suddenly create more than 1
exclusive fence, that's not how implicit fencing works. Another thing
you cannot do is force everyone else (in non-amdgpu or core code) to
sync against _all_ writes, because that forces implicit syncing. Which
people very much don't want.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

* Re: RFC: Add write flag to reservation object fences
       [not found]         ` <CAKMK7uHgwFr1P3s=TcZ_NMuexCwoO1FkEn9q3CaYaGyBKviB-g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2018-08-09 14:54           ` Christian König
       [not found]             ` <9731d8b0-238b-d64d-9f80-4f549eb230b9-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 25+ messages in thread
From: Christian König @ 2018-08-09 14:54 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx, dri-devel, amd-gfx list

On 09.08.2018 at 16:22, Daniel Vetter wrote:
> On Thu, Aug 9, 2018 at 3:58 PM, Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> Am 09.08.2018 um 15:38 schrieb Daniel Vetter:
>>> On Thu, Aug 09, 2018 at 01:37:07PM +0200, Christian König wrote:
>>> [SNIP]
>> See to me the explicit fence in the reservation object is not even remotely
>> related to implicit or explicit synchronization.
> Hm, I guess that's the confusion then. The only reason we have the
> exclusive fence is to implement cross-driver implicit syncing. What
> else you do internally in your driver doesn't matter, as long as you
> keep up that contract.
>
> And it's intentionally not called write_fence or anything like that,
> because that's not what it tracks.
>
> Of course any buffer moves the kernel does also must be tracked in the
> exclusive fence, because userspace cannot know about these. So you
> might have an exclusive fence set and also an explicit fence passed in
> through the atomic ioctl. Aside: Right now all drivers only observe
> one or the other, not both, so will break as soon as we start moving
> shared buffers around. At least on Android or anything else using
> explicit fencing.

Actually both radeon and nouveau use the approach that shared fences
need to be waited on as well when they don't come from the current driver.

>
> So here's my summary, as I understanding things right now:
> - for non-shared buffers at least, amdgpu uses explicit fencing, and
> hence all fences caused by userspace end up as shared fences, whether
> that's writes or reads. This means you end up with possibly multiple
> write fences, but never any exclusive fences.
> - for non-shared buffers the only exclusive fences amdgpu sets are for
> buffer moves done by the kernel.
> - amgpu (kernel + userspace combo here) does not seem to have a
> concept/tracking for when a buffer is used with implicit or explicit
> fencing. It does however track all writes.

No, that is incorrect. It tracks all accesses to a buffer object in the
form of shared fences; we don't care whether it is a write or not.

What we track as well is which client last used a BO, and as long as the
same client uses the BO we don't add any implicit synchronization.

Only when a BO is used by another client do we have implicit
synchronization for all command submissions. This behavior can be
disabled with a flag during BO creation.
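
As a rough sketch of that rule (simplified; "owner" stands in for the
per-client cookie amdgpu stores with each fence, along the lines of what
amdgpu_sync_resv() checks internally):

#include <linux/types.h>

/*
 * Sketch: fences from the same client on the same device are skipped,
 * so that client keeps running concurrently; everything else is synced.
 */
static bool needs_implicit_sync(void *owner, void *fence_owner,
				bool same_device)
{
	if (!same_device)
		return true;	/* foreign device: always sync */

	if (fence_owner == owner)
		return false;	/* same client: stay concurrent */

	return true;		/* another client: implicit sync */
}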

> - as a consequence, amdgpu needs to pessimistically assume that all
> writes to shared buffer need to obey implicit fencing rules.
> - for shared buffers (across process or drivers) implicit fencing does
> _not_ allow concurrent writers. That limitation is why people want to
> do explicit fencing, and it's the reason why there's only 1 slot for
> an exclusive. Note I really mean concurrent here, a queue of in-flight
> writes by different batches is perfectly fine. But it's a fully
> ordered queue of writes.
> - but as a consequence of amdgpu's lack of implicit fencing and hence
> need to pessimistically assume there's multiple write fences amdgpu
> needs to put multiple fences behind the single exclusive slot. This is
> a limitation imposed by by the amdgpu stack, not something inherit to
> how implicit fencing works.
> - Chris Wilson's patch implements all this (and afaics with a bit more
> coffee, correctly).
>
> If you want to be less pessimistic in amdgpu for shared buffers, you
> need to start tracking which shared buffer access need implicit and
> which explicit sync. What you can't do is suddenly create more than 1
> exclusive fence, that's not how implicit fencing works. Another thing
> you cannot do is force everyone else (in non-amdgpu or core code) to
> sync against _all_ writes, because that forces implicit syncing. Which
> people very much don't want.

I also do see the problem that most other hardware doesn't need that 
functionality, because it is driven by a single engine. That's why I 
tried to keep the overhead as low as possible.

But at least for amdgpu (and I strongly suspect for nouveau as well) it 
is absolutely vital in a number of cases to allow concurrent accesses 
from the same client even when the BO is then later used with implicit 
synchronization.

This is also the reason why the current workaround is so problematic for
us. Because as soon as the BO is shared with another (non-amdgpu) device
all command submissions to it will be serialized even when they come 
from the same client.

Would it be an option to extend the concept of the "owner" of the BO
amdgpu uses to other drivers as well?

When you already have explicit synchronization inside your client, but
not between clients (e.g. still using DRI2 or DRI3), this could also be
rather beneficial for others as well.

Regards,
Christian.

> -Daniel


* Re: [PATCH 2/6] dma-buf: add reservation object shared fence accessor
       [not found]     ` <20180809113713.48024-3-christian.koenig-5C7GfCeVMHo@public.gmane.org>
@ 2018-08-10  7:41       ` Huang Rui
  0 siblings, 0 replies; 25+ messages in thread
From: Huang Rui @ 2018-08-10  7:41 UTC (permalink / raw)
  To: Christian König
  Cc: intel-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Thu, Aug 09, 2018 at 01:37:09PM +0200, Christian König wrote:
> Add a helper to access the shared fences in an reservation object.
> 
> Signed-off-by: Christian König <christian.koenig@amd.com>

Reviewed-by: Huang Rui <ray.huang@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  7 ++-----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c         |  3 +--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c          |  4 ++--
>  drivers/gpu/drm/msm/msm_gem.c                    |  4 ++--
>  drivers/gpu/drm/nouveau/nouveau_fence.c          |  3 +--
>  drivers/gpu/drm/radeon/radeon_sync.c             |  3 +--
>  drivers/gpu/drm/ttm/ttm_bo.c                     |  4 +---
>  include/linux/reservation.h                      | 19 +++++++++++++++++++
>  8 files changed, 29 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index fa38a960ce00..989932234160 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -238,9 +238,7 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
>  	for (i = 0; i < shared_count; ++i) {
>  		struct dma_fence *f;
>  
> -		f = rcu_dereference_protected(fobj->shared[i],
> -					      reservation_object_held(resv));
> -
> +		f = reservation_object_get_shared_fence(resv, fobj, i);
>  		if (ef) {
>  			if (f->context == ef->base.context) {
>  				dma_fence_put(f);
> @@ -273,8 +271,7 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
>  		struct dma_fence *f;
>  		struct amdgpu_amdkfd_fence *efence;
>  
> -		f = rcu_dereference_protected(fobj->shared[i],
> -			reservation_object_held(resv));
> +		f = reservation_object_get_shared_fence(resv, fobj, i);
>  
>  		efence = to_amdgpu_amdkfd_fence(f);
>  		if (efence) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
> index 2d6f5ec77a68..dbfd62ab67e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
> @@ -212,8 +212,7 @@ int amdgpu_sync_resv(struct amdgpu_device *adev,
>  		return r;
>  
>  	for (i = 0; i < flist->shared_count; ++i) {
> -		f = rcu_dereference_protected(flist->shared[i],
> -					      reservation_object_held(resv));
> +		f = reservation_object_get_shared_fence(resv, flist, i);
>  		/* We only want to trigger KFD eviction fences on
>  		 * evict or move jobs. Skip KFD fences otherwise.
>  		 */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index c6611cff64c8..22896a398eab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -1482,8 +1482,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
>  	flist = reservation_object_get_list(bo->resv);
>  	if (flist) {
>  		for (i = 0; i < flist->shared_count; ++i) {
> -			f = rcu_dereference_protected(flist->shared[i],
> -				reservation_object_held(bo->resv));
> +			f = reservation_object_get_shared_fence(bo->resv,
> +								flist, i);
>  			if (amdkfd_fence_check_mm(f, current->mm))
>  				return false;
>  		}
> diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
> index f583bb4222f9..95d25dbfde2b 100644
> --- a/drivers/gpu/drm/msm/msm_gem.c
> +++ b/drivers/gpu/drm/msm/msm_gem.c
> @@ -651,8 +651,8 @@ int msm_gem_sync_object(struct drm_gem_object *obj,
>  		return 0;
>  
>  	for (i = 0; i < fobj->shared_count; i++) {
> -		fence = rcu_dereference_protected(fobj->shared[i],
> -						reservation_object_held(msm_obj->resv));
> +		fence = reservation_object_get_shared_fence(msm_obj->resv,
> +							    fobj, i);
>  		if (fence->context != fctx->context) {
>  			ret = dma_fence_wait(fence, true);
>  			if (ret)
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
> index 412d49bc6e56..3ce921c276c1 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fence.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
> @@ -376,8 +376,7 @@ nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool e
>  		struct nouveau_channel *prev = NULL;
>  		bool must_wait = true;
>  
> -		fence = rcu_dereference_protected(fobj->shared[i],
> -						reservation_object_held(resv));
> +		fence = reservation_object_get_shared_fence(resv, fobj, i);
>  
>  		f = nouveau_local_fence(fence, chan->drm);
>  		if (f) {
> diff --git a/drivers/gpu/drm/radeon/radeon_sync.c b/drivers/gpu/drm/radeon/radeon_sync.c
> index be5d7a38d3aa..bf7f9a648838 100644
> --- a/drivers/gpu/drm/radeon/radeon_sync.c
> +++ b/drivers/gpu/drm/radeon/radeon_sync.c
> @@ -110,8 +110,7 @@ int radeon_sync_resv(struct radeon_device *rdev,
>  		return r;
>  
>  	for (i = 0; i < flist->shared_count; ++i) {
> -		f = rcu_dereference_protected(flist->shared[i],
> -					      reservation_object_held(resv));
> +		f = reservation_object_get_shared_fence(resv, flist, i);
>  		fence = to_radeon_fence(f);
>  		if (fence && fence->rdev == rdev)
>  			radeon_sync_fence(sync, fence);
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 7c484729f9b2..820d97d3e8b9 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -370,9 +370,7 @@ static void ttm_bo_flush_all_fences(struct ttm_buffer_object *bo)
>  		dma_fence_enable_sw_signaling(fence);
>  
>  	for (i = 0; fobj && i < fobj->shared_count; ++i) {
> -		fence = rcu_dereference_protected(fobj->shared[i],
> -					reservation_object_held(bo->resv));
> -
> +		fence = reservation_object_get_shared_fence(bo->resv, fobj, i);
>  		if (!fence->ops->signaled)
>  			dma_fence_enable_sw_signaling(fence);
>  	}
> diff --git a/include/linux/reservation.h b/include/linux/reservation.h
> index 54cf6773a14c..8a3298574bf5 100644
> --- a/include/linux/reservation.h
> +++ b/include/linux/reservation.h
> @@ -140,6 +140,25 @@ reservation_object_get_list(struct reservation_object *obj)
>  					 reservation_object_held(obj));
>  }
>  
> +/**
> + * reservation_object_get_shared_fence - get a fence from a reservation object's
> + * shared fence list.
> + * @obj: the reservation object
> + * @list: the list to get the fence from
> + * @idx: the index in the list to get
> + *
> + * Returns the fence from the shared fence list.  Does NOT take references to
> + * the fence.  Needs to be in RCU context or the obj->lock must be held.
> + */
> +static inline struct dma_fence *
> +reservation_object_get_shared_fence(struct reservation_object *obj,
> +				    struct reservation_object_list *list,
> +				    unsigned int idx)
> +{
> +	return rcu_dereference_protected(list->shared[idx],
> +					 reservation_object_held(obj));
> +}
> +
>  /**
>   * reservation_object_lock - lock the reservation object
>   * @obj: the reservation object
> -- 
> 2.14.1
> 

* Re: [Intel-gfx] RFC: Add write flag to reservation object fences
       [not found]             ` <9731d8b0-238b-d64d-9f80-4f549eb230b9-5C7GfCeVMHo@public.gmane.org>
@ 2018-08-10  7:51               ` Chris Wilson
  2018-08-10  8:24                 ` Christian König
  2018-08-10  8:29               ` Daniel Vetter
  1 sibling, 1 reply; 25+ messages in thread
From: Chris Wilson @ 2018-08-10  7:51 UTC (permalink / raw)
  To: Christian König, Daniel Vetter; +Cc: intel-gfx, amd-gfx list, dri-devel

Quoting Christian König (2018-08-09 15:54:31)
> Am 09.08.2018 um 16:22 schrieb Daniel Vetter:
> > On Thu, Aug 9, 2018 at 3:58 PM, Christian König
> > <ckoenig.leichtzumerken@gmail.com> wrote:
> >> Am 09.08.2018 um 15:38 schrieb Daniel Vetter:
> >>> On Thu, Aug 09, 2018 at 01:37:07PM +0200, Christian König wrote:
> >>> [SNIP]
> >> See to me the explicit fence in the reservation object is not even remotely
> >> related to implicit or explicit synchronization.
> > Hm, I guess that's the confusion then. The only reason we have the
> > exclusive fence is to implement cross-driver implicit syncing. What
> > else you do internally in your driver doesn't matter, as long as you
> > keep up that contract.
> >
> > And it's intentionally not called write_fence or anything like that,
> > because that's not what it tracks.
> >
> > Of course any buffer moves the kernel does also must be tracked in the
> > exclusive fence, because userspace cannot know about these. So you
> > might have an exclusive fence set and also an explicit fence passed in
> > through the atomic ioctl. Aside: Right now all drivers only observe
> > one or the other, not both, so will break as soon as we start moving
> > shared buffers around. At least on Android or anything else using
> > explicit fencing.
> 
> Actually both radeon and nouveau use the approach that shared fences 
> need to wait on as well when they don't come from the current driver.

nouveau has write hazard tracking in its api, and is using the
excl fence for it as well.

As far as I can tell, this is all about fixing the lack of
acknowledgement of the requirement for implicit fence tracking in
amdgpu's (and its radeon predecessor) ABI. Which is fine so long as you
get userspace to only use explicit fence passing to the compositor.
-Chris

* Re: [Intel-gfx] RFC: Add write flag to reservation object fences
  2018-08-10  7:51               ` [Intel-gfx] " Chris Wilson
@ 2018-08-10  8:24                 ` Christian König
  2018-08-10  8:32                   ` Daniel Vetter
  0 siblings, 1 reply; 25+ messages in thread
From: Christian König @ 2018-08-10  8:24 UTC (permalink / raw)
  To: Chris Wilson, Daniel Vetter; +Cc: intel-gfx, amd-gfx list, dri-devel

On 10.08.2018 at 09:51, Chris Wilson wrote:
> Quoting Christian König (2018-08-09 15:54:31)
>> Am 09.08.2018 um 16:22 schrieb Daniel Vetter:
>>> On Thu, Aug 9, 2018 at 3:58 PM, Christian König
>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>> Am 09.08.2018 um 15:38 schrieb Daniel Vetter:
>>>>> On Thu, Aug 09, 2018 at 01:37:07PM +0200, Christian König wrote:
>>>>> [SNIP]
>>>> See to me the explicit fence in the reservation object is not even remotely
>>>> related to implicit or explicit synchronization.
>>> Hm, I guess that's the confusion then. The only reason we have the
>>> exclusive fence is to implement cross-driver implicit syncing. What
>>> else you do internally in your driver doesn't matter, as long as you
>>> keep up that contract.
>>>
>>> And it's intentionally not called write_fence or anything like that,
>>> because that's not what it tracks.
>>>
>>> Of course any buffer moves the kernel does also must be tracked in the
>>> exclusive fence, because userspace cannot know about these. So you
>>> might have an exclusive fence set and also an explicit fence passed in
>>> through the atomic ioctl. Aside: Right now all drivers only observe
>>> one or the other, not both, so will break as soon as we start moving
>>> shared buffers around. At least on Android or anything else using
>>> explicit fencing.
>> Actually both radeon and nouveau use the approach that shared fences
>> need to wait on as well when they don't come from the current driver.
> nouveau has write hazard tracking in its api , and is using the
> excl fence for it was well.
>
> As far as I can tell, this is all about fixing the lack of
> acknowledgement of the requirement for implicit fence tracking in
> amdgpu's (and its radeon predecessor) ABI.

Well I agree that implicit fencing was a bad idea to begin with, but the 
solution you guys came up with is not going to work in all cases either.

> Which is fine so long as you
> get userspace to only use explicit fence passing to the compositor.

Well exactly that's the problem.

I can't pass exactly one explicit fence to the compositor because I have 
multiple command submissions which run asynchronously and need to finish 
before the compositor can start to use the surface.

So the whole concept of using a single exclusive fence doesn't work in 
the case of amdgpu. And to be honest I think all drivers with multiple 
engines could benefit from that as well.

Christian.

> -Chris


* Re: RFC: Add write flag to reservation object fences
       [not found]             ` <9731d8b0-238b-d64d-9f80-4f549eb230b9-5C7GfCeVMHo@public.gmane.org>
  2018-08-10  7:51               ` [Intel-gfx] " Chris Wilson
@ 2018-08-10  8:29               ` Daniel Vetter
       [not found]                 ` <CAKMK7uH2ZvkmTVcPJwKjouxdvrLKVXBVGSFKAD5rtQrXLoX_GQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 25+ messages in thread
From: Daniel Vetter @ 2018-08-10  8:29 UTC (permalink / raw)
  To: Christian König; +Cc: intel-gfx, dri-devel, amd-gfx list

On Thu, Aug 9, 2018 at 4:54 PM, Christian König
<christian.koenig@amd.com> wrote:
> Am 09.08.2018 um 16:22 schrieb Daniel Vetter:
>>
>> On Thu, Aug 9, 2018 at 3:58 PM, Christian König
>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>
>>> Am 09.08.2018 um 15:38 schrieb Daniel Vetter:
>>>>
>>>> On Thu, Aug 09, 2018 at 01:37:07PM +0200, Christian König wrote:
>>>> [SNIP]
>>>
>>> See to me the explicit fence in the reservation object is not even
>>> remotely
>>> related to implicit or explicit synchronization.
>>
>> Hm, I guess that's the confusion then. The only reason we have the
>> exclusive fence is to implement cross-driver implicit syncing. What
>> else you do internally in your driver doesn't matter, as long as you
>> keep up that contract.
>>
>> And it's intentionally not called write_fence or anything like that,
>> because that's not what it tracks.
>>
>> Of course any buffer moves the kernel does also must be tracked in the
>> exclusive fence, because userspace cannot know about these. So you
>> might have an exclusive fence set and also an explicit fence passed in
>> through the atomic ioctl. Aside: Right now all drivers only observe
>> one or the other, not both, so will break as soon as we start moving
>> shared buffers around. At least on Android or anything else using
>> explicit fencing.
>
>
> Actually both radeon and nouveau use the approach that shared fences need to
> wait on as well when they don't come from the current driver.
>
>>
>> So here's my summary, as I understanding things right now:
>> - for non-shared buffers at least, amdgpu uses explicit fencing, and
>> hence all fences caused by userspace end up as shared fences, whether
>> that's writes or reads. This means you end up with possibly multiple
>> write fences, but never any exclusive fences.
>> - for non-shared buffers the only exclusive fences amdgpu sets are for
>> buffer moves done by the kernel.
>> - amgpu (kernel + userspace combo here) does not seem to have a
>> concept/tracking for when a buffer is used with implicit or explicit
>> fencing. It does however track all writes.
>
>
> No, that is incorrect. It tracks all accesses to a buffer object in the form
> of shared fences, we don't care if it is a write or not.
>
> What we track as well is which client uses a BO last and as long as the same
> client uses the BO we don't add any implicit synchronization.
>
> Only when a BO is used by another client we have implicit synchronization
> for all command submissions. This behavior can be disable with a flag during
> BO creation.

I'm only interested in the case of shared buffers. And for those you
_do_ pessimistically assume that all access must be implicitly synced.
Iirc amdgpu doesn't support EGL_ANDROID_native_fence_sync, so it
makes sense that you don't bother with it.

>> - as a consequence, amdgpu needs to pessimistically assume that all
>> writes to shared buffer need to obey implicit fencing rules.
>> - for shared buffers (across process or drivers) implicit fencing does
>> _not_ allow concurrent writers. That limitation is why people want to
>> do explicit fencing, and it's the reason why there's only 1 slot for
>> an exclusive. Note I really mean concurrent here, a queue of in-flight
>> writes by different batches is perfectly fine. But it's a fully
>> ordered queue of writes.
>> - but as a consequence of amdgpu's lack of implicit fencing and hence
>> need to pessimistically assume there's multiple write fences amdgpu
>> needs to put multiple fences behind the single exclusive slot. This is
>> a limitation imposed by by the amdgpu stack, not something inherit to
>> how implicit fencing works.
>> - Chris Wilson's patch implements all this (and afaics with a bit more
>> coffee, correctly).
>>
>> If you want to be less pessimistic in amdgpu for shared buffers, you
>> need to start tracking which shared buffer access need implicit and
>> which explicit sync. What you can't do is suddenly create more than 1
>> exclusive fence, that's not how implicit fencing works. Another thing
>> you cannot do is force everyone else (in non-amdgpu or core code) to
>> sync against _all_ writes, because that forces implicit syncing. Which
>> people very much don't want.
>
>
> I also do see the problem that most other hardware doesn't need that
> functionality, because it is driven by a single engine. That's why I tried
> to keep the overhead as low as possible.
>
> But at least for amdgpu (and I strongly suspect for nouveau as well) it is
> absolutely vital in a number of cases to allow concurrent accesses from the
> same client even when the BO is then later used with implicit
> synchronization.
>
> This is also the reason why the current workaround is so problematic for us.
> Cause as soon as the BO is shared with another (non-amdgpu) device all
> command submissions to it will be serialized even when they come from the
> same client.
>
> Would it be an option extend the concept of the "owner" of the BO amdpgu
> uses to other drivers as well?
>
> When you already have explicit synchronization insider your client, but not
> between clients (e.g. still uses DRI2 or DRI3), this could also be rather
> beneficial for others as well.

Again: How you synchronize your driver internal rendering is totally
up to you. If you see an exclusive fence by amdgpu, and submit new
rendering by amdgpu, you can totally ignore the exclusive fence. The
only api contracts for implicit fencing are between drivers for shared
buffers. If you submit rendering to a shared buffer in parallel, all
without attaching an exclusive fence that's perfectly fine, but
at some point later on, depending upon protocol (glFlush or glxSwapBuffers
or whatever) you have to collect all those concurrent write hazards
and bake them into 1 single exclusive fence for implicit fencing.

Atm (and Chris seems to concur) the amdgpu uapi doesn't allow you to
do that, so for anything shared you have to be super pessimistic.
Adding a HAND_OFF_FOR_IMPLICIT_FENCING flag/ioctl would probably fix
that. Only when that flag is set would you take all shared write
hazards and bake them into one exclusive fence for hand-off to the
next driver. You'd also need the same when receiving an implicitly
fenced buffer, to make sure that your concurrent writes do synchronize
with reading (aka shared fences) done by other drivers. With a bunch
of trickery and hacks it might be possible to infer this from current
ioctls even, but you need to be really careful.

And you're right that amdgpu seems to be the only (or one of the only)
drivers which do truly concurrent rendering to the same buffer (not
just concurrent rendering to multiple buffers all suballocated from
the same bo). But we can't fix this in the kernel with the tricks you
propose, because without such an uapi extension (or inference) we
can't tell the implicit fencing from the explicit fencing case. And
for shared buffers with explicit fencing we _must_ _not_ sync against
all writes. owner won't help here, because it's still not tracking
whether something is explicitly or implicitly synced.

We've cheated a bit with most other drivers in this area, also because
we don't have to deal with truly concurrent rendering. So it's not
obvious that we're not tracking writes/reads, but implicit/explicit
fencing. But semantically we track the latter for shared buffers.

Cheers, Daniel

PS: One idea I have for inference: Every time you see a shared buffer
in an amdgpu CS:
1. Grab reservation lock
2. Check all the fences' creators. If any of them are foreign (not by
amdgpu), then run the current pessimistic code.
3. If all fences are by amdgpu
- Look at the exclusive fence. If it's a ttm bo move keep it; if it's
marked as a special implicit syncing fence, ignore it.
- Run all CS concurrently by storing all their write fences in the shared slots.
- Create a fake exclusive fence which ties all the write hazards into
one fence. Mark them as special implicit syncing fences in your
amdgpu_fence struct. This will make sure other drivers sync properly,
but since you ignore these special it won't reduce amdgpu-internal
concurrency.
- Make sure you don't drop the ttm bo move fences accidentally, will
be some tricky accounting.
4. Submit CS and drop reservation lock.

I think this would work, but much cleaner if you make this an explicit
part of the amdgpu uapi.
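
For illustration, step 2 could look roughly like this. Just a sketch;
is_ours() is a hypothetical predicate, e.g. a comparison against
amdgpu's fence ops:

#include <linux/reservation.h>

/* Sketch: scan a held reservation object for foreign fences. */
static bool resv_has_foreign_fences(struct reservation_object *resv,
				    bool (*is_ours)(struct dma_fence *))
{
	struct reservation_object_list *flist;
	struct dma_fence *f;
	unsigned int i;

	f = reservation_object_get_excl(resv);
	if (f && !is_ours(f))
		return true;

	flist = reservation_object_get_list(resv);
	for (i = 0; flist && i < flist->shared_count; ++i) {
		f = rcu_dereference_protected(flist->shared[i],
					      reservation_object_held(resv));
		if (!is_ours(f))
			return true;
	}

	return false;
}
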
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

* Re: RFC: Add write flag to reservation object fences
  2018-08-10  8:24                 ` Christian König
@ 2018-08-10  8:32                   ` Daniel Vetter
  2018-08-10  8:51                     ` Christian König
  0 siblings, 1 reply; 25+ messages in thread
From: Daniel Vetter @ 2018-08-10  8:32 UTC (permalink / raw)
  To: Christian König; +Cc: intel-gfx, amd-gfx list, dri-devel

On Fri, Aug 10, 2018 at 10:24 AM, Christian König
<christian.koenig@amd.com> wrote:
> Am 10.08.2018 um 09:51 schrieb Chris Wilson:
>>
>> Quoting Christian König (2018-08-09 15:54:31)
>>>
>>> Am 09.08.2018 um 16:22 schrieb Daniel Vetter:
>>>>
>>>> On Thu, Aug 9, 2018 at 3:58 PM, Christian König
>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>
>>>>> Am 09.08.2018 um 15:38 schrieb Daniel Vetter:
>>>>>>
>>>>>> On Thu, Aug 09, 2018 at 01:37:07PM +0200, Christian König wrote:
>>>>>> [SNIP]
>>>>>
>>>>> See to me the explicit fence in the reservation object is not even
>>>>> remotely
>>>>> related to implicit or explicit synchronization.
>>>>
>>>> Hm, I guess that's the confusion then. The only reason we have the
>>>> exclusive fence is to implement cross-driver implicit syncing. What
>>>> else you do internally in your driver doesn't matter, as long as you
>>>> keep up that contract.
>>>>
>>>> And it's intentionally not called write_fence or anything like that,
>>>> because that's not what it tracks.
>>>>
>>>> Of course any buffer moves the kernel does also must be tracked in the
>>>> exclusive fence, because userspace cannot know about these. So you
>>>> might have an exclusive fence set and also an explicit fence passed in
>>>> through the atomic ioctl. Aside: Right now all drivers only observe
>>>> one or the other, not both, so will break as soon as we start moving
>>>> shared buffers around. At least on Android or anything else using
>>>> explicit fencing.
>>>
>>> Actually both radeon and nouveau use the approach that shared fences
>>> need to wait on as well when they don't come from the current driver.
>>
>> nouveau has write hazard tracking in its api , and is using the
>> excl fence for it was well.
>>
>> As far as I can tell, this is all about fixing the lack of
>> acknowledgement of the requirement for implicit fence tracking in
>> amdgpu's (and its radeon predecessor) ABI.
>
>
> Well I agree that implicit fencing was a bad idea to begin with, but the
> solution you guys came up with is not going to work in all cases either.
>
>> Which is fine so long as you
>> get userspace to only use explicit fence passing to the compositor.
>
>
> Well exactly that's the problem.
>
> I can't pass exactly one explicit fence to the compositor because I have
> multiple command submissions which run asynchronously and need to finish
> before the compositor can start to use the surface.
>
> So the whole concept of using a single exclusive fence doesn't work in the
> case of amdgpu. And to be honest I think all drivers with multiple engines
> could benefit of that as well.

Fences can be merged. This works, really :-) In amdgpu's implementation
of EGL_ANDROID_native_fence_sync you will need to do that before
passing the result to the caller.
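
Roughly like this, as a sketch with error handling trimmed;
dma_fence_array_create() takes ownership of the kmalloc'd array and of
the fence references in it:

#include <linux/dma-fence-array.h>
#include <linux/slab.h>

static struct dma_fence *merge_fences(struct dma_fence **fences,
				      unsigned int count)
{
	struct dma_fence_array *array;

	if (count == 1) {
		struct dma_fence *f = fences[0];

		kfree(fences);	/* array ownership stays with us here */
		return f;
	}

	array = dma_fence_array_create(count, fences,
				       dma_fence_context_alloc(1), 1,
				       false /* signal when all signaled */);
	return array ? &array->base : NULL;
}
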
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

* Re: RFC: Add write flag to reservation object fences
  2018-08-10  8:32                   ` Daniel Vetter
@ 2018-08-10  8:51                     ` Christian König
       [not found]                       ` <97e6154a-380c-521c-6287-6fbafe0384b4-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 25+ messages in thread
From: Christian König @ 2018-08-10  8:51 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx, amd-gfx list, dri-devel

On 10.08.2018 at 10:32, Daniel Vetter wrote:
> On Fri, Aug 10, 2018 at 10:24 AM, Christian König
> <christian.koenig@amd.com> wrote:
>> Am 10.08.2018 um 09:51 schrieb Chris Wilson:
>>> Quoting Christian König (2018-08-09 15:54:31)
>>>> Am 09.08.2018 um 16:22 schrieb Daniel Vetter:
>>>>> On Thu, Aug 9, 2018 at 3:58 PM, Christian König
>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>> Am 09.08.2018 um 15:38 schrieb Daniel Vetter:
>>>>>>> On Thu, Aug 09, 2018 at 01:37:07PM +0200, Christian König wrote:
>>>>>>> [SNIP]
>>>>>> See to me the explicit fence in the reservation object is not even
>>>>>> remotely
>>>>>> related to implicit or explicit synchronization.
>>>>> Hm, I guess that's the confusion then. The only reason we have the
>>>>> exclusive fence is to implement cross-driver implicit syncing. What
>>>>> else you do internally in your driver doesn't matter, as long as you
>>>>> keep up that contract.
>>>>>
>>>>> And it's intentionally not called write_fence or anything like that,
>>>>> because that's not what it tracks.
>>>>>
>>>>> Of course any buffer moves the kernel does also must be tracked in the
>>>>> exclusive fence, because userspace cannot know about these. So you
>>>>> might have an exclusive fence set and also an explicit fence passed in
>>>>> through the atomic ioctl. Aside: Right now all drivers only observe
>>>>> one or the other, not both, so will break as soon as we start moving
>>>>> shared buffers around. At least on Android or anything else using
>>>>> explicit fencing.
>>>> Actually both radeon and nouveau use the approach that shared fences
>>>> need to wait on as well when they don't come from the current driver.
>>> nouveau has write hazard tracking in its api , and is using the
>>> excl fence for it was well.
>>>
>>> As far as I can tell, this is all about fixing the lack of
>>> acknowledgement of the requirement for implicit fence tracking in
>>> amdgpu's (and its radeon predecessor) ABI.
>>
>> Well I agree that implicit fencing was a bad idea to begin with, but the
>> solution you guys came up with is not going to work in all cases either.
>>
>>> Which is fine so long as you
>>> get userspace to only use explicit fence passing to the compositor.
>>
>> Well exactly that's the problem.
>>
>> I can't pass exactly one explicit fence to the compositor because I have
>> multiple command submissions which run asynchronously and need to finish
>> before the compositor can start to use the surface.
>>
>> So the whole concept of using a single exclusive fence doesn't work in the
>> case of amdgpu. And to be honest I think all drivers with multiple engines
>> could benefit of that as well.
> Fences can be merge. This works, really :-) In amdgpu's implementation
> of EGL_ANDROID_native_fence_sync you will need to do that before
> passing the result to the caller.

Yeah, but that is for the userspace API, not for any internal handling
in the driver.

Christian.

> -Daniel


* Re: RFC: Add write flag to reservation object fences
       [not found]                 ` <CAKMK7uH2ZvkmTVcPJwKjouxdvrLKVXBVGSFKAD5rtQrXLoX_GQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2018-08-10  9:14                   ` Christian König
  2018-08-10  9:21                     ` Daniel Vetter
  2018-08-10  9:28                     ` Daniel Vetter
  0 siblings, 2 replies; 25+ messages in thread
From: Christian König @ 2018-08-10  9:14 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx, dri-devel, amd-gfx list

On 10.08.2018 at 10:29, Daniel Vetter wrote:
> [SNIP]
> I'm only interested in the case of shared buffers. And for those you
> _do_ pessimistically assume that all access must be implicitly synced.
> Iirc amdgpu doesn't support EGL_ANDROID_native_fence_sync, so this
> makes sense that you don't bother with it.

See flag AMDGPU_GEM_CREATE_EXPLICIT_SYNC.

>
>>> - as a consequence, amdgpu needs to pessimistically assume that all
>>> writes to shared buffer need to obey implicit fencing rules.
>>> - for shared buffers (across process or drivers) implicit fencing does
>>> _not_ allow concurrent writers. That limitation is why people want to
>>> do explicit fencing, and it's the reason why there's only 1 slot for
>>> an exclusive. Note I really mean concurrent here, a queue of in-flight
>>> writes by different batches is perfectly fine. But it's a fully
>>> ordered queue of writes.
>>> - but as a consequence of amdgpu's lack of implicit fencing and hence
>>> need to pessimistically assume there's multiple write fences amdgpu
>>> needs to put multiple fences behind the single exclusive slot. This is
>>> a limitation imposed by by the amdgpu stack, not something inherit to
>>> how implicit fencing works.
>>> - Chris Wilson's patch implements all this (and afaics with a bit more
>>> coffee, correctly).
>>>
>>> If you want to be less pessimistic in amdgpu for shared buffers, you
>>> need to start tracking which shared buffer access need implicit and
>>> which explicit sync. What you can't do is suddenly create more than 1
>>> exclusive fence, that's not how implicit fencing works. Another thing
>>> you cannot do is force everyone else (in non-amdgpu or core code) to
>>> sync against _all_ writes, because that forces implicit syncing. Which
>>> people very much don't want.
>>
>> I also do see the problem that most other hardware doesn't need that
>> functionality, because it is driven by a single engine. That's why I tried
>> to keep the overhead as low as possible.
>>
>> But at least for amdgpu (and I strongly suspect for nouveau as well) it is
>> absolutely vital in a number of cases to allow concurrent accesses from the
>> same client even when the BO is then later used with implicit
>> synchronization.
>>
>> This is also the reason why the current workaround is so problematic for us.
>> Cause as soon as the BO is shared with another (non-amdgpu) device all
>> command submissions to it will be serialized even when they come from the
>> same client.
>>
>> Would it be an option extend the concept of the "owner" of the BO amdpgu
>> uses to other drivers as well?
>>
>> When you already have explicit synchronization insider your client, but not
>> between clients (e.g. still uses DRI2 or DRI3), this could also be rather
>> beneficial for others as well.
> Again: How you synchronize your driver internal rendering is totally
> up to you. If you see an exclusive fence by amdgpu, and submit new
> rendering by amdgpu, you can totally ignore the exclusive fence. The
> only api contracts for implicit fencing are between drivers for shared
> buffers. If you submit rendering to a shared buffer in parallel, all
> without attaching an exclusive fence that's perfectly fine, but
> somewhen later on, depending upon protocol (glFlush or glxSwapBuffers
> or whatever) you have to collect all those concurrent write hazards
> and bake them into 1 single exclusive fence for implicit fencing.
>
> Atm (and Chris seems to concur) the amdgpu uapi doesn't allow you to
> do that, so for anything shared you have to be super pessimistic.
> Adding a HAND_OFF_FOR_IMPLICIT_FENCING flag/ioctl would probably fix
> that. Only when that flag is set would you take all shared write
> hazards and bake them into one exclusive fence for hand-off to the
> next driver. You'd also need the same when receiving an implicitly
> fenced buffer, to make sure that your concurrent writes do synchronize
> with reading (aka shared fences) done by other drivers. With a bunch
> of trickery and hacks it might be possible to infer this from current
> ioctls even, but you need to be really careful.

A new uapi is out of the question because we need to remain backward compatible.

> And you're right that amdgpu seems to be the only (or one of the only)
> drivers which do truly concurrent rendering to the same buffer (not
> just concurrent rendering to multiple buffers all suballocated from
> the same bo). But we can't fix this in the kernel with the tricks you
> propose, because without such an uapi extension (or inference) we
> can't tell the implicit fencing from the explicit fencing case.

Sure we can. As I said, for amdgpu that is absolutely no problem at all.

In your terminology all rendering from the same client to a BO is
explicitly fenced, while all rendering from different clients is
implicitly fenced.

> And for shared buffers with explicit fencing we _must_ _not_ sync against
> all writes. owner won't help here, because it's still not tracking
> whether something is explicit or implicit synced.

Implicit syncing can be disabled by giving the
AMDGPU_GEM_CREATE_EXPLICIT_SYNC flag while creating the BO.
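
For reference, a minimal userspace sketch with libdrm; the heap and
alignment here are arbitrary picks for the example:

#include <stdint.h>
#include <amdgpu.h>
#include <amdgpu_drm.h>

/* Sketch: allocate a BO that opts out of implicit synchronization. */
static int alloc_explicit_sync_bo(amdgpu_device_handle dev, uint64_t size,
				  amdgpu_bo_handle *bo)
{
	struct amdgpu_bo_alloc_request req = {
		.alloc_size = size,
		.phys_alignment = 4096,
		.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM,
		.flags = AMDGPU_GEM_CREATE_EXPLICIT_SYNC,
	};

	return amdgpu_bo_alloc(dev, &req, bo);
}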

> We've cheated a bit with most other drivers in this area, also because
> we don't have to deal with truly concurrent rendering.

Yeah, absolutely nailed it. And this cheating is now coming back to
bite me because it doesn't work well at all with the requirements I have
at hand here.

> So it's not
> obvious that we're tracking not writes/reads but implicit/explicit
> fencing. But semantically we track the latter for shared buffers.
>
> Cheers, Daniel
>
> PS: One idea I have for inference: Every time you see a shared buffer
> in an amdgpu CS:
> 1. Grab reservation lock
> 2. Check all the fences' creators. If any of them are foreign (not by
> amdgpu), then run the current pessimistic code.

That is exactly what we already do.

> 3. If all fences are by amdgpu
> - Look at the exclusive fence. If it's a ttm bo move, keep it; if it's
> marked as a special implicit syncing fence, ignore it.
> - Run all CS concurrently by storing all their write fences in the shared slots.
> - Create a fake exclusive fence which ties all the write hazards into
> one fence. Mark them as special implicit syncing fences in your
> amdgpu_fence struct. This will make sure other drivers sync properly,
> but since you ignore these special fences it won't reduce amdgpu-internal
> concurrency.

That won't work: adding the exclusive fence removes all shared fences,
and I still need those for the sync check above.

I need something like a callback from other drivers signalling that the
reservation object is now used by them and no longer by amdgpu.

Regards,
Christian.

> - Make sure you don't drop the ttm bo move fences accidentally; that
> will be some tricky accounting.
> 4. Submit CS and drop reservation lock.
>
> I think this would work, but it would be much cleaner if you make this
> an explicit part of the amdgpu uapi.
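
The "check all the fences' creators" step from the PS above could look
roughly like the sketch below. Identifying the creator via the fence ops
is an assumption here, not something taken from this series:

#include <linux/reservation.h>

/*
 * Sketch of step 2 above: return true if every fence on the object was
 * created by the driver identified by @local_ops, i.e. the pessimistic
 * implicit-sync path can be skipped. Caller holds the reservation lock.
 */
static bool resv_fences_all_local(struct reservation_object *resv,
                                  const struct dma_fence_ops *local_ops)
{
        struct reservation_object_list *list;
        struct dma_fence *excl;
        unsigned int i;

        excl = reservation_object_get_excl(resv);
        if (excl && excl->ops != local_ops)
                return false;

        list = reservation_object_get_list(resv);
        for (i = 0; list && i < list->shared_count; ++i) {
                struct dma_fence *f;

                f = rcu_dereference_protected(list->shared[i],
                                              reservation_object_held(resv));
                if (f->ops != local_ops)
                        return false;
        }
        return true;
}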


* Re: RFC: Add write flag to reservation object fences
  2018-08-10  9:14                   ` Christian König
@ 2018-08-10  9:21                     ` Daniel Vetter
  2018-08-10 14:24                       ` Christian König
  2018-08-10  9:28                     ` Daniel Vetter
  1 sibling, 1 reply; 25+ messages in thread
From: Daniel Vetter @ 2018-08-10  9:21 UTC (permalink / raw)
  To: Christian König; +Cc: intel-gfx, dri-devel, amd-gfx list

On Fri, Aug 10, 2018 at 11:14 AM, Christian König
<christian.koenig@amd.com> wrote:
> On 10.08.2018 at 10:29, Daniel Vetter wrote:
>>
>> [SNIP]
>> I'm only interested in the case of shared buffers. And for those you
>> _do_ pessimistically assume that all access must be implicitly synced.
>> Iirc amdgpu doesn't support EGL_ANDROID_native_fence_sync, so it
>> makes sense that you don't bother with it.
>
>
> See flag AMDGPU_GEM_CREATE_EXPLICIT_SYNC.
>
>
>>
>>>> - as a consequence, amdgpu needs to pessimistically assume that all
>>>> writes to shared buffers need to obey implicit fencing rules.
>>>> - for shared buffers (across process or drivers) implicit fencing does
>>>> _not_ allow concurrent writers. That limitation is why people want to
>>>> do explicit fencing, and it's the reason why there's only 1 slot for
>>>> an exclusive. Note I really mean concurrent here, a queue of in-flight
>>>> writes by different batches is perfectly fine. But it's a fully
>>>> ordered queue of writes.
>>>> - but as a consequence of amdgpu's lack of implicit fencing and hence
>>>> need to pessimistically assume there are multiple write fences, amdgpu
>>>> needs to put multiple fences behind the single exclusive slot. This is
>>>> a limitation imposed by the amdgpu stack, not something inherent to
>>>> how implicit fencing works.
>>>> - Chris Wilson's patch implements all this (and afaics with a bit more
>>>> coffee, correctly).
>>>>
>>>> If you want to be less pessimistic in amdgpu for shared buffers, you
>>>> need to start tracking which shared buffer access need implicit and
>>>> which explicit sync. What you can't do is suddenly create more than 1
>>>> exclusive fence, that's not how implicit fencing works. Another thing
>>>> you cannot do is force everyone else (in non-amdgpu or core code) to
>>>> sync against _all_ writes, because that forces implicit syncing. Which
>>>> people very much don't want.
>>>
>>>
>>> I also do see the problem that most other hardware doesn't need that
>>> functionality, because it is driven by a single engine. That's why I
>>> tried to keep the overhead as low as possible.
>>>
>>> But at least for amdgpu (and I strongly suspect for nouveau as well) it
>>> is absolutely vital in a number of cases to allow concurrent accesses
>>> from the same client even when the BO is then later used with implicit
>>> synchronization.
>>>
>>> This is also the reason why the current workaround is so problematic
>>> for us. Because as soon as the BO is shared with another (non-amdgpu)
>>> device all command submissions to it will be serialized even when they
>>> come from the same client.
>>>
>>> Would it be an option to extend the concept of the "owner" of the BO
>>> that amdgpu uses to other drivers as well?
>>>
>>> When you already have explicit synchronization inside your client, but
>>> not between clients (e.g. still using DRI2 or DRI3), this could be
>>> rather beneficial for other drivers as well.
>>
>> Again: How you synchronize your driver internal rendering is totally
>> up to you. If you see an exclusive fence by amdgpu, and submit new
>> rendering by amdgpu, you can totally ignore the exclusive fence. The
>> only api contracts for implicit fencing are between drivers for shared
>> buffers. If you submit rendering to a shared buffer in parallel, all
>> without attaching an exclusive fence, that's perfectly fine, but
>> at some point later on, depending upon protocol (glFlush or glxSwapBuffers
>> or whatever) you have to collect all those concurrent write hazards
>> and bake them into 1 single exclusive fence for implicit fencing.
>>
>> Atm (and Chris seems to concur) the amdgpu uapi doesn't allow you to
>> do that, so for anything shared you have to be super pessimistic.
>> Adding a HAND_OFF_FOR_IMPLICIT_FENCING flag/ioctl would probably fix
>> that. Only when that flag is set would you take all shared write
>> hazards and bake them into one exclusive fence for hand-off to the
>> next driver. You'd also need the same when receiving an implicitly
>> fenced buffer, to make sure that your concurrent writes do synchronize
>> with reading (aka shared fences) done by other drivers. With a bunch
>> of trickery and hacks it might be possible to infer this from current
>> ioctls even, but you need to be really careful.
>
>
> A new uapi is out of the question because we need to stay backward compatible.

Since when is new uapi out of the question for a performance improvement?

>> And you're right that amdgpu seems to be the only (or one of the only)
>> drivers which do truly concurrent rendering to the same buffer (not
>> just concurrent rendering to multiple buffers all suballocated from
>> the same bo). But we can't fix this in the kernel with the tricks you
>> propose, because without such an uapi extension (or inference) we
>> can't tell the implicit fencing from the explicit fencing case.
>
>
> Sure we can. As I said, for amdgpu that is absolutely no problem at all.
>
> In your terminology all rendering from the same client to a BO is explicitly
> fenced, while all rendering from different clients is implicitly fenced.
>
>> And for shared buffers with explicit fencing we _must_ _not_ sync against
>> all writes. owner won't help here, because it's still not tracking
>> whether something is explicitly or implicitly synced.
>
>
> Implicit syncing can be disabled by setting the
> AMDGPU_GEM_CREATE_EXPLICIT_SYNC flag when creating the BO.
>
>> We've cheated a bit with most other drivers in this area, also because
>> we don't have to deal with truly concurrent rendering.
>
>
> Yeah, absolutely nailed it. And this cheating is now really coming back to
> bite me because it doesn't work well at all with the requirements I have at
> hand here.
>
>> So it's not
>> obvious that we're tracking not writes/reads but implicit/explicit
>> fencing. But semantically we track the latter for shared buffers.
>>
>> Cheers, Daniel
>>
>> PS: One idea I have for inference: Every time you see a shared buffer
>> in an amdgpu CS:
>> 1. Grab reservation lock
>> 2. Check all the fences' creators. If any of them are foreign (not by
>> amdgpu), then run the current pessimistic code.
>
>
> That is exactly what we already do.
>
>> 3. If all fences are by amdgpu
>> - Look at the exclusive fence. If it's a ttm bo move, keep it; if it's
>> marked as a special implicit syncing fence, ignore it.
>> - Run all CS concurrently by storing all their write fences in the shared
>> slots.
>> - Create a fake exclusive fence which ties all the write hazards into
>> one fence. Mark them as special implicit syncing fences in your
>> amdgpu_fence struct. This will make sure other drivers sync properly,
>> but since you ignore these special fences it won't reduce amdgpu-internal
>> concurrency.
>
>
> That won't work: adding the exclusive fence removes all shared fences, and I
> still need those for the sync check above.
>
> I need something like a callback from other drivers signalling that the
> reservation object is now used by them and no longer by amdgpu.

Then don't track _any_ of the amdgpu internal fences in the reservation object:
- 1 reservation object that you hand to ttm, for use internally within amdgpu
- 1 reservation object that you attach to the dma-buf (or get from the
imported dma-buf), where you play all the tricks to fake fences.
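
As a sketch of what that split could look like at export time (the bo
fields and the ops structure here are hypothetical names, but
dma_buf_export_info already has a resv field for exactly this):

#include <linux/dma-buf.h>
#include <linux/reservation.h>

/*
 * Sketch: export a BO with a dedicated reservation object instead of
 * the one TTM uses internally, so the implicit-sync fences visible to
 * foreign drivers live separately from amdgpu's internal fences.
 * struct my_bo and my_dmabuf_ops are made-up names.
 */
static struct dma_buf *export_with_shared_resv(struct my_bo *bo, int flags)
{
        DEFINE_DMA_BUF_EXPORT_INFO(exp_info);

        exp_info.ops = &my_dmabuf_ops;
        exp_info.size = bo->size;
        exp_info.flags = flags;
        exp_info.priv = bo;
        /* the second reservation object, not the TTM-internal one */
        exp_info.resv = bo->shared_resv;

        return dma_buf_export(&exp_info);
}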
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

* Re: [Intel-gfx] RFC: Add write flag to reservation object fences
       [not found]                       ` <97e6154a-380c-521c-6287-6fbafe0384b4-5C7GfCeVMHo@public.gmane.org>
@ 2018-08-10  9:25                         ` Daniel Vetter
  0 siblings, 0 replies; 25+ messages in thread
From: Daniel Vetter @ 2018-08-10  9:25 UTC (permalink / raw)
  To: Christian König; +Cc: intel-gfx, amd-gfx list, dri-devel, Chris Wilson

On Fri, Aug 10, 2018 at 10:51 AM, Christian König
<christian.koenig@amd.com> wrote:
> On 10.08.2018 at 10:32, Daniel Vetter wrote:
>>
>> On Fri, Aug 10, 2018 at 10:24 AM, Christian König
>> <christian.koenig@amd.com> wrote:
>>>
>>> Am 10.08.2018 um 09:51 schrieb Chris Wilson:
>>>>
>>>> Quoting Christian König (2018-08-09 15:54:31)
>>>>>
>>>>> On 09.08.2018 at 16:22, Daniel Vetter wrote:
>>>>>>
>>>>>> On Thu, Aug 9, 2018 at 3:58 PM, Christian König
>>>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>>>>
>>>>>>> On 09.08.2018 at 15:38, Daniel Vetter wrote:
>>>>>>>>
>>>>>>>> On Thu, Aug 09, 2018 at 01:37:07PM +0200, Christian König wrote:
>>>>>>>> [SNIP]
>>>>>>>
>>>>>>> See, to me the explicit fence in the reservation object is not even
>>>>>>> remotely related to implicit or explicit synchronization.
>>>>>>
>>>>>> Hm, I guess that's the confusion then. The only reason we have the
>>>>>> exclusive fence is to implement cross-driver implicit syncing. What
>>>>>> else you do internally in your driver doesn't matter, as long as you
>>>>>> keep up that contract.
>>>>>>
>>>>>> And it's intentionally not called write_fence or anything like that,
>>>>>> because that's not what it tracks.
>>>>>>
>>>>>> Of course any buffer moves the kernel does must also be tracked in the
>>>>>> exclusive fence, because userspace cannot know about these. So you
>>>>>> might have an exclusive fence set and also an explicit fence passed in
>>>>>> through the atomic ioctl. Aside: Right now all drivers only observe
>>>>>> one or the other, not both, so they will break as soon as we start moving
>>>>>> shared buffers around. At least on Android or anything else using
>>>>>> explicit fencing.
>>>>>
>>>>> Actually both radeon and nouveau use the approach that shared fences
>>>>> need to be waited on as well when they don't come from the current driver.
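
A sketch of that rule, reusing the ops-based creator test from the
earlier sketch (again an assumption, not code from either driver):

/*
 * Wait on shared fences only when they come from a foreign driver;
 * same-driver ordering is left to the scheduler. Caller holds the
 * reservation lock.
 */
static int wait_foreign_shared(struct reservation_object *resv,
                               const struct dma_fence_ops *local_ops)
{
        struct reservation_object_list *list;
        unsigned int i;
        int r;

        list = reservation_object_get_list(resv);
        for (i = 0; list && i < list->shared_count; ++i) {
                struct dma_fence *f;

                f = rcu_dereference_protected(list->shared[i],
                                              reservation_object_held(resv));
                if (f->ops == local_ops)
                        continue;
                r = dma_fence_wait(f, true);
                if (r < 0)
                        return r;
        }
        return 0;
}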
>>>>
>>>> nouveau has write hazard tracking in its api, and is using the
>>>> excl fence for it as well.
>>>>
>>>> As far as I can tell, this is all about fixing the lack of
>>>> acknowledgement of the requirement for implicit fence tracking in
>>>> amdgpu's (and its radeon predecessor) ABI.
>>>
>>>
>>> Well I agree that implicit fencing was a bad idea to begin with, but the
>>> solution you guys came up with is not going to work in all cases either.
>>>
>>>> Which is fine so long as you
>>>> get userspace to only use explicit fence passing to the compositor.
>>>
>>>
>>> Well, that's exactly the problem.
>>>
>>> I can't pass exactly one explicit fence to the compositor because I have
>>> multiple command submissions which run asynchronously and need to finish
>>> before the compositor can start to use the surface.
>>>
>>> So the whole concept of using a single exclusive fence doesn't work in
>>> the case of amdgpu. And to be honest I think all drivers with multiple
>>> engines could benefit from that as well.
>>
>> Fences can be merged. This works, really :-) In amdgpu's implementation
>> of EGL_ANDROID_native_fence_sync you will need to do that before
>> passing the result to the caller.
>
> Yeah, but that is for the userspace API. Not for any internal handling in
> the driver.

dma_fence_array? Or how do you think this fence merging for userspace
is implemented?
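
For completeness, userspace sees that merge as SYNC_IOC_MERGE on
sync_file fds, and the kernel backs the result with a dma_fence_array.
A minimal sketch, error handling omitted:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/sync_file.h>

/* Merge two fence fds into one that signals when both have signaled.
 * Returns the merged fd, or -1 on error. */
static int merge_fence_fds(int fd_a, int fd_b)
{
        struct sync_merge_data data;

        memset(&data, 0, sizeof(data));
        strcpy(data.name, "merged");
        data.fd2 = fd_b;

        if (ioctl(fd_a, SYNC_IOC_MERGE, &data))
                return -1;
        return data.fence;
}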

The only trouble is that amdgpu doesn't have an explicit hand-over
point in the uapi where an explicitly synced buffer (all of them
really) gets its fences for implicit syncing attached.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

* Re: RFC: Add write flag to reservation object fences
  2018-08-10  9:14                   ` Christian König
  2018-08-10  9:21                     ` Daniel Vetter
@ 2018-08-10  9:28                     ` Daniel Vetter
  1 sibling, 0 replies; 25+ messages in thread
From: Daniel Vetter @ 2018-08-10  9:28 UTC (permalink / raw)
  To: Christian König; +Cc: intel-gfx, dri-devel, amd-gfx list

On Fri, Aug 10, 2018 at 11:14 AM, Christian König
<christian.koenig@amd.com> wrote:
> Am 10.08.2018 um 10:29 schrieb Daniel Vetter:
>>
>> [SNIP]
>> I'm only interested in the case of shared buffers. And for those you
>> _do_ pessimistically assume that all access must be implicitly synced.
>> Iirc amdgpu doesn't support EGL_ANDROID_native_fence_sync, so it
>> makes sense that you don't bother with it.
>
>
> See flag AMDGPU_GEM_CREATE_EXPLICIT_SYNC.

That's for radv. It won't be enough for EGL_ANDROID_native_fence_sync,
because you cannot know at buffer allocation time how the fencing will
be done in all cases.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

* Re: RFC: Add write flag to reservation object fences
  2018-08-10  9:21                     ` Daniel Vetter
@ 2018-08-10 14:24                       ` Christian König
  0 siblings, 0 replies; 25+ messages in thread
From: Christian König @ 2018-08-10 14:24 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx, dri-devel, amd-gfx list

On 10.08.2018 at 11:21, Daniel Vetter wrote:
> [SNIP]
> Then don't track _any_ of the amdgpu internal fences in the reservation object:
> - 1 reservation object that you hand to ttm, for use internally within amdgpu
> - 1 reservation object that you attach to the dma-buf (or get from the
> imported dma-buf), where you play all the tricks to fake fences.

Well that is an interesting idea. Going to try that one out.

Regards,
Christian.

> -Daniel


end of thread, other threads:[~2018-08-10 14:24 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
2018-08-09 11:37 RFC: Add write flag to reservation object fences Christian König
     [not found] ` <20180809113713.48024-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
2018-08-09 11:37   ` [PATCH 1/6] dma-buf: remove shared fence staging in reservation object Christian König
2018-08-09 12:08     ` Chris Wilson
2018-08-09 12:22       ` Christian König
2018-08-09 11:37   ` [PATCH 2/6] dma-buf: add reservation object shared fence accessor Christian König
     [not found]     ` <20180809113713.48024-3-christian.koenig-5C7GfCeVMHo@public.gmane.org>
2018-08-10  7:41       ` Huang Rui
2018-08-09 11:37   ` [PATCH 5/6] drm/i915: wait for write fences before pflip Christian König
2018-08-09 11:37   ` [PATCH 6/6] drm/amdgpu: remove exclusive fence workaround Christian König
2018-08-09 11:37 ` [PATCH 3/6] dma-buf: add is_write to reservation_object_add_shared_fence Christian König
2018-08-09 11:37 ` [PATCH 4/6] dma-buf: add writes_only flag to reservation_object_get_fences_rcu Christian König
2018-08-09 11:53 ` ✗ Fi.CI.BAT: failure for series starting with [1/6] dma-buf: remove shared fence staging in reservation object Patchwork
2018-08-09 13:38 ` RFC: Add write flag to reservation object fences Daniel Vetter
2018-08-09 13:58   ` Christian König
     [not found]     ` <154cc05d-f2e1-695a-5bd3-a2fd5d40a548-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2018-08-09 14:22       ` Daniel Vetter
     [not found]         ` <CAKMK7uHgwFr1P3s=TcZ_NMuexCwoO1FkEn9q3CaYaGyBKviB-g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2018-08-09 14:54           ` Christian König
     [not found]             ` <9731d8b0-238b-d64d-9f80-4f549eb230b9-5C7GfCeVMHo@public.gmane.org>
2018-08-10  7:51               ` [Intel-gfx] " Chris Wilson
2018-08-10  8:24                 ` Christian König
2018-08-10  8:32                   ` Daniel Vetter
2018-08-10  8:51                     ` Christian König
     [not found]                       ` <97e6154a-380c-521c-6287-6fbafe0384b4-5C7GfCeVMHo@public.gmane.org>
2018-08-10  9:25                         ` [Intel-gfx] " Daniel Vetter
2018-08-10  8:29               ` Daniel Vetter
     [not found]                 ` <CAKMK7uH2ZvkmTVcPJwKjouxdvrLKVXBVGSFKAD5rtQrXLoX_GQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2018-08-10  9:14                   ` Christian König
2018-08-10  9:21                     ` Daniel Vetter
2018-08-10 14:24                       ` Christian König
2018-08-10  9:28                     ` Daniel Vetter
