All of lore.kernel.org
 help / color / mirror / Atom feed
* [Intel-gfx] [PATCH 1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation
@ 2022-02-02  1:11 Vivek Kasireddy
  2022-02-02  1:11 ` [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v5) Vivek Kasireddy
                   ` (4 more replies)
  0 siblings, 5 replies; 19+ messages in thread
From: Vivek Kasireddy @ 2022-02-02  1:11 UTC (permalink / raw)
  To: intel-gfx

This iterator relies on drm_mm_first_hole() and drm_mm_next_hole()
functions to identify suitable holes for an allocation of a given
size by efficently traversing the rbtree associated with the given
allocator.

It replaces the for loop in drm_mm_insert_node_in_range() and can
also be used by drm drivers to quickly identify holes of a certain
size within a given range.

Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
---
 drivers/gpu/drm/drm_mm.c | 28 ++++++++++++----------------
 include/drm/drm_mm.h     | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c
index 8257f9d4f619..416c849c10e5 100644
--- a/drivers/gpu/drm/drm_mm.c
+++ b/drivers/gpu/drm/drm_mm.c
@@ -352,10 +352,10 @@ static struct drm_mm_node *find_hole_addr(struct drm_mm *mm, u64 addr, u64 size)
 	return node;
 }
 
-static struct drm_mm_node *
-first_hole(struct drm_mm *mm,
-	   u64 start, u64 end, u64 size,
-	   enum drm_mm_insert_mode mode)
+struct drm_mm_node *
+drm_mm_first_hole(struct drm_mm *mm,
+		  u64 start, u64 end, u64 size,
+		  enum drm_mm_insert_mode mode)
 {
 	switch (mode) {
 	default:
@@ -374,6 +374,7 @@ first_hole(struct drm_mm *mm,
 						hole_stack);
 	}
 }
+EXPORT_SYMBOL(drm_mm_first_hole);
 
 /**
  * DECLARE_NEXT_HOLE_ADDR - macro to declare next hole functions
@@ -410,11 +411,11 @@ static struct drm_mm_node *name(struct drm_mm_node *entry, u64 size)	\
 DECLARE_NEXT_HOLE_ADDR(next_hole_high_addr, rb_left, rb_right)
 DECLARE_NEXT_HOLE_ADDR(next_hole_low_addr, rb_right, rb_left)
 
-static struct drm_mm_node *
-next_hole(struct drm_mm *mm,
-	  struct drm_mm_node *node,
-	  u64 size,
-	  enum drm_mm_insert_mode mode)
+struct drm_mm_node *
+drm_mm_next_hole(struct drm_mm *mm,
+		 struct drm_mm_node *node,
+		 u64 size,
+		 enum drm_mm_insert_mode mode)
 {
 	switch (mode) {
 	default:
@@ -432,6 +433,7 @@ next_hole(struct drm_mm *mm,
 		return &node->hole_stack == &mm->hole_stack ? NULL : node;
 	}
 }
+EXPORT_SYMBOL(drm_mm_next_hole);
 
 /**
  * drm_mm_reserve_node - insert an pre-initialized node
@@ -520,7 +522,6 @@ int drm_mm_insert_node_in_range(struct drm_mm * const mm,
 {
 	struct drm_mm_node *hole;
 	u64 remainder_mask;
-	bool once;
 
 	DRM_MM_BUG_ON(range_start > range_end);
 
@@ -533,13 +534,8 @@ int drm_mm_insert_node_in_range(struct drm_mm * const mm,
 	if (alignment <= 1)
 		alignment = 0;
 
-	once = mode & DRM_MM_INSERT_ONCE;
-	mode &= ~DRM_MM_INSERT_ONCE;
-
 	remainder_mask = is_power_of_2(alignment) ? alignment - 1 : 0;
-	for (hole = first_hole(mm, range_start, range_end, size, mode);
-	     hole;
-	     hole = once ? NULL : next_hole(mm, hole, size, mode)) {
+	drm_mm_for_each_best_hole(hole, mm, range_start, range_end, size, mode) {
 		u64 hole_start = __drm_mm_hole_node_start(hole);
 		u64 hole_end = hole_start + hole->hole_size;
 		u64 adj_start, adj_end;
diff --git a/include/drm/drm_mm.h b/include/drm/drm_mm.h
index ac33ba1b18bc..5055447697fa 100644
--- a/include/drm/drm_mm.h
+++ b/include/drm/drm_mm.h
@@ -322,6 +322,17 @@ static inline u64 __drm_mm_hole_node_end(const struct drm_mm_node *hole_node)
 	return list_next_entry(hole_node, node_list)->start;
 }
 
+struct drm_mm_node *
+drm_mm_first_hole(struct drm_mm *mm,
+		  u64 start, u64 end, u64 size,
+		  enum drm_mm_insert_mode mode);
+
+struct drm_mm_node *
+drm_mm_next_hole(struct drm_mm *mm,
+		 struct drm_mm_node *node,
+		 u64 size,
+		 enum drm_mm_insert_mode mode);
+
 /**
  * drm_mm_hole_node_end - computes the end of the hole following @node
  * @hole_node: drm_mm_node which implicitly tracks the following hole
@@ -400,6 +411,27 @@ static inline u64 drm_mm_hole_node_end(const struct drm_mm_node *hole_node)
 	     1 : 0; \
 	     pos = list_next_entry(pos, hole_stack))
 
+/**
+ * drm_mm_for_each_best_hole - iterator to optimally walk over all holes >= @size
+ * @pos: &drm_mm_node used internally to track progress
+ * @mm: &drm_mm allocator to walk
+ * @range_start: start of the allowed range for the allocation
+ * @range_end: end of the allowed range for the allocation
+ * @size: size of the allocation
+ * @mode: fine-tune the allocation search
+ *
+ * This iterator walks over all holes suitable for the allocation of given
+ * @size in a very efficient manner. It is implemented by calling
+ * drm_mm_first_hole() and drm_mm_next_hole() which identify the
+ * appropriate holes within the given range by efficently traversing the
+ * rbtree associated with @mm.
+ */
+#define drm_mm_for_each_best_hole(pos, mm, range_start, range_end, size, mode) \
+	for (pos = drm_mm_first_hole(mm, range_start, range_end, size, mode); \
+	     pos; \
+	     pos = mode & DRM_MM_INSERT_ONCE ? \
+	     NULL : drm_mm_next_hole(mm, hole, size, mode))
+
 /*
  * Basic range manager support (drm_mm.c)
  */
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v5)
  2022-02-02  1:11 [Intel-gfx] [PATCH 1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation Vivek Kasireddy
@ 2022-02-02  1:11 ` Vivek Kasireddy
  2022-02-02 13:19   ` Tvrtko Ursulin
  2022-02-02  1:43 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation Patchwork
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 19+ messages in thread
From: Vivek Kasireddy @ 2022-02-02  1:11 UTC (permalink / raw)
  To: intel-gfx

On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2 or
more framebuffers/scanout buffers results in only one that is mappable/
fenceable. Therefore, pageflipping between these 2 FBs where only one
is mappable/fenceable creates latencies large enough to miss alternate
vblanks thereby producing less optimal framerate.

This mainly happens because when i915_gem_object_pin_to_display_plane()
is called to pin one of the FB objs, the associated vma is identified
as misplaced and therefore i915_vma_unbind() is called which unbinds and
evicts it. This misplaced vma gets subseqently pinned only when
i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This
results in a latency of ~10ms and happens every other vblank/repaint cycle.
Therefore, to fix this issue, we try to see if there is space to map
at-least two objects of a given size and return early if there isn't. This
would ensure that we do not try with PIN_MAPPABLE for any objects that
are too big to map thereby preventing unncessary unbind.

Testcase:
Running Weston and weston-simple-egl on an Alderlake_S (ADLS) platform
with a 8K@60 mode results in only ~40 FPS. Since upstream Weston submits
a frame ~7ms before the next vblank, the latencies seen between atomic
commit and flip event are 7, 24 (7 + 16.66), 7, 24..... suggesting that
it misses the vblank every other frame.

Here is the ftrace snippet that shows the source of the ~10ms latency:
              i915_gem_object_pin_to_display_plane() {
0.102 us   |    i915_gem_object_set_cache_level();
                i915_gem_object_ggtt_pin_ww() {
0.390 us   |      i915_vma_instance();
0.178 us   |      i915_vma_misplaced();
                  i915_vma_unbind() {
                  __i915_active_wait() {
0.082 us   |        i915_active_acquire_if_busy();
0.475 us   |      }
                  intel_runtime_pm_get() {
0.087 us   |        intel_runtime_pm_acquire();
0.259 us   |      }
                  __i915_active_wait() {
0.085 us   |        i915_active_acquire_if_busy();
0.240 us   |      }
                  __i915_vma_evict() {
                    ggtt_unbind_vma() {
                      gen8_ggtt_clear_range() {
10507.255 us |        }
10507.689 us |      }
10508.516 us |   }

v2: Instead of using bigjoiner checks, determine whether a scanout
    buffer is too big by checking to see if it is possible to map
    two of them into the ggtt.

v3 (Ville):
- Count how many fb objects can be fit into the available holes
  instead of checking for a hole twice the object size.
- Take alignment constraints into account.
- Limit this large scanout buffer check to >= Gen 11 platforms.

v4:
- Remove existing heuristic that checks just for size. (Ville)
- Return early if we find space to map at-least two objects. (Tvrtko)
- Slightly update the commit message.

v5: (Tvrtko)
- Rename the function to indicate that the object may be too big to
  map into the aperture.
- Account for guard pages while calculating the total size required
  for the object.
- Do not subject all objects to the heuristic check and instead
  consider objects only of a certain size.
- Do the hole walk using the rbtree.
- Preserve the existing PIN_NONBLOCK logic.
- Drop the PIN_MAPPABLE check while pinning the VMA.

Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Manasi Navare <manasi.d.navare@intel.com>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 117 ++++++++++++++++++++++++--------
 1 file changed, 88 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e3a2c2a0e156..752fec2b4c60 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -46,6 +46,7 @@
 #include "gem/i915_gem_mman.h"
 #include "gem/i915_gem_region.h"
 #include "gem/i915_gem_userptr.h"
+#include "gem/i915_gem_tiling.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
 #include "gt/intel_gt_pm.h"
@@ -876,6 +877,92 @@ static void discard_ggtt_vma(struct i915_vma *vma)
 	spin_unlock(&obj->vma.lock);
 }
 
+static bool
+i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj,
+				 u64 alignment, u64 flags)
+{
+	struct drm_i915_private *i915 = to_i915(obj->base.dev);
+	struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
+	struct drm_mm_node *hole;
+	u64 hole_start, hole_end, start, end;
+	u64 fence_size, fence_alignment;
+	unsigned int count = 0;
+
+	/*
+	 * If the required space is larger than the available
+	 * aperture, we will not able to find a slot for the
+	 * object and unbinding the object now will be in
+	 * vain. Worse, doing so may cause us to ping-pong
+	 * the object in and out of the Global GTT and
+	 * waste a lot of cycles under the mutex.
+	 */
+	if (obj->base.size > ggtt->mappable_end)
+		return true;
+
+	/*
+	 * If NONBLOCK is set the caller is optimistically
+	 * trying to cache the full object within the mappable
+	 * aperture, and *must* have a fallback in place for
+	 * situations where we cannot bind the object. We
+	 * can be a little more lax here and use the fallback
+	 * more often to avoid costly migrations of ourselves
+	 * and other objects within the aperture.
+	 */
+	if (!(flags & PIN_NONBLOCK))
+		return false;
+
+	/*
+	 * We only consider objects whose size is at-least a quarter of
+	 * the aperture to be too big and subject them to the new
+	 * heuristic below.
+	 */
+	if (obj->base.size < ggtt->mappable_end / 4)
+		return false;
+
+	if (HAS_GMCH(i915) || DISPLAY_VER(i915) < 11 ||
+	    !i915_gem_object_is_framebuffer(obj))
+		return false;
+
+	fence_size = i915_gem_fence_size(i915, obj->base.size,
+					 i915_gem_object_get_tiling(obj),
+					 i915_gem_object_get_stride(obj));
+
+	if (i915_vm_has_cache_coloring(&ggtt->vm))
+		fence_size += 2 * I915_GTT_PAGE_SIZE;
+
+	fence_alignment = i915_gem_fence_alignment(i915, obj->base.size,
+						   i915_gem_object_get_tiling(obj),
+						   i915_gem_object_get_stride(obj));
+	alignment = max_t(u64, alignment, fence_alignment);
+
+	/*
+	 * Assuming this object is a large scanout buffer, we try to find
+	 * out if there is room to map at-least two of them. There could
+	 * be space available to map one but to be consistent, we try to
+	 * avoid mapping/fencing any of them.
+	 */
+	drm_mm_for_each_best_hole(hole, &ggtt->vm.mm, 0, ggtt->mappable_end,
+				  fence_size, DRM_MM_INSERT_LOW) {
+		hole_start = drm_mm_hole_node_start(hole);
+		hole_end = hole_start + hole->hole_size;
+
+		do {
+			start = round_up(hole_start, alignment);
+			end = min_t(u64, hole_end, ggtt->mappable_end);
+
+			if (range_overflows(start, fence_size, end))
+				break;
+
+			if (++count >= 2)
+				return false;
+
+			hole_start = start + fence_size;
+		} while (1);
+	}
+
+	return true;
+}
+
 struct i915_vma *
 i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj,
 			    struct i915_gem_ww_ctx *ww,
@@ -891,36 +978,8 @@ i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj,
 
 	if (flags & PIN_MAPPABLE &&
 	    (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
-		/*
-		 * If the required space is larger than the available
-		 * aperture, we will not able to find a slot for the
-		 * object and unbinding the object now will be in
-		 * vain. Worse, doing so may cause us to ping-pong
-		 * the object in and out of the Global GTT and
-		 * waste a lot of cycles under the mutex.
-		 */
-		if (obj->base.size > ggtt->mappable_end)
+		if (i915_gem_object_fits_in_aperture(obj, alignment, flags))
 			return ERR_PTR(-E2BIG);
-
-		/*
-		 * If NONBLOCK is set the caller is optimistically
-		 * trying to cache the full object within the mappable
-		 * aperture, and *must* have a fallback in place for
-		 * situations where we cannot bind the object. We
-		 * can be a little more lax here and use the fallback
-		 * more often to avoid costly migrations of ourselves
-		 * and other objects within the aperture.
-		 *
-		 * Half-the-aperture is used as a simple heuristic.
-		 * More interesting would to do search for a free
-		 * block prior to making the commitment to unbind.
-		 * That caters for the self-harm case, and with a
-		 * little more heuristics (e.g. NOFAULT, NOEVICT)
-		 * we could try to minimise harm to others.
-		 */
-		if (flags & PIN_NONBLOCK &&
-		    obj->base.size > ggtt->mappable_end / 2)
-			return ERR_PTR(-ENOSPC);
 	}
 
 new_vma:
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation
  2022-02-02  1:11 [Intel-gfx] [PATCH 1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation Vivek Kasireddy
  2022-02-02  1:11 ` [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v5) Vivek Kasireddy
@ 2022-02-02  1:43 ` Patchwork
  2022-02-02  1:45 ` [Intel-gfx] ✗ Fi.CI.SPARSE: " Patchwork
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2022-02-02  1:43 UTC (permalink / raw)
  To: Vivek Kasireddy; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation
URL   : https://patchwork.freedesktop.org/series/99597/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
737921ca12bb drm/mm: Add an iterator to optimally walk over holes for an allocation
-:9: WARNING:TYPO_SPELLING: 'efficently' may be misspelled - perhaps 'efficiently'?
#9: 
size by efficently traversing the rbtree associated with the given
        ^^^^^^^^^^

-:132: WARNING:TYPO_SPELLING: 'efficently' may be misspelled - perhaps 'efficiently'?
#132: FILE: include/drm/drm_mm.h:426:
+ * appropriate holes within the given range by efficently traversing the
                                                ^^^^^^^^^^

-:135: CHECK:MACRO_ARG_REUSE: Macro argument reuse 'pos' - possible side-effects?
#135: FILE: include/drm/drm_mm.h:429:
+#define drm_mm_for_each_best_hole(pos, mm, range_start, range_end, size, mode) \
+	for (pos = drm_mm_first_hole(mm, range_start, range_end, size, mode); \
+	     pos; \
+	     pos = mode & DRM_MM_INSERT_ONCE ? \
+	     NULL : drm_mm_next_hole(mm, hole, size, mode))

-:135: CHECK:MACRO_ARG_REUSE: Macro argument reuse 'mm' - possible side-effects?
#135: FILE: include/drm/drm_mm.h:429:
+#define drm_mm_for_each_best_hole(pos, mm, range_start, range_end, size, mode) \
+	for (pos = drm_mm_first_hole(mm, range_start, range_end, size, mode); \
+	     pos; \
+	     pos = mode & DRM_MM_INSERT_ONCE ? \
+	     NULL : drm_mm_next_hole(mm, hole, size, mode))

-:135: CHECK:MACRO_ARG_REUSE: Macro argument reuse 'size' - possible side-effects?
#135: FILE: include/drm/drm_mm.h:429:
+#define drm_mm_for_each_best_hole(pos, mm, range_start, range_end, size, mode) \
+	for (pos = drm_mm_first_hole(mm, range_start, range_end, size, mode); \
+	     pos; \
+	     pos = mode & DRM_MM_INSERT_ONCE ? \
+	     NULL : drm_mm_next_hole(mm, hole, size, mode))

-:135: CHECK:MACRO_ARG_REUSE: Macro argument reuse 'mode' - possible side-effects?
#135: FILE: include/drm/drm_mm.h:429:
+#define drm_mm_for_each_best_hole(pos, mm, range_start, range_end, size, mode) \
+	for (pos = drm_mm_first_hole(mm, range_start, range_end, size, mode); \
+	     pos; \
+	     pos = mode & DRM_MM_INSERT_ONCE ? \
+	     NULL : drm_mm_next_hole(mm, hole, size, mode))

total: 0 errors, 2 warnings, 4 checks, 109 lines checked
6b91082aa6e9 drm/i915/gem: Don't try to map and fence large scanout buffers (v5)



^ permalink raw reply	[flat|nested] 19+ messages in thread

* [Intel-gfx] ✗ Fi.CI.SPARSE: warning for series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation
  2022-02-02  1:11 [Intel-gfx] [PATCH 1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation Vivek Kasireddy
  2022-02-02  1:11 ` [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v5) Vivek Kasireddy
  2022-02-02  1:43 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation Patchwork
@ 2022-02-02  1:45 ` Patchwork
  2022-02-02  2:09 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
  2022-02-04  1:45 ` [Intel-gfx] ✗ Fi.CI.BUILD: failure for series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation (rev2) Patchwork
  4 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2022-02-02  1:45 UTC (permalink / raw)
  To: Vivek Kasireddy; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation
URL   : https://patchwork.freedesktop.org/series/99597/
State : warning

== Summary ==

$ dim sparse --fast origin/drm-tip
Sparse version: v0.6.2
Fast mode used, each commit won't be checked separately.



^ permalink raw reply	[flat|nested] 19+ messages in thread

* [Intel-gfx] ✗ Fi.CI.BAT: failure for series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation
  2022-02-02  1:11 [Intel-gfx] [PATCH 1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation Vivek Kasireddy
                   ` (2 preceding siblings ...)
  2022-02-02  1:45 ` [Intel-gfx] ✗ Fi.CI.SPARSE: " Patchwork
@ 2022-02-02  2:09 ` Patchwork
  2022-02-04  1:45 ` [Intel-gfx] ✗ Fi.CI.BUILD: failure for series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation (rev2) Patchwork
  4 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2022-02-02  2:09 UTC (permalink / raw)
  To: Vivek Kasireddy; +Cc: intel-gfx

[-- Attachment #1: Type: text/plain, Size: 6654 bytes --]

== Series Details ==

Series: series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation
URL   : https://patchwork.freedesktop.org/series/99597/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_11176 -> Patchwork_22151
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_22151 absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_22151, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/index.html

Participating hosts (45 -> 41)
------------------------------

  Missing    (4): fi-kbl-soraka fi-bdw-samus fi-icl-u2 shard-tglu 

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_22151:

### IGT changes ###

#### Possible regressions ####

  * igt@i915_selftest@live@workarounds:
    - fi-snb-2520m:       [PASS][1] -> [DMESG-FAIL][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11176/fi-snb-2520m/igt@i915_selftest@live@workarounds.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/fi-snb-2520m/igt@i915_selftest@live@workarounds.html
    - fi-snb-2600:        [PASS][3] -> [DMESG-FAIL][4]
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11176/fi-snb-2600/igt@i915_selftest@live@workarounds.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/fi-snb-2600/igt@i915_selftest@live@workarounds.html

  
Known issues
------------

  Here are the changes found in Patchwork_22151 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@amdgpu/amd_cs_nop@sync-fork-compute0:
    - fi-snb-2600:        NOTRUN -> [SKIP][5] ([fdo#109271]) +17 similar issues
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/fi-snb-2600/igt@amdgpu/amd_cs_nop@sync-fork-compute0.html

  * igt@gem_flink_basic@bad-flink:
    - fi-skl-6600u:       [PASS][6] -> [INCOMPLETE][7] ([i915#4547])
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11176/fi-skl-6600u/igt@gem_flink_basic@bad-flink.html
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/fi-skl-6600u/igt@gem_flink_basic@bad-flink.html

  * igt@i915_selftest@live@requests:
    - fi-blb-e6850:       [PASS][8] -> [DMESG-FAIL][9] ([i915#5026])
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11176/fi-blb-e6850/igt@i915_selftest@live@requests.html
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/fi-blb-e6850/igt@i915_selftest@live@requests.html

  * igt@kms_frontbuffer_tracking@basic:
    - fi-cml-u2:          [PASS][10] -> [DMESG-WARN][11] ([i915#4269])
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11176/fi-cml-u2/igt@kms_frontbuffer_tracking@basic.html
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/fi-cml-u2/igt@kms_frontbuffer_tracking@basic.html

  * igt@runner@aborted:
    - fi-blb-e6850:       NOTRUN -> [FAIL][12] ([fdo#109271] / [i915#2403] / [i915#2426] / [i915#4312])
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/fi-blb-e6850/igt@runner@aborted.html

  
#### Possible fixes ####

  * igt@i915_selftest@live@gem_contexts:
    - bat-adlp-4:         [INCOMPLETE][13] -> [PASS][14]
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11176/bat-adlp-4/igt@i915_selftest@live@gem_contexts.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/bat-adlp-4/igt@i915_selftest@live@gem_contexts.html

  * igt@i915_selftest@live@gt_pm:
    - {fi-jsl-1}:         [DMESG-FAIL][15] ([i915#1886]) -> [PASS][16]
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11176/fi-jsl-1/igt@i915_selftest@live@gt_pm.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/fi-jsl-1/igt@i915_selftest@live@gt_pm.html

  * igt@i915_selftest@live@hangcheck:
    - fi-snb-2600:        [INCOMPLETE][17] ([i915#3921]) -> [PASS][18]
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11176/fi-snb-2600/igt@i915_selftest@live@hangcheck.html
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/fi-snb-2600/igt@i915_selftest@live@hangcheck.html

  * igt@kms_frontbuffer_tracking@basic:
    - fi-cfl-8109u:       [DMESG-FAIL][19] ([i915#295]) -> [PASS][20]
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11176/fi-cfl-8109u/igt@kms_frontbuffer_tracking@basic.html
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/fi-cfl-8109u/igt@kms_frontbuffer_tracking@basic.html

  * igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-b:
    - fi-cfl-8109u:       [DMESG-WARN][21] ([i915#295]) -> [PASS][22] +10 similar issues
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11176/fi-cfl-8109u/igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-b.html
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/fi-cfl-8109u/igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-b.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [i915#1886]: https://gitlab.freedesktop.org/drm/intel/issues/1886
  [i915#2403]: https://gitlab.freedesktop.org/drm/intel/issues/2403
  [i915#2426]: https://gitlab.freedesktop.org/drm/intel/issues/2426
  [i915#295]: https://gitlab.freedesktop.org/drm/intel/issues/295
  [i915#3921]: https://gitlab.freedesktop.org/drm/intel/issues/3921
  [i915#4269]: https://gitlab.freedesktop.org/drm/intel/issues/4269
  [i915#4312]: https://gitlab.freedesktop.org/drm/intel/issues/4312
  [i915#4547]: https://gitlab.freedesktop.org/drm/intel/issues/4547
  [i915#5026]: https://gitlab.freedesktop.org/drm/intel/issues/5026


Build changes
-------------

  * Linux: CI_DRM_11176 -> Patchwork_22151

  CI-20190529: 20190529
  CI_DRM_11176: 196c30496e1aa8a456195d993619e2960b5b0b45 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_6337: 7c9c034619ef9dbfbfe041fbf3973a1cf1ac7a22 @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
  Patchwork_22151: 6b91082aa6e930ca18557551a33bcfae8fc38ec7 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

6b91082aa6e9 drm/i915/gem: Don't try to map and fence large scanout buffers (v5)
737921ca12bb drm/mm: Add an iterator to optimally walk over holes for an allocation

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_22151/index.html

[-- Attachment #2: Type: text/html, Size: 7715 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v5)
  2022-02-02  1:11 ` [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v5) Vivek Kasireddy
@ 2022-02-02 13:19   ` Tvrtko Ursulin
  2022-02-04  1:22     ` [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6) Vivek Kasireddy
  0 siblings, 1 reply; 19+ messages in thread
From: Tvrtko Ursulin @ 2022-02-02 13:19 UTC (permalink / raw)
  To: Vivek Kasireddy, intel-gfx


On 02/02/2022 01:11, Vivek Kasireddy wrote:
> On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2 or
> more framebuffers/scanout buffers results in only one that is mappable/
> fenceable. Therefore, pageflipping between these 2 FBs where only one
> is mappable/fenceable creates latencies large enough to miss alternate
> vblanks thereby producing less optimal framerate.
> 
> This mainly happens because when i915_gem_object_pin_to_display_plane()
> is called to pin one of the FB objs, the associated vma is identified
> as misplaced and therefore i915_vma_unbind() is called which unbinds and
> evicts it. This misplaced vma gets subseqently pinned only when
> i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This
> results in a latency of ~10ms and happens every other vblank/repaint cycle.
> Therefore, to fix this issue, we try to see if there is space to map
> at-least two objects of a given size and return early if there isn't. This
> would ensure that we do not try with PIN_MAPPABLE for any objects that
> are too big to map thereby preventing unncessary unbind.
> 
> Testcase:
> Running Weston and weston-simple-egl on an Alderlake_S (ADLS) platform
> with a 8K@60 mode results in only ~40 FPS. Since upstream Weston submits
> a frame ~7ms before the next vblank, the latencies seen between atomic
> commit and flip event are 7, 24 (7 + 16.66), 7, 24..... suggesting that
> it misses the vblank every other frame.
> 
> Here is the ftrace snippet that shows the source of the ~10ms latency:
>                i915_gem_object_pin_to_display_plane() {
> 0.102 us   |    i915_gem_object_set_cache_level();
>                  i915_gem_object_ggtt_pin_ww() {
> 0.390 us   |      i915_vma_instance();
> 0.178 us   |      i915_vma_misplaced();
>                    i915_vma_unbind() {
>                    __i915_active_wait() {
> 0.082 us   |        i915_active_acquire_if_busy();
> 0.475 us   |      }
>                    intel_runtime_pm_get() {
> 0.087 us   |        intel_runtime_pm_acquire();
> 0.259 us   |      }
>                    __i915_active_wait() {
> 0.085 us   |        i915_active_acquire_if_busy();
> 0.240 us   |      }
>                    __i915_vma_evict() {
>                      ggtt_unbind_vma() {
>                        gen8_ggtt_clear_range() {
> 10507.255 us |        }
> 10507.689 us |      }
> 10508.516 us |   }
> 
> v2: Instead of using bigjoiner checks, determine whether a scanout
>      buffer is too big by checking to see if it is possible to map
>      two of them into the ggtt.
> 
> v3 (Ville):
> - Count how many fb objects can be fit into the available holes
>    instead of checking for a hole twice the object size.
> - Take alignment constraints into account.
> - Limit this large scanout buffer check to >= Gen 11 platforms.
> 
> v4:
> - Remove existing heuristic that checks just for size. (Ville)
> - Return early if we find space to map at-least two objects. (Tvrtko)
> - Slightly update the commit message.
> 
> v5: (Tvrtko)
> - Rename the function to indicate that the object may be too big to
>    map into the aperture.
> - Account for guard pages while calculating the total size required
>    for the object.
> - Do not subject all objects to the heuristic check and instead
>    consider objects only of a certain size.
> - Do the hole walk using the rbtree.
> - Preserve the existing PIN_NONBLOCK logic.
> - Drop the PIN_MAPPABLE check while pinning the VMA.

Looks nice and clean. Just one question below. Sorry, it's a consequence 
of noticing more when it's nice and clean. :)

> 
> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Manasi Navare <manasi.d.navare@intel.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem.c | 117 ++++++++++++++++++++++++--------
>   1 file changed, 88 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index e3a2c2a0e156..752fec2b4c60 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -46,6 +46,7 @@
>   #include "gem/i915_gem_mman.h"
>   #include "gem/i915_gem_region.h"
>   #include "gem/i915_gem_userptr.h"
> +#include "gem/i915_gem_tiling.h"
>   #include "gt/intel_engine_user.h"
>   #include "gt/intel_gt.h"
>   #include "gt/intel_gt_pm.h"
> @@ -876,6 +877,92 @@ static void discard_ggtt_vma(struct i915_vma *vma)
>   	spin_unlock(&obj->vma.lock);
>   }
>   
> +static bool
> +i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj,
> +				 u64 alignment, u64 flags)
> +{
> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> +	struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
> +	struct drm_mm_node *hole;
> +	u64 hole_start, hole_end, start, end;
> +	u64 fence_size, fence_alignment;
> +	unsigned int count = 0;
> +
> +	/*
> +	 * If the required space is larger than the available
> +	 * aperture, we will not able to find a slot for the
> +	 * object and unbinding the object now will be in
> +	 * vain. Worse, doing so may cause us to ping-pong
> +	 * the object in and out of the Global GTT and
> +	 * waste a lot of cycles under the mutex.
> +	 */
> +	if (obj->base.size > ggtt->mappable_end)
> +		return true;

Noticed we used to report E2BIG for this case and ENOSPC for the one 
below. I don't know if it would be important to keep that distinction 
without digging into the code, but sounds plausible that it could be 
(for tests and such at least). To err on the side of safety it may be 
easier to make this helper return int 0 for success and -error? That way 
it's trivial to preserve the existing behaviour wrt error codes.

Regards,

Tvrtko

> +
> +	/*
> +	 * If NONBLOCK is set the caller is optimistically
> +	 * trying to cache the full object within the mappable
> +	 * aperture, and *must* have a fallback in place for
> +	 * situations where we cannot bind the object. We
> +	 * can be a little more lax here and use the fallback
> +	 * more often to avoid costly migrations of ourselves
> +	 * and other objects within the aperture.
> +	 */
> +	if (!(flags & PIN_NONBLOCK))
> +		return false;
> +
> +	/*
> +	 * We only consider objects whose size is at-least a quarter of
> +	 * the aperture to be too big and subject them to the new
> +	 * heuristic below.
> +	 */
> +	if (obj->base.size < ggtt->mappable_end / 4)
> +		return false;
> +
> +	if (HAS_GMCH(i915) || DISPLAY_VER(i915) < 11 ||
> +	    !i915_gem_object_is_framebuffer(obj))
> +		return false;
> +
> +	fence_size = i915_gem_fence_size(i915, obj->base.size,
> +					 i915_gem_object_get_tiling(obj),
> +					 i915_gem_object_get_stride(obj));
> +
> +	if (i915_vm_has_cache_coloring(&ggtt->vm))
> +		fence_size += 2 * I915_GTT_PAGE_SIZE;
> +
> +	fence_alignment = i915_gem_fence_alignment(i915, obj->base.size,
> +						   i915_gem_object_get_tiling(obj),
> +						   i915_gem_object_get_stride(obj));
> +	alignment = max_t(u64, alignment, fence_alignment);
> +
> +	/*
> +	 * Assuming this object is a large scanout buffer, we try to find
> +	 * out if there is room to map at-least two of them. There could
> +	 * be space available to map one but to be consistent, we try to
> +	 * avoid mapping/fencing any of them.
> +	 */
> +	drm_mm_for_each_best_hole(hole, &ggtt->vm.mm, 0, ggtt->mappable_end,
> +				  fence_size, DRM_MM_INSERT_LOW) {
> +		hole_start = drm_mm_hole_node_start(hole);
> +		hole_end = hole_start + hole->hole_size;
> +
> +		do {
> +			start = round_up(hole_start, alignment);
> +			end = min_t(u64, hole_end, ggtt->mappable_end);
> +
> +			if (range_overflows(start, fence_size, end))
> +				break;
> +
> +			if (++count >= 2)
> +				return false;
> +
> +			hole_start = start + fence_size;
> +		} while (1);
> +	}
> +
> +	return true;
> +}
> +
>   struct i915_vma *
>   i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj,
>   			    struct i915_gem_ww_ctx *ww,
> @@ -891,36 +978,8 @@ i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj,
>   
>   	if (flags & PIN_MAPPABLE &&
>   	    (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
> -		/*
> -		 * If the required space is larger than the available
> -		 * aperture, we will not able to find a slot for the
> -		 * object and unbinding the object now will be in
> -		 * vain. Worse, doing so may cause us to ping-pong
> -		 * the object in and out of the Global GTT and
> -		 * waste a lot of cycles under the mutex.
> -		 */
> -		if (obj->base.size > ggtt->mappable_end)
> +		if (i915_gem_object_fits_in_aperture(obj, alignment, flags))
>   			return ERR_PTR(-E2BIG);
> -
> -		/*
> -		 * If NONBLOCK is set the caller is optimistically
> -		 * trying to cache the full object within the mappable
> -		 * aperture, and *must* have a fallback in place for
> -		 * situations where we cannot bind the object. We
> -		 * can be a little more lax here and use the fallback
> -		 * more often to avoid costly migrations of ourselves
> -		 * and other objects within the aperture.
> -		 *
> -		 * Half-the-aperture is used as a simple heuristic.
> -		 * More interesting would to do search for a free
> -		 * block prior to making the commitment to unbind.
> -		 * That caters for the self-harm case, and with a
> -		 * little more heuristics (e.g. NOFAULT, NOEVICT)
> -		 * we could try to minimise harm to others.
> -		 */
> -		if (flags & PIN_NONBLOCK &&
> -		    obj->base.size > ggtt->mappable_end / 2)
> -			return ERR_PTR(-ENOSPC);
>   	}
>   
>   new_vma:

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6)
  2022-02-02 13:19   ` Tvrtko Ursulin
@ 2022-02-04  1:22     ` Vivek Kasireddy
  2022-02-07 10:49       ` Tvrtko Ursulin
  2022-02-07 10:58       ` Ville Syrjälä
  0 siblings, 2 replies; 19+ messages in thread
From: Vivek Kasireddy @ 2022-02-04  1:22 UTC (permalink / raw)
  To: intel-gfx

On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2 or
more framebuffers/scanout buffers results in only one that is mappable/
fenceable. Therefore, pageflipping between these 2 FBs where only one
is mappable/fenceable creates latencies large enough to miss alternate
vblanks thereby producing less optimal framerate.

This mainly happens because when i915_gem_object_pin_to_display_plane()
is called to pin one of the FB objs, the associated vma is identified
as misplaced and therefore i915_vma_unbind() is called which unbinds and
evicts it. This misplaced vma gets subseqently pinned only when
i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This
results in a latency of ~10ms and happens every other vblank/repaint cycle.
Therefore, to fix this issue, we try to see if there is space to map
at-least two objects of a given size and return early if there isn't. This
would ensure that we do not try with PIN_MAPPABLE for any objects that
are too big to map thereby preventing unncessary unbind.

Testcase:
Running Weston and weston-simple-egl on an Alderlake_S (ADLS) platform
with a 8K@60 mode results in only ~40 FPS. Since upstream Weston submits
a frame ~7ms before the next vblank, the latencies seen between atomic
commit and flip event are 7, 24 (7 + 16.66), 7, 24..... suggesting that
it misses the vblank every other frame.

Here is the ftrace snippet that shows the source of the ~10ms latency:
              i915_gem_object_pin_to_display_plane() {
0.102 us   |    i915_gem_object_set_cache_level();
                i915_gem_object_ggtt_pin_ww() {
0.390 us   |      i915_vma_instance();
0.178 us   |      i915_vma_misplaced();
                  i915_vma_unbind() {
                  __i915_active_wait() {
0.082 us   |        i915_active_acquire_if_busy();
0.475 us   |      }
                  intel_runtime_pm_get() {
0.087 us   |        intel_runtime_pm_acquire();
0.259 us   |      }
                  __i915_active_wait() {
0.085 us   |        i915_active_acquire_if_busy();
0.240 us   |      }
                  __i915_vma_evict() {
                    ggtt_unbind_vma() {
                      gen8_ggtt_clear_range() {
10507.255 us |        }
10507.689 us |      }
10508.516 us |   }

v2: Instead of using bigjoiner checks, determine whether a scanout
    buffer is too big by checking to see if it is possible to map
    two of them into the ggtt.

v3 (Ville):
- Count how many fb objects can be fit into the available holes
  instead of checking for a hole twice the object size.
- Take alignment constraints into account.
- Limit this large scanout buffer check to >= Gen 11 platforms.

v4:
- Remove existing heuristic that checks just for size. (Ville)
- Return early if we find space to map at-least two objects. (Tvrtko)
- Slightly update the commit message.

v5: (Tvrtko)
- Rename the function to indicate that the object may be too big to
  map into the aperture.
- Account for guard pages while calculating the total size required
  for the object.
- Do not subject all objects to the heuristic check and instead
  consider objects only of a certain size.
- Do the hole walk using the rbtree.
- Preserve the existing PIN_NONBLOCK logic.
- Drop the PIN_MAPPABLE check while pinning the VMA.

v6: (Tvrtko)
- Return 0 on success and the specific error code on failure to
  preserve the existing behavior.

Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Manasi Navare <manasi.d.navare@intel.com>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 120 ++++++++++++++++++++++++--------
 1 file changed, 90 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e3a2c2a0e156..39f0d17550c3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -46,6 +46,7 @@
 #include "gem/i915_gem_mman.h"
 #include "gem/i915_gem_region.h"
 #include "gem/i915_gem_userptr.h"
+#include "gem/i915_gem_tiling.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_gt.h"
 #include "gt/intel_gt_pm.h"
@@ -876,6 +877,92 @@ static void discard_ggtt_vma(struct i915_vma *vma)
 	spin_unlock(&obj->vma.lock);
 }
 
+static int
+i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj,
+				 u64 alignment, u64 flags)
+{
+	struct drm_i915_private *i915 = to_i915(obj->base.dev);
+	struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
+	struct drm_mm_node *hole;
+	u64 hole_start, hole_end, start, end;
+	u64 fence_size, fence_alignment;
+	unsigned int count = 0;
+
+	/*
+	 * If the required space is larger than the available
+	 * aperture, we will not able to find a slot for the
+	 * object and unbinding the object now will be in
+	 * vain. Worse, doing so may cause us to ping-pong
+	 * the object in and out of the Global GTT and
+	 * waste a lot of cycles under the mutex.
+	 */
+	if (obj->base.size > ggtt->mappable_end)
+		return -E2BIG;
+
+	/*
+	 * If NONBLOCK is set the caller is optimistically
+	 * trying to cache the full object within the mappable
+	 * aperture, and *must* have a fallback in place for
+	 * situations where we cannot bind the object. We
+	 * can be a little more lax here and use the fallback
+	 * more often to avoid costly migrations of ourselves
+	 * and other objects within the aperture.
+	 */
+	if (!(flags & PIN_NONBLOCK))
+		return 0;
+
+	/*
+	 * We only consider objects whose size is at-least a quarter of
+	 * the aperture to be too big and subject them to the new
+	 * heuristic below.
+	 */
+	if (obj->base.size < ggtt->mappable_end / 4)
+		return 0;
+
+	if (HAS_GMCH(i915) || DISPLAY_VER(i915) < 11 ||
+	    !i915_gem_object_is_framebuffer(obj))
+		return 0;
+
+	fence_size = i915_gem_fence_size(i915, obj->base.size,
+					 i915_gem_object_get_tiling(obj),
+					 i915_gem_object_get_stride(obj));
+
+	if (i915_vm_has_cache_coloring(&ggtt->vm))
+		fence_size += 2 * I915_GTT_PAGE_SIZE;
+
+	fence_alignment = i915_gem_fence_alignment(i915, obj->base.size,
+						   i915_gem_object_get_tiling(obj),
+						   i915_gem_object_get_stride(obj));
+	alignment = max_t(u64, alignment, fence_alignment);
+
+	/*
+	 * Assuming this object is a large scanout buffer, we try to find
+	 * out if there is room to map at-least two of them. There could
+	 * be space available to map one but to be consistent, we try to
+	 * avoid mapping/fencing any of them.
+	 */
+	drm_mm_for_each_suitable_hole(hole, &ggtt->vm.mm, 0, ggtt->mappable_end,
+				      fence_size, DRM_MM_INSERT_LOW) {
+		hole_start = drm_mm_hole_node_start(hole);
+		hole_end = hole_start + hole->hole_size;
+
+		do {
+			start = round_up(hole_start, alignment);
+			end = min_t(u64, hole_end, ggtt->mappable_end);
+
+			if (range_overflows(start, fence_size, end))
+				break;
+
+			if (++count >= 2)
+				return 0;
+
+			hole_start = start + fence_size;
+		} while (1);
+	}
+
+	return -ENOSPC;
+}
+
 struct i915_vma *
 i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj,
 			    struct i915_gem_ww_ctx *ww,
@@ -891,36 +978,9 @@ i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj,
 
 	if (flags & PIN_MAPPABLE &&
 	    (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
-		/*
-		 * If the required space is larger than the available
-		 * aperture, we will not able to find a slot for the
-		 * object and unbinding the object now will be in
-		 * vain. Worse, doing so may cause us to ping-pong
-		 * the object in and out of the Global GTT and
-		 * waste a lot of cycles under the mutex.
-		 */
-		if (obj->base.size > ggtt->mappable_end)
-			return ERR_PTR(-E2BIG);
-
-		/*
-		 * If NONBLOCK is set the caller is optimistically
-		 * trying to cache the full object within the mappable
-		 * aperture, and *must* have a fallback in place for
-		 * situations where we cannot bind the object. We
-		 * can be a little more lax here and use the fallback
-		 * more often to avoid costly migrations of ourselves
-		 * and other objects within the aperture.
-		 *
-		 * Half-the-aperture is used as a simple heuristic.
-		 * More interesting would to do search for a free
-		 * block prior to making the commitment to unbind.
-		 * That caters for the self-harm case, and with a
-		 * little more heuristics (e.g. NOFAULT, NOEVICT)
-		 * we could try to minimise harm to others.
-		 */
-		if (flags & PIN_NONBLOCK &&
-		    obj->base.size > ggtt->mappable_end / 2)
-			return ERR_PTR(-ENOSPC);
+		ret = i915_gem_object_fits_in_aperture(obj, alignment, flags);
+		if (ret)
+			return ERR_PTR(ret);
 	}
 
 new_vma:
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [Intel-gfx] ✗ Fi.CI.BUILD: failure for series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation (rev2)
  2022-02-02  1:11 [Intel-gfx] [PATCH 1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation Vivek Kasireddy
                   ` (3 preceding siblings ...)
  2022-02-02  2:09 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
@ 2022-02-04  1:45 ` Patchwork
  2022-02-07 10:51   ` Tvrtko Ursulin
  4 siblings, 1 reply; 19+ messages in thread
From: Patchwork @ 2022-02-04  1:45 UTC (permalink / raw)
  To: Vivek Kasireddy; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation (rev2)
URL   : https://patchwork.freedesktop.org/series/99597/
State : failure

== Summary ==

CALL    scripts/checksyscalls.sh
  CALL    scripts/atomic/check-atomics.sh
  DESCEND objtool
  CHK     include/generated/compile.h
  CC [M]  drivers/gpu/drm/i915/i915_gem.o
drivers/gpu/drm/i915/i915_gem.c: In function ‘i915_gem_object_fits_in_aperture’:
drivers/gpu/drm/i915/i915_gem.c:944:2: error: implicit declaration of function ‘drm_mm_for_each_suitable_hole’; did you mean ‘drm_mm_for_each_best_hole’? [-Werror=implicit-function-declaration]
  drm_mm_for_each_suitable_hole(hole, &ggtt->vm.mm, 0, ggtt->mappable_end,
  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  drm_mm_for_each_best_hole
drivers/gpu/drm/i915/i915_gem.c:945:41: error: expected ‘;’ before ‘{’ token
           fence_size, DRM_MM_INSERT_LOW) {
                                         ^~
                                         ;
drivers/gpu/drm/i915/i915_gem.c:889:15: error: unused variable ‘count’ [-Werror=unused-variable]
  unsigned int count = 0;
               ^~~~~
drivers/gpu/drm/i915/i915_gem.c:887:35: error: unused variable ‘end’ [-Werror=unused-variable]
  u64 hole_start, hole_end, start, end;
                                   ^~~
drivers/gpu/drm/i915/i915_gem.c:887:28: error: unused variable ‘start’ [-Werror=unused-variable]
  u64 hole_start, hole_end, start, end;
                            ^~~~~
drivers/gpu/drm/i915/i915_gem.c:887:18: error: unused variable ‘hole_end’ [-Werror=unused-variable]
  u64 hole_start, hole_end, start, end;
                  ^~~~~~~~
drivers/gpu/drm/i915/i915_gem.c:887:6: error: unused variable ‘hole_start’ [-Werror=unused-variable]
  u64 hole_start, hole_end, start, end;
      ^~~~~~~~~~
drivers/gpu/drm/i915/i915_gem.c:964:1: error: control reaches end of non-void function [-Werror=return-type]
 }
 ^
cc1: all warnings being treated as errors
scripts/Makefile.build:288: recipe for target 'drivers/gpu/drm/i915/i915_gem.o' failed
make[4]: *** [drivers/gpu/drm/i915/i915_gem.o] Error 1
scripts/Makefile.build:550: recipe for target 'drivers/gpu/drm/i915' failed
make[3]: *** [drivers/gpu/drm/i915] Error 2
scripts/Makefile.build:550: recipe for target 'drivers/gpu/drm' failed
make[2]: *** [drivers/gpu/drm] Error 2
scripts/Makefile.build:550: recipe for target 'drivers/gpu' failed
make[1]: *** [drivers/gpu] Error 2
Makefile:1831: recipe for target 'drivers' failed
make: *** [drivers] Error 2



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6)
  2022-02-04  1:22     ` [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6) Vivek Kasireddy
@ 2022-02-07 10:49       ` Tvrtko Ursulin
  2022-02-07 10:58       ` Ville Syrjälä
  1 sibling, 0 replies; 19+ messages in thread
From: Tvrtko Ursulin @ 2022-02-07 10:49 UTC (permalink / raw)
  To: Vivek Kasireddy, intel-gfx


On 04/02/2022 01:22, Vivek Kasireddy wrote:
> On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2 or
> more framebuffers/scanout buffers results in only one that is mappable/
> fenceable. Therefore, pageflipping between these 2 FBs where only one
> is mappable/fenceable creates latencies large enough to miss alternate
> vblanks thereby producing less optimal framerate.
> 
> This mainly happens because when i915_gem_object_pin_to_display_plane()
> is called to pin one of the FB objs, the associated vma is identified
> as misplaced and therefore i915_vma_unbind() is called which unbinds and
> evicts it. This misplaced vma gets subseqently pinned only when
> i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This
> results in a latency of ~10ms and happens every other vblank/repaint cycle.
> Therefore, to fix this issue, we try to see if there is space to map
> at-least two objects of a given size and return early if there isn't. This
> would ensure that we do not try with PIN_MAPPABLE for any objects that
> are too big to map thereby preventing unncessary unbind.
> 
> Testcase:
> Running Weston and weston-simple-egl on an Alderlake_S (ADLS) platform
> with a 8K@60 mode results in only ~40 FPS. Since upstream Weston submits
> a frame ~7ms before the next vblank, the latencies seen between atomic
> commit and flip event are 7, 24 (7 + 16.66), 7, 24..... suggesting that
> it misses the vblank every other frame.
> 
> Here is the ftrace snippet that shows the source of the ~10ms latency:
>                i915_gem_object_pin_to_display_plane() {
> 0.102 us   |    i915_gem_object_set_cache_level();
>                  i915_gem_object_ggtt_pin_ww() {
> 0.390 us   |      i915_vma_instance();
> 0.178 us   |      i915_vma_misplaced();
>                    i915_vma_unbind() {
>                    __i915_active_wait() {
> 0.082 us   |        i915_active_acquire_if_busy();
> 0.475 us   |      }
>                    intel_runtime_pm_get() {
> 0.087 us   |        intel_runtime_pm_acquire();
> 0.259 us   |      }
>                    __i915_active_wait() {
> 0.085 us   |        i915_active_acquire_if_busy();
> 0.240 us   |      }
>                    __i915_vma_evict() {
>                      ggtt_unbind_vma() {
>                        gen8_ggtt_clear_range() {
> 10507.255 us |        }
> 10507.689 us |      }
> 10508.516 us |   }
> 
> v2: Instead of using bigjoiner checks, determine whether a scanout
>      buffer is too big by checking to see if it is possible to map
>      two of them into the ggtt.
> 
> v3 (Ville):
> - Count how many fb objects can be fit into the available holes
>    instead of checking for a hole twice the object size.
> - Take alignment constraints into account.
> - Limit this large scanout buffer check to >= Gen 11 platforms.
> 
> v4:
> - Remove existing heuristic that checks just for size. (Ville)
> - Return early if we find space to map at-least two objects. (Tvrtko)
> - Slightly update the commit message.
> 
> v5: (Tvrtko)
> - Rename the function to indicate that the object may be too big to
>    map into the aperture.
> - Account for guard pages while calculating the total size required
>    for the object.
> - Do not subject all objects to the heuristic check and instead
>    consider objects only of a certain size.
> - Do the hole walk using the rbtree.
> - Preserve the existing PIN_NONBLOCK logic.
> - Drop the PIN_MAPPABLE check while pinning the VMA.
> 
> v6: (Tvrtko)
> - Return 0 on success and the specific error code on failure to
>    preserve the existing behavior.
> 
> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Manasi Navare <manasi.d.navare@intel.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem.c | 120 ++++++++++++++++++++++++--------
>   1 file changed, 90 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index e3a2c2a0e156..39f0d17550c3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -46,6 +46,7 @@
>   #include "gem/i915_gem_mman.h"
>   #include "gem/i915_gem_region.h"
>   #include "gem/i915_gem_userptr.h"
> +#include "gem/i915_gem_tiling.h"
>   #include "gt/intel_engine_user.h"
>   #include "gt/intel_gt.h"
>   #include "gt/intel_gt_pm.h"
> @@ -876,6 +877,92 @@ static void discard_ggtt_vma(struct i915_vma *vma)
>   	spin_unlock(&obj->vma.lock);
>   }
>   
> +static int
> +i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj,
> +				 u64 alignment, u64 flags)
> +{
> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> +	struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
> +	struct drm_mm_node *hole;
> +	u64 hole_start, hole_end, start, end;
> +	u64 fence_size, fence_alignment;
> +	unsigned int count = 0;
> +
> +	/*
> +	 * If the required space is larger than the available
> +	 * aperture, we will not able to find a slot for the
> +	 * object and unbinding the object now will be in
> +	 * vain. Worse, doing so may cause us to ping-pong
> +	 * the object in and out of the Global GTT and
> +	 * waste a lot of cycles under the mutex.
> +	 */
> +	if (obj->base.size > ggtt->mappable_end)
> +		return -E2BIG;
> +
> +	/*
> +	 * If NONBLOCK is set the caller is optimistically
> +	 * trying to cache the full object within the mappable
> +	 * aperture, and *must* have a fallback in place for
> +	 * situations where we cannot bind the object. We
> +	 * can be a little more lax here and use the fallback
> +	 * more often to avoid costly migrations of ourselves
> +	 * and other objects within the aperture.
> +	 */
> +	if (!(flags & PIN_NONBLOCK))
> +		return 0;
> +
> +	/*
> +	 * We only consider objects whose size is at-least a quarter of
> +	 * the aperture to be too big and subject them to the new
> +	 * heuristic below.
> +	 */
> +	if (obj->base.size < ggtt->mappable_end / 4)
> +		return 0;
> +
> +	if (HAS_GMCH(i915) || DISPLAY_VER(i915) < 11 ||
> +	    !i915_gem_object_is_framebuffer(obj))
> +		return 0;
> +
> +	fence_size = i915_gem_fence_size(i915, obj->base.size,
> +					 i915_gem_object_get_tiling(obj),
> +					 i915_gem_object_get_stride(obj));
> +
> +	if (i915_vm_has_cache_coloring(&ggtt->vm))
> +		fence_size += 2 * I915_GTT_PAGE_SIZE;
> +
> +	fence_alignment = i915_gem_fence_alignment(i915, obj->base.size,
> +						   i915_gem_object_get_tiling(obj),
> +						   i915_gem_object_get_stride(obj));
> +	alignment = max_t(u64, alignment, fence_alignment);
> +
> +	/*
> +	 * Assuming this object is a large scanout buffer, we try to find
> +	 * out if there is room to map at-least two of them. There could
> +	 * be space available to map one but to be consistent, we try to
> +	 * avoid mapping/fencing any of them.
> +	 */
> +	drm_mm_for_each_suitable_hole(hole, &ggtt->vm.mm, 0, ggtt->mappable_end,
> +				      fence_size, DRM_MM_INSERT_LOW) {
> +		hole_start = drm_mm_hole_node_start(hole);
> +		hole_end = hole_start + hole->hole_size;
> +
> +		do {
> +			start = round_up(hole_start, alignment);
> +			end = min_t(u64, hole_end, ggtt->mappable_end);
> +
> +			if (range_overflows(start, fence_size, end))
> +				break;
> +
> +			if (++count >= 2)
> +				return 0;
> +
> +			hole_start = start + fence_size;
> +		} while (1);
> +	}
> +
> +	return -ENOSPC;
> +}
> +
>   struct i915_vma *
>   i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj,
>   			    struct i915_gem_ww_ctx *ww,
> @@ -891,36 +978,9 @@ i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj,
>   
>   	if (flags & PIN_MAPPABLE &&
>   	    (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
> -		/*
> -		 * If the required space is larger than the available
> -		 * aperture, we will not able to find a slot for the
> -		 * object and unbinding the object now will be in
> -		 * vain. Worse, doing so may cause us to ping-pong
> -		 * the object in and out of the Global GTT and
> -		 * waste a lot of cycles under the mutex.
> -		 */
> -		if (obj->base.size > ggtt->mappable_end)
> -			return ERR_PTR(-E2BIG);
> -
> -		/*
> -		 * If NONBLOCK is set the caller is optimistically
> -		 * trying to cache the full object within the mappable
> -		 * aperture, and *must* have a fallback in place for
> -		 * situations where we cannot bind the object. We
> -		 * can be a little more lax here and use the fallback
> -		 * more often to avoid costly migrations of ourselves
> -		 * and other objects within the aperture.
> -		 *
> -		 * Half-the-aperture is used as a simple heuristic.
> -		 * More interesting would to do search for a free
> -		 * block prior to making the commitment to unbind.
> -		 * That caters for the self-harm case, and with a
> -		 * little more heuristics (e.g. NOFAULT, NOEVICT)
> -		 * we could try to minimise harm to others.
> -		 */
> -		if (flags & PIN_NONBLOCK &&
> -		    obj->base.size > ggtt->mappable_end / 2)
> -			return ERR_PTR(-ENOSPC);
> +		ret = i915_gem_object_fits_in_aperture(obj, alignment, flags);
> +		if (ret)
> +			return ERR_PTR(ret);
>   	}
>   
>   new_vma:

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx]  ✗ Fi.CI.BUILD: failure for series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation (rev2)
  2022-02-04  1:45 ` [Intel-gfx] ✗ Fi.CI.BUILD: failure for series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation (rev2) Patchwork
@ 2022-02-07 10:51   ` Tvrtko Ursulin
  0 siblings, 0 replies; 19+ messages in thread
From: Tvrtko Ursulin @ 2022-02-07 10:51 UTC (permalink / raw)
  To: intel-gfx, Patchwork, Vivek Kasireddy


On 04/02/2022 01:45, Patchwork wrote:
> == Series Details ==
> 
> Series: series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation (rev2)
> URL   : https://patchwork.freedesktop.org/series/99597/
> State : failure
> 
> == Summary ==
> 
> CALL    scripts/checksyscalls.sh
>    CALL    scripts/atomic/check-atomics.sh
>    DESCEND objtool
>    CHK     include/generated/compile.h
>    CC [M]  drivers/gpu/drm/i915/i915_gem.o
> drivers/gpu/drm/i915/i915_gem.c: In function ‘i915_gem_object_fits_in_aperture’:
> drivers/gpu/drm/i915/i915_gem.c:944:2: error: implicit declaration of function ‘drm_mm_for_each_suitable_hole’; did you mean ‘drm_mm_for_each_best_hole’? [-Werror=implicit-function-declaration]
>    drm_mm_for_each_suitable_hole(hole, &ggtt->vm.mm, 0, ggtt->mappable_end,
>    ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>    drm_mm_for_each_best_hole
> drivers/gpu/drm/i915/i915_gem.c:945:41: error: expected ‘;’ before ‘{’ token
>             fence_size, DRM_MM_INSERT_LOW) {
>                                           ^~
>                                           ;
> drivers/gpu/drm/i915/i915_gem.c:889:15: error: unused variable ‘count’ [-Werror=unused-variable]
>    unsigned int count = 0;
>                 ^~~~~
> drivers/gpu/drm/i915/i915_gem.c:887:35: error: unused variable ‘end’ [-Werror=unused-variable]
>    u64 hole_start, hole_end, start, end;
>                                     ^~~
> drivers/gpu/drm/i915/i915_gem.c:887:28: error: unused variable ‘start’ [-Werror=unused-variable]
>    u64 hole_start, hole_end, start, end;
>                              ^~~~~
> drivers/gpu/drm/i915/i915_gem.c:887:18: error: unused variable ‘hole_end’ [-Werror=unused-variable]
>    u64 hole_start, hole_end, start, end;
>                    ^~~~~~~~
> drivers/gpu/drm/i915/i915_gem.c:887:6: error: unused variable ‘hole_start’ [-Werror=unused-variable]
>    u64 hole_start, hole_end, start, end;
>        ^~~~~~~~~~
> drivers/gpu/drm/i915/i915_gem.c:964:1: error: control reaches end of non-void function [-Werror=return-type]
>   }
>   ^
> cc1: all warnings being treated as errors
> scripts/Makefile.build:288: recipe for target 'drivers/gpu/drm/i915/i915_gem.o' failed
> make[4]: *** [drivers/gpu/drm/i915/i915_gem.o] Error 1
> scripts/Makefile.build:550: recipe for target 'drivers/gpu/drm/i915' failed
> make[3]: *** [drivers/gpu/drm/i915] Error 2
> scripts/Makefile.build:550: recipe for target 'drivers/gpu/drm' failed
> make[2]: *** [drivers/gpu/drm] Error 2
> scripts/Makefile.build:550: recipe for target 'drivers/gpu' failed
> make[1]: *** [drivers/gpu] Error 2
> Makefile:1831: recipe for target 'drivers' failed
> make: *** [drivers] Error 2

Just post both patches to both intel-gfx and dri-devel as a series so 
patchwork picks it up correctly.

It is needed to give context to dri-devel folks anyway.

Regards,

Tvrtko



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6)
  2022-02-04  1:22     ` [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6) Vivek Kasireddy
  2022-02-07 10:49       ` Tvrtko Ursulin
@ 2022-02-07 10:58       ` Ville Syrjälä
  2022-02-07 11:47         ` Tvrtko Ursulin
  1 sibling, 1 reply; 19+ messages in thread
From: Ville Syrjälä @ 2022-02-07 10:58 UTC (permalink / raw)
  To: Vivek Kasireddy; +Cc: intel-gfx

On Thu, Feb 03, 2022 at 05:22:10PM -0800, Vivek Kasireddy wrote:
> On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2 or
> more framebuffers/scanout buffers results in only one that is mappable/
> fenceable. Therefore, pageflipping between these 2 FBs where only one
> is mappable/fenceable creates latencies large enough to miss alternate
> vblanks thereby producing less optimal framerate.
> 
> This mainly happens because when i915_gem_object_pin_to_display_plane()
> is called to pin one of the FB objs, the associated vma is identified
> as misplaced and therefore i915_vma_unbind() is called which unbinds and
> evicts it. This misplaced vma gets subseqently pinned only when
> i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This
> results in a latency of ~10ms and happens every other vblank/repaint cycle.
> Therefore, to fix this issue, we try to see if there is space to map
> at-least two objects of a given size and return early if there isn't. This
> would ensure that we do not try with PIN_MAPPABLE for any objects that
> are too big to map thereby preventing unncessary unbind.
> 
> Testcase:
> Running Weston and weston-simple-egl on an Alderlake_S (ADLS) platform
> with a 8K@60 mode results in only ~40 FPS. Since upstream Weston submits
> a frame ~7ms before the next vblank, the latencies seen between atomic
> commit and flip event are 7, 24 (7 + 16.66), 7, 24..... suggesting that
> it misses the vblank every other frame.
> 
> Here is the ftrace snippet that shows the source of the ~10ms latency:
>               i915_gem_object_pin_to_display_plane() {
> 0.102 us   |    i915_gem_object_set_cache_level();
>                 i915_gem_object_ggtt_pin_ww() {
> 0.390 us   |      i915_vma_instance();
> 0.178 us   |      i915_vma_misplaced();
>                   i915_vma_unbind() {
>                   __i915_active_wait() {
> 0.082 us   |        i915_active_acquire_if_busy();
> 0.475 us   |      }
>                   intel_runtime_pm_get() {
> 0.087 us   |        intel_runtime_pm_acquire();
> 0.259 us   |      }
>                   __i915_active_wait() {
> 0.085 us   |        i915_active_acquire_if_busy();
> 0.240 us   |      }
>                   __i915_vma_evict() {
>                     ggtt_unbind_vma() {
>                       gen8_ggtt_clear_range() {
> 10507.255 us |        }
> 10507.689 us |      }
> 10508.516 us |   }
> 
> v2: Instead of using bigjoiner checks, determine whether a scanout
>     buffer is too big by checking to see if it is possible to map
>     two of them into the ggtt.
> 
> v3 (Ville):
> - Count how many fb objects can be fit into the available holes
>   instead of checking for a hole twice the object size.
> - Take alignment constraints into account.
> - Limit this large scanout buffer check to >= Gen 11 platforms.
> 
> v4:
> - Remove existing heuristic that checks just for size. (Ville)
> - Return early if we find space to map at-least two objects. (Tvrtko)
> - Slightly update the commit message.
> 
> v5: (Tvrtko)
> - Rename the function to indicate that the object may be too big to
>   map into the aperture.
> - Account for guard pages while calculating the total size required
>   for the object.
> - Do not subject all objects to the heuristic check and instead
>   consider objects only of a certain size.
> - Do the hole walk using the rbtree.
> - Preserve the existing PIN_NONBLOCK logic.
> - Drop the PIN_MAPPABLE check while pinning the VMA.
> 
> v6: (Tvrtko)
> - Return 0 on success and the specific error code on failure to
>   preserve the existing behavior.
> 
> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Manasi Navare <manasi.d.navare@intel.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c | 120 ++++++++++++++++++++++++--------
>  1 file changed, 90 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index e3a2c2a0e156..39f0d17550c3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -46,6 +46,7 @@
>  #include "gem/i915_gem_mman.h"
>  #include "gem/i915_gem_region.h"
>  #include "gem/i915_gem_userptr.h"
> +#include "gem/i915_gem_tiling.h"
>  #include "gt/intel_engine_user.h"
>  #include "gt/intel_gt.h"
>  #include "gt/intel_gt_pm.h"
> @@ -876,6 +877,92 @@ static void discard_ggtt_vma(struct i915_vma *vma)
>  	spin_unlock(&obj->vma.lock);
>  }
>  
> +static int
> +i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj,
> +				 u64 alignment, u64 flags)
> +{
> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> +	struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
> +	struct drm_mm_node *hole;
> +	u64 hole_start, hole_end, start, end;
> +	u64 fence_size, fence_alignment;
> +	unsigned int count = 0;
> +
> +	/*
> +	 * If the required space is larger than the available
> +	 * aperture, we will not able to find a slot for the
> +	 * object and unbinding the object now will be in
> +	 * vain. Worse, doing so may cause us to ping-pong
> +	 * the object in and out of the Global GTT and
> +	 * waste a lot of cycles under the mutex.
> +	 */
> +	if (obj->base.size > ggtt->mappable_end)
> +		return -E2BIG;
> +
> +	/*
> +	 * If NONBLOCK is set the caller is optimistically
> +	 * trying to cache the full object within the mappable
> +	 * aperture, and *must* have a fallback in place for
> +	 * situations where we cannot bind the object. We
> +	 * can be a little more lax here and use the fallback
> +	 * more often to avoid costly migrations of ourselves
> +	 * and other objects within the aperture.
> +	 */
> +	if (!(flags & PIN_NONBLOCK))
> +		return 0;
> +
> +	/*
> +	 * We only consider objects whose size is at-least a quarter of
> +	 * the aperture to be too big and subject them to the new
> +	 * heuristic below.
> +	 */
> +	if (obj->base.size < ggtt->mappable_end / 4)
> +		return 0;

That seems a fairly arbitrary thing to put here. Maybe something the
caller should check/specify?

> +
> +	if (HAS_GMCH(i915) || DISPLAY_VER(i915) < 11 ||
> +	    !i915_gem_object_is_framebuffer(obj))
> +		return 0;

None of that seems appropriate for a generic gem function
with this name.

> +
> +	fence_size = i915_gem_fence_size(i915, obj->base.size,
> +					 i915_gem_object_get_tiling(obj),
> +					 i915_gem_object_get_stride(obj));
> +
> +	if (i915_vm_has_cache_coloring(&ggtt->vm))
> +		fence_size += 2 * I915_GTT_PAGE_SIZE;
> +
> +	fence_alignment = i915_gem_fence_alignment(i915, obj->base.size,
> +						   i915_gem_object_get_tiling(obj),
> +						   i915_gem_object_get_stride(obj));
> +	alignment = max_t(u64, alignment, fence_alignment);
> +
> +	/*
> +	 * Assuming this object is a large scanout buffer, we try to find
> +	 * out if there is room to map at-least two of them. There could
> +	 * be space available to map one but to be consistent, we try to
> +	 * avoid mapping/fencing any of them.
> +	 */
> +	drm_mm_for_each_suitable_hole(hole, &ggtt->vm.mm, 0, ggtt->mappable_end,
> +				      fence_size, DRM_MM_INSERT_LOW) {
> +		hole_start = drm_mm_hole_node_start(hole);
> +		hole_end = hole_start + hole->hole_size;
> +
> +		do {
> +			start = round_up(hole_start, alignment);
> +			end = min_t(u64, hole_end, ggtt->mappable_end);
> +
> +			if (range_overflows(start, fence_size, end))
> +				break;
> +
> +			if (++count >= 2)
> +				return 0;
> +
> +			hole_start = start + fence_size;
> +		} while (1);
> +	}
> +
> +	return -ENOSPC;
> +}
> +
>  struct i915_vma *
>  i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj,
>  			    struct i915_gem_ww_ctx *ww,
> @@ -891,36 +978,9 @@ i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj,
>  
>  	if (flags & PIN_MAPPABLE &&
>  	    (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
> -		/*
> -		 * If the required space is larger than the available
> -		 * aperture, we will not able to find a slot for the
> -		 * object and unbinding the object now will be in
> -		 * vain. Worse, doing so may cause us to ping-pong
> -		 * the object in and out of the Global GTT and
> -		 * waste a lot of cycles under the mutex.
> -		 */
> -		if (obj->base.size > ggtt->mappable_end)
> -			return ERR_PTR(-E2BIG);
> -
> -		/*
> -		 * If NONBLOCK is set the caller is optimistically
> -		 * trying to cache the full object within the mappable
> -		 * aperture, and *must* have a fallback in place for
> -		 * situations where we cannot bind the object. We
> -		 * can be a little more lax here and use the fallback
> -		 * more often to avoid costly migrations of ourselves
> -		 * and other objects within the aperture.
> -		 *
> -		 * Half-the-aperture is used as a simple heuristic.
> -		 * More interesting would to do search for a free
> -		 * block prior to making the commitment to unbind.
> -		 * That caters for the self-harm case, and with a
> -		 * little more heuristics (e.g. NOFAULT, NOEVICT)
> -		 * we could try to minimise harm to others.
> -		 */
> -		if (flags & PIN_NONBLOCK &&
> -		    obj->base.size > ggtt->mappable_end / 2)
> -			return ERR_PTR(-ENOSPC);
> +		ret = i915_gem_object_fits_in_aperture(obj, alignment, flags);
> +		if (ret)
> +			return ERR_PTR(ret);
>  	}
>  
>  new_vma:
> -- 
> 2.34.1

-- 
Ville Syrjälä
Intel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6)
  2022-02-07 10:58       ` Ville Syrjälä
@ 2022-02-07 11:47         ` Tvrtko Ursulin
  2022-02-07 13:24           ` Ville Syrjälä
  0 siblings, 1 reply; 19+ messages in thread
From: Tvrtko Ursulin @ 2022-02-07 11:47 UTC (permalink / raw)
  To: Ville Syrjälä, Vivek Kasireddy; +Cc: intel-gfx


On 07/02/2022 10:58, Ville Syrjälä wrote:
> On Thu, Feb 03, 2022 at 05:22:10PM -0800, Vivek Kasireddy wrote:
>> On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2 or
>> more framebuffers/scanout buffers results in only one that is mappable/
>> fenceable. Therefore, pageflipping between these 2 FBs where only one
>> is mappable/fenceable creates latencies large enough to miss alternate
>> vblanks thereby producing less optimal framerate.
>>
>> This mainly happens because when i915_gem_object_pin_to_display_plane()
>> is called to pin one of the FB objs, the associated vma is identified
>> as misplaced and therefore i915_vma_unbind() is called which unbinds and
>> evicts it. This misplaced vma gets subseqently pinned only when
>> i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This
>> results in a latency of ~10ms and happens every other vblank/repaint cycle.
>> Therefore, to fix this issue, we try to see if there is space to map
>> at-least two objects of a given size and return early if there isn't. This
>> would ensure that we do not try with PIN_MAPPABLE for any objects that
>> are too big to map thereby preventing unncessary unbind.
>>
>> Testcase:
>> Running Weston and weston-simple-egl on an Alderlake_S (ADLS) platform
>> with a 8K@60 mode results in only ~40 FPS. Since upstream Weston submits
>> a frame ~7ms before the next vblank, the latencies seen between atomic
>> commit and flip event are 7, 24 (7 + 16.66), 7, 24..... suggesting that
>> it misses the vblank every other frame.
>>
>> Here is the ftrace snippet that shows the source of the ~10ms latency:
>>                i915_gem_object_pin_to_display_plane() {
>> 0.102 us   |    i915_gem_object_set_cache_level();
>>                  i915_gem_object_ggtt_pin_ww() {
>> 0.390 us   |      i915_vma_instance();
>> 0.178 us   |      i915_vma_misplaced();
>>                    i915_vma_unbind() {
>>                    __i915_active_wait() {
>> 0.082 us   |        i915_active_acquire_if_busy();
>> 0.475 us   |      }
>>                    intel_runtime_pm_get() {
>> 0.087 us   |        intel_runtime_pm_acquire();
>> 0.259 us   |      }
>>                    __i915_active_wait() {
>> 0.085 us   |        i915_active_acquire_if_busy();
>> 0.240 us   |      }
>>                    __i915_vma_evict() {
>>                      ggtt_unbind_vma() {
>>                        gen8_ggtt_clear_range() {
>> 10507.255 us |        }
>> 10507.689 us |      }
>> 10508.516 us |   }
>>
>> v2: Instead of using bigjoiner checks, determine whether a scanout
>>      buffer is too big by checking to see if it is possible to map
>>      two of them into the ggtt.
>>
>> v3 (Ville):
>> - Count how many fb objects can be fit into the available holes
>>    instead of checking for a hole twice the object size.
>> - Take alignment constraints into account.
>> - Limit this large scanout buffer check to >= Gen 11 platforms.
>>
>> v4:
>> - Remove existing heuristic that checks just for size. (Ville)
>> - Return early if we find space to map at-least two objects. (Tvrtko)
>> - Slightly update the commit message.
>>
>> v5: (Tvrtko)
>> - Rename the function to indicate that the object may be too big to
>>    map into the aperture.
>> - Account for guard pages while calculating the total size required
>>    for the object.
>> - Do not subject all objects to the heuristic check and instead
>>    consider objects only of a certain size.
>> - Do the hole walk using the rbtree.
>> - Preserve the existing PIN_NONBLOCK logic.
>> - Drop the PIN_MAPPABLE check while pinning the VMA.
>>
>> v6: (Tvrtko)
>> - Return 0 on success and the specific error code on failure to
>>    preserve the existing behavior.
>>
>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>> Cc: Manasi Navare <manasi.d.navare@intel.com>
>> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_gem.c | 120 ++++++++++++++++++++++++--------
>>   1 file changed, 90 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index e3a2c2a0e156..39f0d17550c3 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -46,6 +46,7 @@
>>   #include "gem/i915_gem_mman.h"
>>   #include "gem/i915_gem_region.h"
>>   #include "gem/i915_gem_userptr.h"
>> +#include "gem/i915_gem_tiling.h"
>>   #include "gt/intel_engine_user.h"
>>   #include "gt/intel_gt.h"
>>   #include "gt/intel_gt_pm.h"
>> @@ -876,6 +877,92 @@ static void discard_ggtt_vma(struct i915_vma *vma)
>>   	spin_unlock(&obj->vma.lock);
>>   }
>>   
>> +static int
>> +i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj,
>> +				 u64 alignment, u64 flags)
>> +{
>> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
>> +	struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
>> +	struct drm_mm_node *hole;
>> +	u64 hole_start, hole_end, start, end;
>> +	u64 fence_size, fence_alignment;
>> +	unsigned int count = 0;
>> +
>> +	/*
>> +	 * If the required space is larger than the available
>> +	 * aperture, we will not able to find a slot for the
>> +	 * object and unbinding the object now will be in
>> +	 * vain. Worse, doing so may cause us to ping-pong
>> +	 * the object in and out of the Global GTT and
>> +	 * waste a lot of cycles under the mutex.
>> +	 */
>> +	if (obj->base.size > ggtt->mappable_end)
>> +		return -E2BIG;
>> +
>> +	/*
>> +	 * If NONBLOCK is set the caller is optimistically
>> +	 * trying to cache the full object within the mappable
>> +	 * aperture, and *must* have a fallback in place for
>> +	 * situations where we cannot bind the object. We
>> +	 * can be a little more lax here and use the fallback
>> +	 * more often to avoid costly migrations of ourselves
>> +	 * and other objects within the aperture.
>> +	 */
>> +	if (!(flags & PIN_NONBLOCK))
>> +		return 0;
>> +
>> +	/*
>> +	 * We only consider objects whose size is at-least a quarter of
>> +	 * the aperture to be too big and subject them to the new
>> +	 * heuristic below.
>> +	 */
>> +	if (obj->base.size < ggtt->mappable_end / 4)
>> +		return 0;
> 
> That seems a fairly arbitrary thing to put here. Maybe something the
> caller should check/specify?

I have no strong opinion on this one. In my mind I categorised it under 
"is it a large framebuffer" heuristics. Previously it was less than one 
half of aperture always okay, now one quarter, plus 2x hole check if 
larger. Both are heuristics. I even mentioned earlier if 2x should be an 
input parameter as well, but again, given it's not an exported function 
couldn't really justify it.

> 
>> +
>> +	if (HAS_GMCH(i915) || DISPLAY_VER(i915) < 11 ||
>> +	    !i915_gem_object_is_framebuffer(obj))
>> +		return 0;
> 
> None of that seems appropriate for a generic gem function
> with this name.

It's not exported though, maybe remove i915_gem prefix to avoid any 
ideas of it being generic?

Regards,

Tvrtko

> 
>> +
>> +	fence_size = i915_gem_fence_size(i915, obj->base.size,
>> +					 i915_gem_object_get_tiling(obj),
>> +					 i915_gem_object_get_stride(obj));
>> +
>> +	if (i915_vm_has_cache_coloring(&ggtt->vm))
>> +		fence_size += 2 * I915_GTT_PAGE_SIZE;
>> +
>> +	fence_alignment = i915_gem_fence_alignment(i915, obj->base.size,
>> +						   i915_gem_object_get_tiling(obj),
>> +						   i915_gem_object_get_stride(obj));
>> +	alignment = max_t(u64, alignment, fence_alignment);
>> +
>> +	/*
>> +	 * Assuming this object is a large scanout buffer, we try to find
>> +	 * out if there is room to map at-least two of them. There could
>> +	 * be space available to map one but to be consistent, we try to
>> +	 * avoid mapping/fencing any of them.
>> +	 */
>> +	drm_mm_for_each_suitable_hole(hole, &ggtt->vm.mm, 0, ggtt->mappable_end,
>> +				      fence_size, DRM_MM_INSERT_LOW) {
>> +		hole_start = drm_mm_hole_node_start(hole);
>> +		hole_end = hole_start + hole->hole_size;
>> +
>> +		do {
>> +			start = round_up(hole_start, alignment);
>> +			end = min_t(u64, hole_end, ggtt->mappable_end);
>> +
>> +			if (range_overflows(start, fence_size, end))
>> +				break;
>> +
>> +			if (++count >= 2)
>> +				return 0;
>> +
>> +			hole_start = start + fence_size;
>> +		} while (1);
>> +	}
>> +
>> +	return -ENOSPC;
>> +}
>> +
>>   struct i915_vma *
>>   i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj,
>>   			    struct i915_gem_ww_ctx *ww,
>> @@ -891,36 +978,9 @@ i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj,
>>   
>>   	if (flags & PIN_MAPPABLE &&
>>   	    (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
>> -		/*
>> -		 * If the required space is larger than the available
>> -		 * aperture, we will not able to find a slot for the
>> -		 * object and unbinding the object now will be in
>> -		 * vain. Worse, doing so may cause us to ping-pong
>> -		 * the object in and out of the Global GTT and
>> -		 * waste a lot of cycles under the mutex.
>> -		 */
>> -		if (obj->base.size > ggtt->mappable_end)
>> -			return ERR_PTR(-E2BIG);
>> -
>> -		/*
>> -		 * If NONBLOCK is set the caller is optimistically
>> -		 * trying to cache the full object within the mappable
>> -		 * aperture, and *must* have a fallback in place for
>> -		 * situations where we cannot bind the object. We
>> -		 * can be a little more lax here and use the fallback
>> -		 * more often to avoid costly migrations of ourselves
>> -		 * and other objects within the aperture.
>> -		 *
>> -		 * Half-the-aperture is used as a simple heuristic.
>> -		 * More interesting would to do search for a free
>> -		 * block prior to making the commitment to unbind.
>> -		 * That caters for the self-harm case, and with a
>> -		 * little more heuristics (e.g. NOFAULT, NOEVICT)
>> -		 * we could try to minimise harm to others.
>> -		 */
>> -		if (flags & PIN_NONBLOCK &&
>> -		    obj->base.size > ggtt->mappable_end / 2)
>> -			return ERR_PTR(-ENOSPC);
>> +		ret = i915_gem_object_fits_in_aperture(obj, alignment, flags);
>> +		if (ret)
>> +			return ERR_PTR(ret);
>>   	}
>>   
>>   new_vma:
>> -- 
>> 2.34.1
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6)
  2022-02-07 11:47         ` Tvrtko Ursulin
@ 2022-02-07 13:24           ` Ville Syrjälä
  2022-02-07 13:36             ` Tvrtko Ursulin
  0 siblings, 1 reply; 19+ messages in thread
From: Ville Syrjälä @ 2022-02-07 13:24 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Mon, Feb 07, 2022 at 11:47:16AM +0000, Tvrtko Ursulin wrote:
> 
> On 07/02/2022 10:58, Ville Syrjälä wrote:
> > On Thu, Feb 03, 2022 at 05:22:10PM -0800, Vivek Kasireddy wrote:
> >> On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2 or
> >> more framebuffers/scanout buffers results in only one that is mappable/
> >> fenceable. Therefore, pageflipping between these 2 FBs where only one
> >> is mappable/fenceable creates latencies large enough to miss alternate
> >> vblanks thereby producing less optimal framerate.
> >>
> >> This mainly happens because when i915_gem_object_pin_to_display_plane()
> >> is called to pin one of the FB objs, the associated vma is identified
> >> as misplaced and therefore i915_vma_unbind() is called which unbinds and
> >> evicts it. This misplaced vma gets subseqently pinned only when
> >> i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This
> >> results in a latency of ~10ms and happens every other vblank/repaint cycle.
> >> Therefore, to fix this issue, we try to see if there is space to map
> >> at-least two objects of a given size and return early if there isn't. This
> >> would ensure that we do not try with PIN_MAPPABLE for any objects that
> >> are too big to map thereby preventing unncessary unbind.
> >>
> >> Testcase:
> >> Running Weston and weston-simple-egl on an Alderlake_S (ADLS) platform
> >> with a 8K@60 mode results in only ~40 FPS. Since upstream Weston submits
> >> a frame ~7ms before the next vblank, the latencies seen between atomic
> >> commit and flip event are 7, 24 (7 + 16.66), 7, 24..... suggesting that
> >> it misses the vblank every other frame.
> >>
> >> Here is the ftrace snippet that shows the source of the ~10ms latency:
> >>                i915_gem_object_pin_to_display_plane() {
> >> 0.102 us   |    i915_gem_object_set_cache_level();
> >>                  i915_gem_object_ggtt_pin_ww() {
> >> 0.390 us   |      i915_vma_instance();
> >> 0.178 us   |      i915_vma_misplaced();
> >>                    i915_vma_unbind() {
> >>                    __i915_active_wait() {
> >> 0.082 us   |        i915_active_acquire_if_busy();
> >> 0.475 us   |      }
> >>                    intel_runtime_pm_get() {
> >> 0.087 us   |        intel_runtime_pm_acquire();
> >> 0.259 us   |      }
> >>                    __i915_active_wait() {
> >> 0.085 us   |        i915_active_acquire_if_busy();
> >> 0.240 us   |      }
> >>                    __i915_vma_evict() {
> >>                      ggtt_unbind_vma() {
> >>                        gen8_ggtt_clear_range() {
> >> 10507.255 us |        }
> >> 10507.689 us |      }
> >> 10508.516 us |   }
> >>
> >> v2: Instead of using bigjoiner checks, determine whether a scanout
> >>      buffer is too big by checking to see if it is possible to map
> >>      two of them into the ggtt.
> >>
> >> v3 (Ville):
> >> - Count how many fb objects can be fit into the available holes
> >>    instead of checking for a hole twice the object size.
> >> - Take alignment constraints into account.
> >> - Limit this large scanout buffer check to >= Gen 11 platforms.
> >>
> >> v4:
> >> - Remove existing heuristic that checks just for size. (Ville)
> >> - Return early if we find space to map at-least two objects. (Tvrtko)
> >> - Slightly update the commit message.
> >>
> >> v5: (Tvrtko)
> >> - Rename the function to indicate that the object may be too big to
> >>    map into the aperture.
> >> - Account for guard pages while calculating the total size required
> >>    for the object.
> >> - Do not subject all objects to the heuristic check and instead
> >>    consider objects only of a certain size.
> >> - Do the hole walk using the rbtree.
> >> - Preserve the existing PIN_NONBLOCK logic.
> >> - Drop the PIN_MAPPABLE check while pinning the VMA.
> >>
> >> v6: (Tvrtko)
> >> - Return 0 on success and the specific error code on failure to
> >>    preserve the existing behavior.
> >>
> >> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> >> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> >> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> >> Cc: Manasi Navare <manasi.d.navare@intel.com>
> >> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> >> ---
> >>   drivers/gpu/drm/i915/i915_gem.c | 120 ++++++++++++++++++++++++--------
> >>   1 file changed, 90 insertions(+), 30 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >> index e3a2c2a0e156..39f0d17550c3 100644
> >> --- a/drivers/gpu/drm/i915/i915_gem.c
> >> +++ b/drivers/gpu/drm/i915/i915_gem.c
> >> @@ -46,6 +46,7 @@
> >>   #include "gem/i915_gem_mman.h"
> >>   #include "gem/i915_gem_region.h"
> >>   #include "gem/i915_gem_userptr.h"
> >> +#include "gem/i915_gem_tiling.h"
> >>   #include "gt/intel_engine_user.h"
> >>   #include "gt/intel_gt.h"
> >>   #include "gt/intel_gt_pm.h"
> >> @@ -876,6 +877,92 @@ static void discard_ggtt_vma(struct i915_vma *vma)
> >>   	spin_unlock(&obj->vma.lock);
> >>   }
> >>   
> >> +static int
> >> +i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj,
> >> +				 u64 alignment, u64 flags)
> >> +{
> >> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> >> +	struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
> >> +	struct drm_mm_node *hole;
> >> +	u64 hole_start, hole_end, start, end;
> >> +	u64 fence_size, fence_alignment;
> >> +	unsigned int count = 0;
> >> +
> >> +	/*
> >> +	 * If the required space is larger than the available
> >> +	 * aperture, we will not able to find a slot for the
> >> +	 * object and unbinding the object now will be in
> >> +	 * vain. Worse, doing so may cause us to ping-pong
> >> +	 * the object in and out of the Global GTT and
> >> +	 * waste a lot of cycles under the mutex.
> >> +	 */
> >> +	if (obj->base.size > ggtt->mappable_end)
> >> +		return -E2BIG;
> >> +
> >> +	/*
> >> +	 * If NONBLOCK is set the caller is optimistically
> >> +	 * trying to cache the full object within the mappable
> >> +	 * aperture, and *must* have a fallback in place for
> >> +	 * situations where we cannot bind the object. We
> >> +	 * can be a little more lax here and use the fallback
> >> +	 * more often to avoid costly migrations of ourselves
> >> +	 * and other objects within the aperture.
> >> +	 */
> >> +	if (!(flags & PIN_NONBLOCK))
> >> +		return 0;
> >> +
> >> +	/*
> >> +	 * We only consider objects whose size is at-least a quarter of
> >> +	 * the aperture to be too big and subject them to the new
> >> +	 * heuristic below.
> >> +	 */
> >> +	if (obj->base.size < ggtt->mappable_end / 4)
> >> +		return 0;
> > 
> > That seems a fairly arbitrary thing to put here. Maybe something the
> > caller should check/specify?
> 
> I have no strong opinion on this one. In my mind I categorised it under 
> "is it a large framebuffer" heuristics. Previously it was less than one 
> half of aperture always okay, now one quarter, plus 2x hole check if 
> larger. Both are heuristics. I even mentioned earlier if 2x should be an 
> input parameter as well, but again, given it's not an exported function 
> couldn't really justify it.

Is there any point in even having this extra check? If we 
don't think checking this is worth the hassle then why call
the function at all?

> 
> > 
> >> +
> >> +	if (HAS_GMCH(i915) || DISPLAY_VER(i915) < 11 ||
> >> +	    !i915_gem_object_is_framebuffer(obj))
> >> +		return 0;
> > 
> > None of that seems appropriate for a generic gem function
> > with this name.
> 
> It's not exported though, maybe remove i915_gem prefix to avoid any 
> ideas of it being generic?

These checks don't even seem to doing anything useful. HAS_GMCH should
already be covered by always setting PIN_MAPPABLE and hence O_NONBLOCK
is never even tried, the pre-icl vs. icl+ check should not exist at all
IMO, and if this is only called for framebuffers then why does the code
pretend that is not the case?

So I would suggest just ditching all these checks, and then the function
even does what it says on the tin.

-- 
Ville Syrjälä
Intel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6)
  2022-02-07 13:24           ` Ville Syrjälä
@ 2022-02-07 13:36             ` Tvrtko Ursulin
  2022-02-08  5:10               ` Kasireddy, Vivek
  0 siblings, 1 reply; 19+ messages in thread
From: Tvrtko Ursulin @ 2022-02-07 13:36 UTC (permalink / raw)
  To: Ville Syrjälä; +Cc: intel-gfx


On 07/02/2022 13:24, Ville Syrjälä wrote:
> On Mon, Feb 07, 2022 at 11:47:16AM +0000, Tvrtko Ursulin wrote:
>>
>> On 07/02/2022 10:58, Ville Syrjälä wrote:
>>> On Thu, Feb 03, 2022 at 05:22:10PM -0800, Vivek Kasireddy wrote:
>>>> On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2 or
>>>> more framebuffers/scanout buffers results in only one that is mappable/
>>>> fenceable. Therefore, pageflipping between these 2 FBs where only one
>>>> is mappable/fenceable creates latencies large enough to miss alternate
>>>> vblanks thereby producing less optimal framerate.
>>>>
>>>> This mainly happens because when i915_gem_object_pin_to_display_plane()
>>>> is called to pin one of the FB objs, the associated vma is identified
>>>> as misplaced and therefore i915_vma_unbind() is called which unbinds and
>>>> evicts it. This misplaced vma gets subseqently pinned only when
>>>> i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This
>>>> results in a latency of ~10ms and happens every other vblank/repaint cycle.
>>>> Therefore, to fix this issue, we try to see if there is space to map
>>>> at-least two objects of a given size and return early if there isn't. This
>>>> would ensure that we do not try with PIN_MAPPABLE for any objects that
>>>> are too big to map thereby preventing unncessary unbind.
>>>>
>>>> Testcase:
>>>> Running Weston and weston-simple-egl on an Alderlake_S (ADLS) platform
>>>> with a 8K@60 mode results in only ~40 FPS. Since upstream Weston submits
>>>> a frame ~7ms before the next vblank, the latencies seen between atomic
>>>> commit and flip event are 7, 24 (7 + 16.66), 7, 24..... suggesting that
>>>> it misses the vblank every other frame.
>>>>
>>>> Here is the ftrace snippet that shows the source of the ~10ms latency:
>>>>                 i915_gem_object_pin_to_display_plane() {
>>>> 0.102 us   |    i915_gem_object_set_cache_level();
>>>>                   i915_gem_object_ggtt_pin_ww() {
>>>> 0.390 us   |      i915_vma_instance();
>>>> 0.178 us   |      i915_vma_misplaced();
>>>>                     i915_vma_unbind() {
>>>>                     __i915_active_wait() {
>>>> 0.082 us   |        i915_active_acquire_if_busy();
>>>> 0.475 us   |      }
>>>>                     intel_runtime_pm_get() {
>>>> 0.087 us   |        intel_runtime_pm_acquire();
>>>> 0.259 us   |      }
>>>>                     __i915_active_wait() {
>>>> 0.085 us   |        i915_active_acquire_if_busy();
>>>> 0.240 us   |      }
>>>>                     __i915_vma_evict() {
>>>>                       ggtt_unbind_vma() {
>>>>                         gen8_ggtt_clear_range() {
>>>> 10507.255 us |        }
>>>> 10507.689 us |      }
>>>> 10508.516 us |   }
>>>>
>>>> v2: Instead of using bigjoiner checks, determine whether a scanout
>>>>       buffer is too big by checking to see if it is possible to map
>>>>       two of them into the ggtt.
>>>>
>>>> v3 (Ville):
>>>> - Count how many fb objects can be fit into the available holes
>>>>     instead of checking for a hole twice the object size.
>>>> - Take alignment constraints into account.
>>>> - Limit this large scanout buffer check to >= Gen 11 platforms.
>>>>
>>>> v4:
>>>> - Remove existing heuristic that checks just for size. (Ville)
>>>> - Return early if we find space to map at-least two objects. (Tvrtko)
>>>> - Slightly update the commit message.
>>>>
>>>> v5: (Tvrtko)
>>>> - Rename the function to indicate that the object may be too big to
>>>>     map into the aperture.
>>>> - Account for guard pages while calculating the total size required
>>>>     for the object.
>>>> - Do not subject all objects to the heuristic check and instead
>>>>     consider objects only of a certain size.
>>>> - Do the hole walk using the rbtree.
>>>> - Preserve the existing PIN_NONBLOCK logic.
>>>> - Drop the PIN_MAPPABLE check while pinning the VMA.
>>>>
>>>> v6: (Tvrtko)
>>>> - Return 0 on success and the specific error code on failure to
>>>>     preserve the existing behavior.
>>>>
>>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
>>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>>>> Cc: Manasi Navare <manasi.d.navare@intel.com>
>>>> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
>>>> ---
>>>>    drivers/gpu/drm/i915/i915_gem.c | 120 ++++++++++++++++++++++++--------
>>>>    1 file changed, 90 insertions(+), 30 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>>> index e3a2c2a0e156..39f0d17550c3 100644
>>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>>> @@ -46,6 +46,7 @@
>>>>    #include "gem/i915_gem_mman.h"
>>>>    #include "gem/i915_gem_region.h"
>>>>    #include "gem/i915_gem_userptr.h"
>>>> +#include "gem/i915_gem_tiling.h"
>>>>    #include "gt/intel_engine_user.h"
>>>>    #include "gt/intel_gt.h"
>>>>    #include "gt/intel_gt_pm.h"
>>>> @@ -876,6 +877,92 @@ static void discard_ggtt_vma(struct i915_vma *vma)
>>>>    	spin_unlock(&obj->vma.lock);
>>>>    }
>>>>    
>>>> +static int
>>>> +i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj,
>>>> +				 u64 alignment, u64 flags)
>>>> +{
>>>> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
>>>> +	struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
>>>> +	struct drm_mm_node *hole;
>>>> +	u64 hole_start, hole_end, start, end;
>>>> +	u64 fence_size, fence_alignment;
>>>> +	unsigned int count = 0;
>>>> +
>>>> +	/*
>>>> +	 * If the required space is larger than the available
>>>> +	 * aperture, we will not able to find a slot for the
>>>> +	 * object and unbinding the object now will be in
>>>> +	 * vain. Worse, doing so may cause us to ping-pong
>>>> +	 * the object in and out of the Global GTT and
>>>> +	 * waste a lot of cycles under the mutex.
>>>> +	 */
>>>> +	if (obj->base.size > ggtt->mappable_end)
>>>> +		return -E2BIG;
>>>> +
>>>> +	/*
>>>> +	 * If NONBLOCK is set the caller is optimistically
>>>> +	 * trying to cache the full object within the mappable
>>>> +	 * aperture, and *must* have a fallback in place for
>>>> +	 * situations where we cannot bind the object. We
>>>> +	 * can be a little more lax here and use the fallback
>>>> +	 * more often to avoid costly migrations of ourselves
>>>> +	 * and other objects within the aperture.
>>>> +	 */
>>>> +	if (!(flags & PIN_NONBLOCK))
>>>> +		return 0;
>>>> +
>>>> +	/*
>>>> +	 * We only consider objects whose size is at-least a quarter of
>>>> +	 * the aperture to be too big and subject them to the new
>>>> +	 * heuristic below.
>>>> +	 */
>>>> +	if (obj->base.size < ggtt->mappable_end / 4)
>>>> +		return 0;
>>>
>>> That seems a fairly arbitrary thing to put here. Maybe something the
>>> caller should check/specify?
>>
>> I have no strong opinion on this one. In my mind I categorised it under
>> "is it a large framebuffer" heuristics. Previously it was less than one
>> half of aperture always okay, now one quarter, plus 2x hole check if
>> larger. Both are heuristics. I even mentioned earlier if 2x should be an
>> input parameter as well, but again, given it's not an exported function
>> couldn't really justify it.
> 
> Is there any point in even having this extra check? If we
> don't think checking this is worth the hassle then why call
> the function at all?

The "/4" one? It was my suggestion to avoid the hole search if we can 
know based on size it cannot be a frame buffer that would be affected by 
the ping-ping problem. Granted that was before the rbtree hole search, 
when it was traversing the un-ordered linked list of holes. What is the 
correct size threshold I don't know.

>>>> +
>>>> +	if (HAS_GMCH(i915) || DISPLAY_VER(i915) < 11 ||
>>>> +	    !i915_gem_object_is_framebuffer(obj))
>>>> +		return 0;
>>>
>>> None of that seems appropriate for a generic gem function
>>> with this name.
>>
>> It's not exported though, maybe remove i915_gem prefix to avoid any
>> ideas of it being generic?
> 
> These checks don't even seem to doing anything useful. HAS_GMCH should
> already be covered by always setting PIN_MAPPABLE and hence O_NONBLOCK
> is never even tried, the pre-icl vs. icl+ check should not exist at all
> IMO, and if this is only called for framebuffers then why does the code
> pretend that is not the case?
> 
> So I would suggest just ditching all these checks, and then the function
> even does what it says on the tin.

Change log for v3 made me think at least some of this was your 
suggestion so I did not think about it further. :) No strong opinion either.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6)
  2022-02-07 13:36             ` Tvrtko Ursulin
@ 2022-02-08  5:10               ` Kasireddy, Vivek
  2022-02-08  5:43                 ` Ville Syrjälä
  0 siblings, 1 reply; 19+ messages in thread
From: Kasireddy, Vivek @ 2022-02-08  5:10 UTC (permalink / raw)
  To: Tvrtko Ursulin, Ville Syrjälä; +Cc: intel-gfx

Hi Tvrtko, Ville,

> On 07/02/2022 13:24, Ville Syrjälä wrote:
> > On Mon, Feb 07, 2022 at 11:47:16AM +0000, Tvrtko Ursulin wrote:
> >>
> >> On 07/02/2022 10:58, Ville Syrjälä wrote:
> >>> On Thu, Feb 03, 2022 at 05:22:10PM -0800, Vivek Kasireddy wrote:
> >>>> On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2
> >>>> or more framebuffers/scanout buffers results in only one that is
> >>>> mappable/ fenceable. Therefore, pageflipping between these 2 FBs
> >>>> where only one is mappable/fenceable creates latencies large enough
> >>>> to miss alternate vblanks thereby producing less optimal framerate.
> >>>>
> >>>> This mainly happens because when
> >>>> i915_gem_object_pin_to_display_plane()
> >>>> is called to pin one of the FB objs, the associated vma is
> >>>> identified as misplaced and therefore i915_vma_unbind() is called
> >>>> which unbinds and evicts it. This misplaced vma gets subseqently
> >>>> pinned only when
> >>>> i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This
> >>>> results in a latency of ~10ms and happens every other vblank/repaint cycle.
> >>>> Therefore, to fix this issue, we try to see if there is space to
> >>>> map at-least two objects of a given size and return early if there
> >>>> isn't. This would ensure that we do not try with PIN_MAPPABLE for
> >>>> any objects that are too big to map thereby preventing unncessary unbind.
> >>>>
> >>>> Testcase:
> >>>> Running Weston and weston-simple-egl on an Alderlake_S (ADLS)
> >>>> platform with a 8K@60 mode results in only ~40 FPS. Since upstream
> >>>> Weston submits a frame ~7ms before the next vblank, the latencies
> >>>> seen between atomic commit and flip event are 7, 24 (7 + 16.66), 7,
> >>>> 24..... suggesting that it misses the vblank every other frame.
> >>>>
> >>>> Here is the ftrace snippet that shows the source of the ~10ms latency:
> >>>>                 i915_gem_object_pin_to_display_plane() {
> >>>> 0.102 us   |    i915_gem_object_set_cache_level();
> >>>>                   i915_gem_object_ggtt_pin_ww() {
> >>>> 0.390 us   |      i915_vma_instance();
> >>>> 0.178 us   |      i915_vma_misplaced();
> >>>>                     i915_vma_unbind() {
> >>>>                     __i915_active_wait() {
> >>>> 0.082 us   |        i915_active_acquire_if_busy();
> >>>> 0.475 us   |      }
> >>>>                     intel_runtime_pm_get() {
> >>>> 0.087 us   |        intel_runtime_pm_acquire();
> >>>> 0.259 us   |      }
> >>>>                     __i915_active_wait() {
> >>>> 0.085 us   |        i915_active_acquire_if_busy();
> >>>> 0.240 us   |      }
> >>>>                     __i915_vma_evict() {
> >>>>                       ggtt_unbind_vma() {
> >>>>                         gen8_ggtt_clear_range() {
> >>>> 10507.255 us |        }
> >>>> 10507.689 us |      }
> >>>> 10508.516 us |   }
> >>>>
> >>>> v2: Instead of using bigjoiner checks, determine whether a scanout
> >>>>       buffer is too big by checking to see if it is possible to map
> >>>>       two of them into the ggtt.
> >>>>
> >>>> v3 (Ville):
> >>>> - Count how many fb objects can be fit into the available holes
> >>>>     instead of checking for a hole twice the object size.
> >>>> - Take alignment constraints into account.
> >>>> - Limit this large scanout buffer check to >= Gen 11 platforms.
> >>>>
> >>>> v4:
> >>>> - Remove existing heuristic that checks just for size. (Ville)
> >>>> - Return early if we find space to map at-least two objects.
> >>>> (Tvrtko)
> >>>> - Slightly update the commit message.
> >>>>
> >>>> v5: (Tvrtko)
> >>>> - Rename the function to indicate that the object may be too big to
> >>>>     map into the aperture.
> >>>> - Account for guard pages while calculating the total size required
> >>>>     for the object.
> >>>> - Do not subject all objects to the heuristic check and instead
> >>>>     consider objects only of a certain size.
> >>>> - Do the hole walk using the rbtree.
> >>>> - Preserve the existing PIN_NONBLOCK logic.
> >>>> - Drop the PIN_MAPPABLE check while pinning the VMA.
> >>>>
> >>>> v6: (Tvrtko)
> >>>> - Return 0 on success and the specific error code on failure to
> >>>>     preserve the existing behavior.
> >>>>
> >>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> >>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> >>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> >>>> Cc: Manasi Navare <manasi.d.navare@intel.com>
> >>>> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> >>>> ---
> >>>>    drivers/gpu/drm/i915/i915_gem.c | 120 ++++++++++++++++++++++++--------
> >>>>    1 file changed, 90 insertions(+), 30 deletions(-)
> >>>>
> >>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> >>>> b/drivers/gpu/drm/i915/i915_gem.c index e3a2c2a0e156..39f0d17550c3
> >>>> 100644
> >>>> --- a/drivers/gpu/drm/i915/i915_gem.c
> >>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
> >>>> @@ -46,6 +46,7 @@
> >>>>    #include "gem/i915_gem_mman.h"
> >>>>    #include "gem/i915_gem_region.h"
> >>>>    #include "gem/i915_gem_userptr.h"
> >>>> +#include "gem/i915_gem_tiling.h"
> >>>>    #include "gt/intel_engine_user.h"
> >>>>    #include "gt/intel_gt.h"
> >>>>    #include "gt/intel_gt_pm.h"
> >>>> @@ -876,6 +877,92 @@ static void discard_ggtt_vma(struct i915_vma *vma)
> >>>>    	spin_unlock(&obj->vma.lock);
> >>>>    }
> >>>>
> >>>> +static int
> >>>> +i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj,
> >>>> +				 u64 alignment, u64 flags)
> >>>> +{
> >>>> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> >>>> +	struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
> >>>> +	struct drm_mm_node *hole;
> >>>> +	u64 hole_start, hole_end, start, end;
> >>>> +	u64 fence_size, fence_alignment;
> >>>> +	unsigned int count = 0;
> >>>> +
> >>>> +	/*
> >>>> +	 * If the required space is larger than the available
> >>>> +	 * aperture, we will not able to find a slot for the
> >>>> +	 * object and unbinding the object now will be in
> >>>> +	 * vain. Worse, doing so may cause us to ping-pong
> >>>> +	 * the object in and out of the Global GTT and
> >>>> +	 * waste a lot of cycles under the mutex.
> >>>> +	 */
> >>>> +	if (obj->base.size > ggtt->mappable_end)
> >>>> +		return -E2BIG;
> >>>> +
> >>>> +	/*
> >>>> +	 * If NONBLOCK is set the caller is optimistically
> >>>> +	 * trying to cache the full object within the mappable
> >>>> +	 * aperture, and *must* have a fallback in place for
> >>>> +	 * situations where we cannot bind the object. We
> >>>> +	 * can be a little more lax here and use the fallback
> >>>> +	 * more often to avoid costly migrations of ourselves
> >>>> +	 * and other objects within the aperture.
> >>>> +	 */
> >>>> +	if (!(flags & PIN_NONBLOCK))
> >>>> +		return 0;
> >>>> +
> >>>> +	/*
> >>>> +	 * We only consider objects whose size is at-least a quarter of
> >>>> +	 * the aperture to be too big and subject them to the new
> >>>> +	 * heuristic below.
> >>>> +	 */
> >>>> +	if (obj->base.size < ggtt->mappable_end / 4)
> >>>> +		return 0;
> >>>
> >>> That seems a fairly arbitrary thing to put here. Maybe something the
> >>> caller should check/specify?
> >>
> >> I have no strong opinion on this one. In my mind I categorised it
> >> under "is it a large framebuffer" heuristics. Previously it was less
> >> than one half of aperture always okay, now one quarter, plus 2x hole
> >> check if larger. Both are heuristics. I even mentioned earlier if 2x
> >> should be an input parameter as well, but again, given it's not an
> >> exported function couldn't really justify it.
> >
> > Is there any point in even having this extra check? If we don't think
> > checking this is worth the hassle then why call the function at all?
> 
> The "/4" one? It was my suggestion to avoid the hole search if we can know based on size
> it cannot be a frame buffer that would be affected by the ping-ping problem. Granted that
> was before the rbtree hole search, when it was traversing the un-ordered linked list of
> holes. What is the correct size threshold I don't know.
> 
> >>>> +
> >>>> +	if (HAS_GMCH(i915) || DISPLAY_VER(i915) < 11 ||
> >>>> +	    !i915_gem_object_is_framebuffer(obj))
> >>>> +		return 0;
> >>>
> >>> None of that seems appropriate for a generic gem function with this
> >>> name.
> >>
> >> It's not exported though, maybe remove i915_gem prefix to avoid any
> >> ideas of it being generic?
> >
> > These checks don't even seem to doing anything useful. HAS_GMCH should
> > already be covered by always setting PIN_MAPPABLE and hence O_NONBLOCK
[Kasireddy, Vivek] I can drop the HAS_GMCH(i915) check given that it is redundant.

> > is never even tried, the pre-icl vs. icl+ check should not exist at
[Kasireddy, Vivek] My aim was to narrow down the list of situations in which the
ping-pong problem becomes more pronounced and may lead to performance 
issues. Therefore, I added the DISPLAY_VER(i915) check since 8K/bigjoiner is
feasible only on those newer platforms. 

> > all IMO, and if this is only called for framebuffers then why does the
> > code pretend that is not the case?
[Kasireddy, Vivek] Oh, I added the i915_gem_object_is_framebuffer() check after I 
found that there are other callers (for example, reloc_iomap() in i915_gem_execbuffer.c)
of i915_gem_object_ggtt_pin_ww() that may not be working on an fb.

Also, I figured size < ggtt->mappable_end / 4 or a similar check is needed as we do
not want to subject all FBs through this performance critical path. 

Thanks,
Vivek

> >
> > So I would suggest just ditching all these checks, and then the
> > function even does what it says on the tin.
> 
> Change log for v3 made me think at least some of this was your suggestion so I did not
> think about it further. :) No strong opinion either.
> 
> Regards,
> 
> Tvrtko

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6)
  2022-02-08  5:10               ` Kasireddy, Vivek
@ 2022-02-08  5:43                 ` Ville Syrjälä
  2022-02-09  1:47                   ` Kasireddy, Vivek
  0 siblings, 1 reply; 19+ messages in thread
From: Ville Syrjälä @ 2022-02-08  5:43 UTC (permalink / raw)
  To: Kasireddy, Vivek; +Cc: intel-gfx

On Tue, Feb 08, 2022 at 05:10:42AM +0000, Kasireddy, Vivek wrote:
> Hi Tvrtko, Ville,
> 
> > On 07/02/2022 13:24, Ville Syrjälä wrote:
> > > On Mon, Feb 07, 2022 at 11:47:16AM +0000, Tvrtko Ursulin wrote:
> > >>
> > >> On 07/02/2022 10:58, Ville Syrjälä wrote:
> > >>> On Thu, Feb 03, 2022 at 05:22:10PM -0800, Vivek Kasireddy wrote:
> > >>>> On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2
> > >>>> or more framebuffers/scanout buffers results in only one that is
> > >>>> mappable/ fenceable. Therefore, pageflipping between these 2 FBs
> > >>>> where only one is mappable/fenceable creates latencies large enough
> > >>>> to miss alternate vblanks thereby producing less optimal framerate.
> > >>>>
> > >>>> This mainly happens because when
> > >>>> i915_gem_object_pin_to_display_plane()
> > >>>> is called to pin one of the FB objs, the associated vma is
> > >>>> identified as misplaced and therefore i915_vma_unbind() is called
> > >>>> which unbinds and evicts it. This misplaced vma gets subseqently
> > >>>> pinned only when
> > >>>> i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This
> > >>>> results in a latency of ~10ms and happens every other vblank/repaint cycle.
> > >>>> Therefore, to fix this issue, we try to see if there is space to
> > >>>> map at-least two objects of a given size and return early if there
> > >>>> isn't. This would ensure that we do not try with PIN_MAPPABLE for
> > >>>> any objects that are too big to map thereby preventing unncessary unbind.
> > >>>>
> > >>>> Testcase:
> > >>>> Running Weston and weston-simple-egl on an Alderlake_S (ADLS)
> > >>>> platform with a 8K@60 mode results in only ~40 FPS. Since upstream
> > >>>> Weston submits a frame ~7ms before the next vblank, the latencies
> > >>>> seen between atomic commit and flip event are 7, 24 (7 + 16.66), 7,
> > >>>> 24..... suggesting that it misses the vblank every other frame.
> > >>>>
> > >>>> Here is the ftrace snippet that shows the source of the ~10ms latency:
> > >>>>                 i915_gem_object_pin_to_display_plane() {
> > >>>> 0.102 us   |    i915_gem_object_set_cache_level();
> > >>>>                   i915_gem_object_ggtt_pin_ww() {
> > >>>> 0.390 us   |      i915_vma_instance();
> > >>>> 0.178 us   |      i915_vma_misplaced();
> > >>>>                     i915_vma_unbind() {
> > >>>>                     __i915_active_wait() {
> > >>>> 0.082 us   |        i915_active_acquire_if_busy();
> > >>>> 0.475 us   |      }
> > >>>>                     intel_runtime_pm_get() {
> > >>>> 0.087 us   |        intel_runtime_pm_acquire();
> > >>>> 0.259 us   |      }
> > >>>>                     __i915_active_wait() {
> > >>>> 0.085 us   |        i915_active_acquire_if_busy();
> > >>>> 0.240 us   |      }
> > >>>>                     __i915_vma_evict() {
> > >>>>                       ggtt_unbind_vma() {
> > >>>>                         gen8_ggtt_clear_range() {
> > >>>> 10507.255 us |        }
> > >>>> 10507.689 us |      }
> > >>>> 10508.516 us |   }
> > >>>>
> > >>>> v2: Instead of using bigjoiner checks, determine whether a scanout
> > >>>>       buffer is too big by checking to see if it is possible to map
> > >>>>       two of them into the ggtt.
> > >>>>
> > >>>> v3 (Ville):
> > >>>> - Count how many fb objects can be fit into the available holes
> > >>>>     instead of checking for a hole twice the object size.
> > >>>> - Take alignment constraints into account.
> > >>>> - Limit this large scanout buffer check to >= Gen 11 platforms.
> > >>>>
> > >>>> v4:
> > >>>> - Remove existing heuristic that checks just for size. (Ville)
> > >>>> - Return early if we find space to map at-least two objects.
> > >>>> (Tvrtko)
> > >>>> - Slightly update the commit message.
> > >>>>
> > >>>> v5: (Tvrtko)
> > >>>> - Rename the function to indicate that the object may be too big to
> > >>>>     map into the aperture.
> > >>>> - Account for guard pages while calculating the total size required
> > >>>>     for the object.
> > >>>> - Do not subject all objects to the heuristic check and instead
> > >>>>     consider objects only of a certain size.
> > >>>> - Do the hole walk using the rbtree.
> > >>>> - Preserve the existing PIN_NONBLOCK logic.
> > >>>> - Drop the PIN_MAPPABLE check while pinning the VMA.
> > >>>>
> > >>>> v6: (Tvrtko)
> > >>>> - Return 0 on success and the specific error code on failure to
> > >>>>     preserve the existing behavior.
> > >>>>
> > >>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> > >>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> > >>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > >>>> Cc: Manasi Navare <manasi.d.navare@intel.com>
> > >>>> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> > >>>> ---
> > >>>>    drivers/gpu/drm/i915/i915_gem.c | 120 ++++++++++++++++++++++++--------
> > >>>>    1 file changed, 90 insertions(+), 30 deletions(-)
> > >>>>
> > >>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> > >>>> b/drivers/gpu/drm/i915/i915_gem.c index e3a2c2a0e156..39f0d17550c3
> > >>>> 100644
> > >>>> --- a/drivers/gpu/drm/i915/i915_gem.c
> > >>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
> > >>>> @@ -46,6 +46,7 @@
> > >>>>    #include "gem/i915_gem_mman.h"
> > >>>>    #include "gem/i915_gem_region.h"
> > >>>>    #include "gem/i915_gem_userptr.h"
> > >>>> +#include "gem/i915_gem_tiling.h"
> > >>>>    #include "gt/intel_engine_user.h"
> > >>>>    #include "gt/intel_gt.h"
> > >>>>    #include "gt/intel_gt_pm.h"
> > >>>> @@ -876,6 +877,92 @@ static void discard_ggtt_vma(struct i915_vma *vma)
> > >>>>    	spin_unlock(&obj->vma.lock);
> > >>>>    }
> > >>>>
> > >>>> +static int
> > >>>> +i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj,
> > >>>> +				 u64 alignment, u64 flags)
> > >>>> +{
> > >>>> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> > >>>> +	struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
> > >>>> +	struct drm_mm_node *hole;
> > >>>> +	u64 hole_start, hole_end, start, end;
> > >>>> +	u64 fence_size, fence_alignment;
> > >>>> +	unsigned int count = 0;
> > >>>> +
> > >>>> +	/*
> > >>>> +	 * If the required space is larger than the available
> > >>>> +	 * aperture, we will not able to find a slot for the
> > >>>> +	 * object and unbinding the object now will be in
> > >>>> +	 * vain. Worse, doing so may cause us to ping-pong
> > >>>> +	 * the object in and out of the Global GTT and
> > >>>> +	 * waste a lot of cycles under the mutex.
> > >>>> +	 */
> > >>>> +	if (obj->base.size > ggtt->mappable_end)
> > >>>> +		return -E2BIG;
> > >>>> +
> > >>>> +	/*
> > >>>> +	 * If NONBLOCK is set the caller is optimistically
> > >>>> +	 * trying to cache the full object within the mappable
> > >>>> +	 * aperture, and *must* have a fallback in place for
> > >>>> +	 * situations where we cannot bind the object. We
> > >>>> +	 * can be a little more lax here and use the fallback
> > >>>> +	 * more often to avoid costly migrations of ourselves
> > >>>> +	 * and other objects within the aperture.
> > >>>> +	 */
> > >>>> +	if (!(flags & PIN_NONBLOCK))
> > >>>> +		return 0;
> > >>>> +
> > >>>> +	/*
> > >>>> +	 * We only consider objects whose size is at-least a quarter of
> > >>>> +	 * the aperture to be too big and subject them to the new
> > >>>> +	 * heuristic below.
> > >>>> +	 */
> > >>>> +	if (obj->base.size < ggtt->mappable_end / 4)
> > >>>> +		return 0;
> > >>>
> > >>> That seems a fairly arbitrary thing to put here. Maybe something the
> > >>> caller should check/specify?
> > >>
> > >> I have no strong opinion on this one. In my mind I categorised it
> > >> under "is it a large framebuffer" heuristics. Previously it was less
> > >> than one half of aperture always okay, now one quarter, plus 2x hole
> > >> check if larger. Both are heuristics. I even mentioned earlier if 2x
> > >> should be an input parameter as well, but again, given it's not an
> > >> exported function couldn't really justify it.
> > >
> > > Is there any point in even having this extra check? If we don't think
> > > checking this is worth the hassle then why call the function at all?
> > 
> > The "/4" one? It was my suggestion to avoid the hole search if we can know based on size
> > it cannot be a frame buffer that would be affected by the ping-ping problem. Granted that
> > was before the rbtree hole search, when it was traversing the un-ordered linked list of
> > holes. What is the correct size threshold I don't know.
> > 
> > >>>> +
> > >>>> +	if (HAS_GMCH(i915) || DISPLAY_VER(i915) < 11 ||
> > >>>> +	    !i915_gem_object_is_framebuffer(obj))
> > >>>> +		return 0;
> > >>>
> > >>> None of that seems appropriate for a generic gem function with this
> > >>> name.
> > >>
> > >> It's not exported though, maybe remove i915_gem prefix to avoid any
> > >> ideas of it being generic?
> > >
> > > These checks don't even seem to doing anything useful. HAS_GMCH should
> > > already be covered by always setting PIN_MAPPABLE and hence O_NONBLOCK
> [Kasireddy, Vivek] I can drop the HAS_GMCH(i915) check given that it is redundant.
> 
> > > is never even tried, the pre-icl vs. icl+ check should not exist at
> [Kasireddy, Vivek] My aim was to narrow down the list of situations in which the
> ping-pong problem becomes more pronounced and may lead to performance 
> issues. Therefore, I added the DISPLAY_VER(i915) check since 8K/bigjoiner is
> feasible only on those newer platforms. 

Like I said before bigjoiner is irrelevant. The only thing that
matters is the size of the mapping vs. mappable aperture size.

> 
> > > all IMO, and if this is only called for framebuffers then why does the
> > > code pretend that is not the case?
> [Kasireddy, Vivek] Oh, I added the i915_gem_object_is_framebuffer() check after I 
> found that there are other callers (for example, reloc_iomap() in i915_gem_execbuffer.c)
> of i915_gem_object_ggtt_pin_ww() that may not be working on an fb.
> 
> Also, I figured size < ggtt->mappable_end / 4 or a similar check is needed as we do
> not want to subject all FBs through this performance critical path. 

Why not?

-- 
Ville Syrjälä
Intel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6)
  2022-02-08  5:43                 ` Ville Syrjälä
@ 2022-02-09  1:47                   ` Kasireddy, Vivek
  2022-02-09  7:42                     ` Ville Syrjälä
  0 siblings, 1 reply; 19+ messages in thread
From: Kasireddy, Vivek @ 2022-02-09  1:47 UTC (permalink / raw)
  To: Ville Syrjälä; +Cc: intel-gfx

Hi Ville,

> 
> On Tue, Feb 08, 2022 at 05:10:42AM +0000, Kasireddy, Vivek wrote:
> > Hi Tvrtko, Ville,
> >
> > > On 07/02/2022 13:24, Ville Syrjälä wrote:
> > > > On Mon, Feb 07, 2022 at 11:47:16AM +0000, Tvrtko Ursulin wrote:
> > > >>
> > > >> On 07/02/2022 10:58, Ville Syrjälä wrote:
> > > >>> On Thu, Feb 03, 2022 at 05:22:10PM -0800, Vivek Kasireddy wrote:
> > > >>>> On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2
> > > >>>> or more framebuffers/scanout buffers results in only one that is
> > > >>>> mappable/ fenceable. Therefore, pageflipping between these 2 FBs
> > > >>>> where only one is mappable/fenceable creates latencies large enough
> > > >>>> to miss alternate vblanks thereby producing less optimal framerate.
> > > >>>>
> > > >>>> This mainly happens because when
> > > >>>> i915_gem_object_pin_to_display_plane()
> > > >>>> is called to pin one of the FB objs, the associated vma is
> > > >>>> identified as misplaced and therefore i915_vma_unbind() is called
> > > >>>> which unbinds and evicts it. This misplaced vma gets subseqently
> > > >>>> pinned only when
> > > >>>> i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This
> > > >>>> results in a latency of ~10ms and happens every other vblank/repaint cycle.
> > > >>>> Therefore, to fix this issue, we try to see if there is space to
> > > >>>> map at-least two objects of a given size and return early if there
> > > >>>> isn't. This would ensure that we do not try with PIN_MAPPABLE for
> > > >>>> any objects that are too big to map thereby preventing unncessary unbind.
> > > >>>>
> > > >>>> Testcase:
> > > >>>> Running Weston and weston-simple-egl on an Alderlake_S (ADLS)
> > > >>>> platform with a 8K@60 mode results in only ~40 FPS. Since upstream
> > > >>>> Weston submits a frame ~7ms before the next vblank, the latencies
> > > >>>> seen between atomic commit and flip event are 7, 24 (7 + 16.66), 7,
> > > >>>> 24..... suggesting that it misses the vblank every other frame.
> > > >>>>
> > > >>>> Here is the ftrace snippet that shows the source of the ~10ms latency:
> > > >>>>                 i915_gem_object_pin_to_display_plane() {
> > > >>>> 0.102 us   |    i915_gem_object_set_cache_level();
> > > >>>>                   i915_gem_object_ggtt_pin_ww() {
> > > >>>> 0.390 us   |      i915_vma_instance();
> > > >>>> 0.178 us   |      i915_vma_misplaced();
> > > >>>>                     i915_vma_unbind() {
> > > >>>>                     __i915_active_wait() {
> > > >>>> 0.082 us   |        i915_active_acquire_if_busy();
> > > >>>> 0.475 us   |      }
> > > >>>>                     intel_runtime_pm_get() {
> > > >>>> 0.087 us   |        intel_runtime_pm_acquire();
> > > >>>> 0.259 us   |      }
> > > >>>>                     __i915_active_wait() {
> > > >>>> 0.085 us   |        i915_active_acquire_if_busy();
> > > >>>> 0.240 us   |      }
> > > >>>>                     __i915_vma_evict() {
> > > >>>>                       ggtt_unbind_vma() {
> > > >>>>                         gen8_ggtt_clear_range() {
> > > >>>> 10507.255 us |        }
> > > >>>> 10507.689 us |      }
> > > >>>> 10508.516 us |   }
> > > >>>>
> > > >>>> v2: Instead of using bigjoiner checks, determine whether a scanout
> > > >>>>       buffer is too big by checking to see if it is possible to map
> > > >>>>       two of them into the ggtt.
> > > >>>>
> > > >>>> v3 (Ville):
> > > >>>> - Count how many fb objects can be fit into the available holes
> > > >>>>     instead of checking for a hole twice the object size.
> > > >>>> - Take alignment constraints into account.
> > > >>>> - Limit this large scanout buffer check to >= Gen 11 platforms.
> > > >>>>
> > > >>>> v4:
> > > >>>> - Remove existing heuristic that checks just for size. (Ville)
> > > >>>> - Return early if we find space to map at-least two objects.
> > > >>>> (Tvrtko)
> > > >>>> - Slightly update the commit message.
> > > >>>>
> > > >>>> v5: (Tvrtko)
> > > >>>> - Rename the function to indicate that the object may be too big to
> > > >>>>     map into the aperture.
> > > >>>> - Account for guard pages while calculating the total size required
> > > >>>>     for the object.
> > > >>>> - Do not subject all objects to the heuristic check and instead
> > > >>>>     consider objects only of a certain size.
> > > >>>> - Do the hole walk using the rbtree.
> > > >>>> - Preserve the existing PIN_NONBLOCK logic.
> > > >>>> - Drop the PIN_MAPPABLE check while pinning the VMA.
> > > >>>>
> > > >>>> v6: (Tvrtko)
> > > >>>> - Return 0 on success and the specific error code on failure to
> > > >>>>     preserve the existing behavior.
> > > >>>>
> > > >>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> > > >>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> > > >>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > > >>>> Cc: Manasi Navare <manasi.d.navare@intel.com>
> > > >>>> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> > > >>>> ---
> > > >>>>    drivers/gpu/drm/i915/i915_gem.c | 120 ++++++++++++++++++++++++-----
> ---
> > > >>>>    1 file changed, 90 insertions(+), 30 deletions(-)
> > > >>>>
> > > >>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> > > >>>> b/drivers/gpu/drm/i915/i915_gem.c index e3a2c2a0e156..39f0d17550c3
> > > >>>> 100644
> > > >>>> --- a/drivers/gpu/drm/i915/i915_gem.c
> > > >>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
> > > >>>> @@ -46,6 +46,7 @@
> > > >>>>    #include "gem/i915_gem_mman.h"
> > > >>>>    #include "gem/i915_gem_region.h"
> > > >>>>    #include "gem/i915_gem_userptr.h"
> > > >>>> +#include "gem/i915_gem_tiling.h"
> > > >>>>    #include "gt/intel_engine_user.h"
> > > >>>>    #include "gt/intel_gt.h"
> > > >>>>    #include "gt/intel_gt_pm.h"
> > > >>>> @@ -876,6 +877,92 @@ static void discard_ggtt_vma(struct i915_vma *vma)
> > > >>>>    	spin_unlock(&obj->vma.lock);
> > > >>>>    }
> > > >>>>
> > > >>>> +static int
> > > >>>> +i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj,
> > > >>>> +				 u64 alignment, u64 flags)
> > > >>>> +{
> > > >>>> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> > > >>>> +	struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
> > > >>>> +	struct drm_mm_node *hole;
> > > >>>> +	u64 hole_start, hole_end, start, end;
> > > >>>> +	u64 fence_size, fence_alignment;
> > > >>>> +	unsigned int count = 0;
> > > >>>> +
> > > >>>> +	/*
> > > >>>> +	 * If the required space is larger than the available
> > > >>>> +	 * aperture, we will not able to find a slot for the
> > > >>>> +	 * object and unbinding the object now will be in
> > > >>>> +	 * vain. Worse, doing so may cause us to ping-pong
> > > >>>> +	 * the object in and out of the Global GTT and
> > > >>>> +	 * waste a lot of cycles under the mutex.
> > > >>>> +	 */
> > > >>>> +	if (obj->base.size > ggtt->mappable_end)
> > > >>>> +		return -E2BIG;
> > > >>>> +
> > > >>>> +	/*
> > > >>>> +	 * If NONBLOCK is set the caller is optimistically
> > > >>>> +	 * trying to cache the full object within the mappable
> > > >>>> +	 * aperture, and *must* have a fallback in place for
> > > >>>> +	 * situations where we cannot bind the object. We
> > > >>>> +	 * can be a little more lax here and use the fallback
> > > >>>> +	 * more often to avoid costly migrations of ourselves
> > > >>>> +	 * and other objects within the aperture.
> > > >>>> +	 */
> > > >>>> +	if (!(flags & PIN_NONBLOCK))
> > > >>>> +		return 0;
> > > >>>> +
> > > >>>> +	/*
> > > >>>> +	 * We only consider objects whose size is at-least a quarter of
> > > >>>> +	 * the aperture to be too big and subject them to the new
> > > >>>> +	 * heuristic below.
> > > >>>> +	 */
> > > >>>> +	if (obj->base.size < ggtt->mappable_end / 4)
> > > >>>> +		return 0;
> > > >>>
> > > >>> That seems a fairly arbitrary thing to put here. Maybe something the
> > > >>> caller should check/specify?
> > > >>
> > > >> I have no strong opinion on this one. In my mind I categorised it
> > > >> under "is it a large framebuffer" heuristics. Previously it was less
> > > >> than one half of aperture always okay, now one quarter, plus 2x hole
> > > >> check if larger. Both are heuristics. I even mentioned earlier if 2x
> > > >> should be an input parameter as well, but again, given it's not an
> > > >> exported function couldn't really justify it.
> > > >
> > > > Is there any point in even having this extra check? If we don't think
> > > > checking this is worth the hassle then why call the function at all?
> > >
> > > The "/4" one? It was my suggestion to avoid the hole search if we can know based on
> size
> > > it cannot be a frame buffer that would be affected by the ping-ping problem. Granted
> that
> > > was before the rbtree hole search, when it was traversing the un-ordered linked list of
> > > holes. What is the correct size threshold I don't know.
> > >
> > > >>>> +
> > > >>>> +	if (HAS_GMCH(i915) || DISPLAY_VER(i915) < 11 ||
> > > >>>> +	    !i915_gem_object_is_framebuffer(obj))
> > > >>>> +		return 0;
> > > >>>
> > > >>> None of that seems appropriate for a generic gem function with this
> > > >>> name.
> > > >>
> > > >> It's not exported though, maybe remove i915_gem prefix to avoid any
> > > >> ideas of it being generic?
> > > >
> > > > These checks don't even seem to doing anything useful. HAS_GMCH should
> > > > already be covered by always setting PIN_MAPPABLE and hence O_NONBLOCK
> > [Kasireddy, Vivek] I can drop the HAS_GMCH(i915) check given that it is redundant.
> >
> > > > is never even tried, the pre-icl vs. icl+ check should not exist at
> > [Kasireddy, Vivek] My aim was to narrow down the list of situations in which the
> > ping-pong problem becomes more pronounced and may lead to performance
> > issues. Therefore, I added the DISPLAY_VER(i915) check since 8K/bigjoiner is
> > feasible only on those newer platforms.
> 
> Like I said before bigjoiner is irrelevant. The only thing that
> matters is the size of the mapping vs. mappable aperture size.
[Kasireddy, Vivek] Ok, got it.

> 
> >
> > > > all IMO, and if this is only called for framebuffers then why does the
> > > > code pretend that is not the case?
> > [Kasireddy, Vivek] Oh, I added the i915_gem_object_is_framebuffer() check after I
> > found that there are other callers (for example, reloc_iomap() in i915_gem_execbuffer.c)
> > of i915_gem_object_ggtt_pin_ww() that may not be working on an fb.
> >
> > Also, I figured size < ggtt->mappable_end / 4 or a similar check is needed as we do
> > not want to subject all FBs through this performance critical path.
> 
> Why not?
[Kasireddy, Vivek] Oh, I just thought that it makes sense to avoid the expensive hole search
for smaller FBs that are unlikely to exhaust the mappable aperture space. And, I also wanted
to preserve the behavior prior to this patch. However, I guess I could drop this check as well
given that the hole search via rbtree traversal may not be that bad in terms of time.

What about the i915_gem_object_is_framebuffer() check? Should I keep it given the smaller
size and transient nature of batchbuffers that go through this path?

Thanks,
Vivek

> 
> --
> Ville Syrjälä
> Intel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6)
  2022-02-09  1:47                   ` Kasireddy, Vivek
@ 2022-02-09  7:42                     ` Ville Syrjälä
  2022-02-10  1:41                       ` Kasireddy, Vivek
  0 siblings, 1 reply; 19+ messages in thread
From: Ville Syrjälä @ 2022-02-09  7:42 UTC (permalink / raw)
  To: Kasireddy, Vivek; +Cc: intel-gfx

On Wed, Feb 09, 2022 at 01:47:01AM +0000, Kasireddy, Vivek wrote:
> Hi Ville,
> 
> > 
> > On Tue, Feb 08, 2022 at 05:10:42AM +0000, Kasireddy, Vivek wrote:
> > > Hi Tvrtko, Ville,
> > >
> > > > On 07/02/2022 13:24, Ville Syrjälä wrote:
> > > > > On Mon, Feb 07, 2022 at 11:47:16AM +0000, Tvrtko Ursulin wrote:
> > > > >>
> > > > >> On 07/02/2022 10:58, Ville Syrjälä wrote:
> > > > >>> On Thu, Feb 03, 2022 at 05:22:10PM -0800, Vivek Kasireddy wrote:
> > > > >>>> On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2
> > > > >>>> or more framebuffers/scanout buffers results in only one that is
> > > > >>>> mappable/ fenceable. Therefore, pageflipping between these 2 FBs
> > > > >>>> where only one is mappable/fenceable creates latencies large enough
> > > > >>>> to miss alternate vblanks thereby producing less optimal framerate.
> > > > >>>>
> > > > >>>> This mainly happens because when
> > > > >>>> i915_gem_object_pin_to_display_plane()
> > > > >>>> is called to pin one of the FB objs, the associated vma is
> > > > >>>> identified as misplaced and therefore i915_vma_unbind() is called
> > > > >>>> which unbinds and evicts it. This misplaced vma gets subseqently
> > > > >>>> pinned only when
> > > > >>>> i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This
> > > > >>>> results in a latency of ~10ms and happens every other vblank/repaint cycle.
> > > > >>>> Therefore, to fix this issue, we try to see if there is space to
> > > > >>>> map at-least two objects of a given size and return early if there
> > > > >>>> isn't. This would ensure that we do not try with PIN_MAPPABLE for
> > > > >>>> any objects that are too big to map thereby preventing unncessary unbind.
> > > > >>>>
> > > > >>>> Testcase:
> > > > >>>> Running Weston and weston-simple-egl on an Alderlake_S (ADLS)
> > > > >>>> platform with a 8K@60 mode results in only ~40 FPS. Since upstream
> > > > >>>> Weston submits a frame ~7ms before the next vblank, the latencies
> > > > >>>> seen between atomic commit and flip event are 7, 24 (7 + 16.66), 7,
> > > > >>>> 24..... suggesting that it misses the vblank every other frame.
> > > > >>>>
> > > > >>>> Here is the ftrace snippet that shows the source of the ~10ms latency:
> > > > >>>>                 i915_gem_object_pin_to_display_plane() {
> > > > >>>> 0.102 us   |    i915_gem_object_set_cache_level();
> > > > >>>>                   i915_gem_object_ggtt_pin_ww() {
> > > > >>>> 0.390 us   |      i915_vma_instance();
> > > > >>>> 0.178 us   |      i915_vma_misplaced();
> > > > >>>>                     i915_vma_unbind() {
> > > > >>>>                     __i915_active_wait() {
> > > > >>>> 0.082 us   |        i915_active_acquire_if_busy();
> > > > >>>> 0.475 us   |      }
> > > > >>>>                     intel_runtime_pm_get() {
> > > > >>>> 0.087 us   |        intel_runtime_pm_acquire();
> > > > >>>> 0.259 us   |      }
> > > > >>>>                     __i915_active_wait() {
> > > > >>>> 0.085 us   |        i915_active_acquire_if_busy();
> > > > >>>> 0.240 us   |      }
> > > > >>>>                     __i915_vma_evict() {
> > > > >>>>                       ggtt_unbind_vma() {
> > > > >>>>                         gen8_ggtt_clear_range() {
> > > > >>>> 10507.255 us |        }
> > > > >>>> 10507.689 us |      }
> > > > >>>> 10508.516 us |   }
> > > > >>>>
> > > > >>>> v2: Instead of using bigjoiner checks, determine whether a scanout
> > > > >>>>       buffer is too big by checking to see if it is possible to map
> > > > >>>>       two of them into the ggtt.
> > > > >>>>
> > > > >>>> v3 (Ville):
> > > > >>>> - Count how many fb objects can be fit into the available holes
> > > > >>>>     instead of checking for a hole twice the object size.
> > > > >>>> - Take alignment constraints into account.
> > > > >>>> - Limit this large scanout buffer check to >= Gen 11 platforms.
> > > > >>>>
> > > > >>>> v4:
> > > > >>>> - Remove existing heuristic that checks just for size. (Ville)
> > > > >>>> - Return early if we find space to map at-least two objects.
> > > > >>>> (Tvrtko)
> > > > >>>> - Slightly update the commit message.
> > > > >>>>
> > > > >>>> v5: (Tvrtko)
> > > > >>>> - Rename the function to indicate that the object may be too big to
> > > > >>>>     map into the aperture.
> > > > >>>> - Account for guard pages while calculating the total size required
> > > > >>>>     for the object.
> > > > >>>> - Do not subject all objects to the heuristic check and instead
> > > > >>>>     consider objects only of a certain size.
> > > > >>>> - Do the hole walk using the rbtree.
> > > > >>>> - Preserve the existing PIN_NONBLOCK logic.
> > > > >>>> - Drop the PIN_MAPPABLE check while pinning the VMA.
> > > > >>>>
> > > > >>>> v6: (Tvrtko)
> > > > >>>> - Return 0 on success and the specific error code on failure to
> > > > >>>>     preserve the existing behavior.
> > > > >>>>
> > > > >>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> > > > >>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> > > > >>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > > > >>>> Cc: Manasi Navare <manasi.d.navare@intel.com>
> > > > >>>> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> > > > >>>> ---
> > > > >>>>    drivers/gpu/drm/i915/i915_gem.c | 120 ++++++++++++++++++++++++-----
> > ---
> > > > >>>>    1 file changed, 90 insertions(+), 30 deletions(-)
> > > > >>>>
> > > > >>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> > > > >>>> b/drivers/gpu/drm/i915/i915_gem.c index e3a2c2a0e156..39f0d17550c3
> > > > >>>> 100644
> > > > >>>> --- a/drivers/gpu/drm/i915/i915_gem.c
> > > > >>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
> > > > >>>> @@ -46,6 +46,7 @@
> > > > >>>>    #include "gem/i915_gem_mman.h"
> > > > >>>>    #include "gem/i915_gem_region.h"
> > > > >>>>    #include "gem/i915_gem_userptr.h"
> > > > >>>> +#include "gem/i915_gem_tiling.h"
> > > > >>>>    #include "gt/intel_engine_user.h"
> > > > >>>>    #include "gt/intel_gt.h"
> > > > >>>>    #include "gt/intel_gt_pm.h"
> > > > >>>> @@ -876,6 +877,92 @@ static void discard_ggtt_vma(struct i915_vma *vma)
> > > > >>>>    	spin_unlock(&obj->vma.lock);
> > > > >>>>    }
> > > > >>>>
> > > > >>>> +static int
> > > > >>>> +i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj,
> > > > >>>> +				 u64 alignment, u64 flags)
> > > > >>>> +{
> > > > >>>> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> > > > >>>> +	struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
> > > > >>>> +	struct drm_mm_node *hole;
> > > > >>>> +	u64 hole_start, hole_end, start, end;
> > > > >>>> +	u64 fence_size, fence_alignment;
> > > > >>>> +	unsigned int count = 0;
> > > > >>>> +
> > > > >>>> +	/*
> > > > >>>> +	 * If the required space is larger than the available
> > > > >>>> +	 * aperture, we will not able to find a slot for the
> > > > >>>> +	 * object and unbinding the object now will be in
> > > > >>>> +	 * vain. Worse, doing so may cause us to ping-pong
> > > > >>>> +	 * the object in and out of the Global GTT and
> > > > >>>> +	 * waste a lot of cycles under the mutex.
> > > > >>>> +	 */
> > > > >>>> +	if (obj->base.size > ggtt->mappable_end)
> > > > >>>> +		return -E2BIG;
> > > > >>>> +
> > > > >>>> +	/*
> > > > >>>> +	 * If NONBLOCK is set the caller is optimistically
> > > > >>>> +	 * trying to cache the full object within the mappable
> > > > >>>> +	 * aperture, and *must* have a fallback in place for
> > > > >>>> +	 * situations where we cannot bind the object. We
> > > > >>>> +	 * can be a little more lax here and use the fallback
> > > > >>>> +	 * more often to avoid costly migrations of ourselves
> > > > >>>> +	 * and other objects within the aperture.
> > > > >>>> +	 */
> > > > >>>> +	if (!(flags & PIN_NONBLOCK))
> > > > >>>> +		return 0;
> > > > >>>> +
> > > > >>>> +	/*
> > > > >>>> +	 * We only consider objects whose size is at-least a quarter of
> > > > >>>> +	 * the aperture to be too big and subject them to the new
> > > > >>>> +	 * heuristic below.
> > > > >>>> +	 */
> > > > >>>> +	if (obj->base.size < ggtt->mappable_end / 4)
> > > > >>>> +		return 0;
> > > > >>>
> > > > >>> That seems a fairly arbitrary thing to put here. Maybe something the
> > > > >>> caller should check/specify?
> > > > >>
> > > > >> I have no strong opinion on this one. In my mind I categorised it
> > > > >> under "is it a large framebuffer" heuristics. Previously it was less
> > > > >> than one half of aperture always okay, now one quarter, plus 2x hole
> > > > >> check if larger. Both are heuristics. I even mentioned earlier if 2x
> > > > >> should be an input parameter as well, but again, given it's not an
> > > > >> exported function couldn't really justify it.
> > > > >
> > > > > Is there any point in even having this extra check? If we don't think
> > > > > checking this is worth the hassle then why call the function at all?
> > > >
> > > > The "/4" one? It was my suggestion to avoid the hole search if we can know based on
> > size
> > > > it cannot be a frame buffer that would be affected by the ping-ping problem. Granted
> > that
> > > > was before the rbtree hole search, when it was traversing the un-ordered linked list of
> > > > holes. What is the correct size threshold I don't know.
> > > >
> > > > >>>> +
> > > > >>>> +	if (HAS_GMCH(i915) || DISPLAY_VER(i915) < 11 ||
> > > > >>>> +	    !i915_gem_object_is_framebuffer(obj))
> > > > >>>> +		return 0;
> > > > >>>
> > > > >>> None of that seems appropriate for a generic gem function with this
> > > > >>> name.
> > > > >>
> > > > >> It's not exported though, maybe remove i915_gem prefix to avoid any
> > > > >> ideas of it being generic?
> > > > >
> > > > > These checks don't even seem to doing anything useful. HAS_GMCH should
> > > > > already be covered by always setting PIN_MAPPABLE and hence O_NONBLOCK
> > > [Kasireddy, Vivek] I can drop the HAS_GMCH(i915) check given that it is redundant.
> > >
> > > > > is never even tried, the pre-icl vs. icl+ check should not exist at
> > > [Kasireddy, Vivek] My aim was to narrow down the list of situations in which the
> > > ping-pong problem becomes more pronounced and may lead to performance
> > > issues. Therefore, I added the DISPLAY_VER(i915) check since 8K/bigjoiner is
> > > feasible only on those newer platforms.
> > 
> > Like I said before bigjoiner is irrelevant. The only thing that
> > matters is the size of the mapping vs. mappable aperture size.
> [Kasireddy, Vivek] Ok, got it.
> 
> > 
> > >
> > > > > all IMO, and if this is only called for framebuffers then why does the
> > > > > code pretend that is not the case?
> > > [Kasireddy, Vivek] Oh, I added the i915_gem_object_is_framebuffer() check after I
> > > found that there are other callers (for example, reloc_iomap() in i915_gem_execbuffer.c)
> > > of i915_gem_object_ggtt_pin_ww() that may not be working on an fb.
> > >
> > > Also, I figured size < ggtt->mappable_end / 4 or a similar check is needed as we do
> > > not want to subject all FBs through this performance critical path.
> > 
> > Why not?
> [Kasireddy, Vivek] Oh, I just thought that it makes sense to avoid the expensive hole search
> for smaller FBs that are unlikely to exhaust the mappable aperture space. And, I also wanted
> to preserve the behavior prior to this patch. However, I guess I could drop this check as well
> given that the hole search via rbtree traversal may not be that bad in terms of time.
> 
> What about the i915_gem_object_is_framebuffer() check? Should I keep it given the smaller
> size and transient nature of batchbuffers that go through this path?

Hmm. I guess if we don't have any real idea on the cost of this then
maybe we should not do it there, for the moment at least. I'm just
pondering if this whole thing should just be a seaprate function
each current caller of i915_gem_object_ggtt_pin_ww() should call
explicitly... But maybe we want framebuffers in partcular to undergo
the same restrictions always whether it's execbuffer or display
activity that's trying to pin them.

However, now that I think about this I'm wondering if we should do
all this stuff a bit later. I mean why are we even bothering with
these checks if the vma is already bound and not misplaced? Seems
the vma_is_misplaced() case already has the same mappable_size/2
check so that should probably be replaced with this thing, and then
additionally we could do something like this:

  if (vma_is_miplaced()) {
  	...
  	if (flags & PIN_NONBLOCK)
  		already have a check here;
+ } else if (!vma_is_bound()) {
+ 	if (flags & PIN_NONBLOCK)
+ 		check here too;
+ }

-- 
Ville Syrjälä
Intel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6)
  2022-02-09  7:42                     ` Ville Syrjälä
@ 2022-02-10  1:41                       ` Kasireddy, Vivek
  0 siblings, 0 replies; 19+ messages in thread
From: Kasireddy, Vivek @ 2022-02-10  1:41 UTC (permalink / raw)
  To: Ville Syrjälä; +Cc: intel-gfx

Hi Ville,

> On Wed, Feb 09, 2022 at 01:47:01AM +0000, Kasireddy, Vivek wrote:
> > Hi Ville,
> >
> > >
> > > On Tue, Feb 08, 2022 at 05:10:42AM +0000, Kasireddy, Vivek wrote:
> > > > Hi Tvrtko, Ville,
> > > >
> > > > > On 07/02/2022 13:24, Ville Syrjälä wrote:
> > > > > > On Mon, Feb 07, 2022 at 11:47:16AM +0000, Tvrtko Ursulin wrote:
> > > > > >>
> > > > > >> On 07/02/2022 10:58, Ville Syrjälä wrote:
> > > > > >>> On Thu, Feb 03, 2022 at 05:22:10PM -0800, Vivek Kasireddy wrote:
> > > > > >>>> On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2
> > > > > >>>> or more framebuffers/scanout buffers results in only one that is
> > > > > >>>> mappable/ fenceable. Therefore, pageflipping between these 2 FBs
> > > > > >>>> where only one is mappable/fenceable creates latencies large enough
> > > > > >>>> to miss alternate vblanks thereby producing less optimal framerate.
> > > > > >>>>
> > > > > >>>> This mainly happens because when
> > > > > >>>> i915_gem_object_pin_to_display_plane()
> > > > > >>>> is called to pin one of the FB objs, the associated vma is
> > > > > >>>> identified as misplaced and therefore i915_vma_unbind() is called
> > > > > >>>> which unbinds and evicts it. This misplaced vma gets subseqently
> > > > > >>>> pinned only when
> > > > > >>>> i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This
> > > > > >>>> results in a latency of ~10ms and happens every other vblank/repaint cycle.
> > > > > >>>> Therefore, to fix this issue, we try to see if there is space to
> > > > > >>>> map at-least two objects of a given size and return early if there
> > > > > >>>> isn't. This would ensure that we do not try with PIN_MAPPABLE for
> > > > > >>>> any objects that are too big to map thereby preventing unncessary unbind.
> > > > > >>>>
> > > > > >>>> Testcase:
> > > > > >>>> Running Weston and weston-simple-egl on an Alderlake_S (ADLS)
> > > > > >>>> platform with a 8K@60 mode results in only ~40 FPS. Since upstream
> > > > > >>>> Weston submits a frame ~7ms before the next vblank, the latencies
> > > > > >>>> seen between atomic commit and flip event are 7, 24 (7 + 16.66), 7,
> > > > > >>>> 24..... suggesting that it misses the vblank every other frame.
> > > > > >>>>
> > > > > >>>> Here is the ftrace snippet that shows the source of the ~10ms latency:
> > > > > >>>>                 i915_gem_object_pin_to_display_plane() {
> > > > > >>>> 0.102 us   |    i915_gem_object_set_cache_level();
> > > > > >>>>                   i915_gem_object_ggtt_pin_ww() {
> > > > > >>>> 0.390 us   |      i915_vma_instance();
> > > > > >>>> 0.178 us   |      i915_vma_misplaced();
> > > > > >>>>                     i915_vma_unbind() {
> > > > > >>>>                     __i915_active_wait() {
> > > > > >>>> 0.082 us   |        i915_active_acquire_if_busy();
> > > > > >>>> 0.475 us   |      }
> > > > > >>>>                     intel_runtime_pm_get() {
> > > > > >>>> 0.087 us   |        intel_runtime_pm_acquire();
> > > > > >>>> 0.259 us   |      }
> > > > > >>>>                     __i915_active_wait() {
> > > > > >>>> 0.085 us   |        i915_active_acquire_if_busy();
> > > > > >>>> 0.240 us   |      }
> > > > > >>>>                     __i915_vma_evict() {
> > > > > >>>>                       ggtt_unbind_vma() {
> > > > > >>>>                         gen8_ggtt_clear_range() {
> > > > > >>>> 10507.255 us |        }
> > > > > >>>> 10507.689 us |      }
> > > > > >>>> 10508.516 us |   }
> > > > > >>>>
> > > > > >>>> v2: Instead of using bigjoiner checks, determine whether a scanout
> > > > > >>>>       buffer is too big by checking to see if it is possible to map
> > > > > >>>>       two of them into the ggtt.
> > > > > >>>>
> > > > > >>>> v3 (Ville):
> > > > > >>>> - Count how many fb objects can be fit into the available holes
> > > > > >>>>     instead of checking for a hole twice the object size.
> > > > > >>>> - Take alignment constraints into account.
> > > > > >>>> - Limit this large scanout buffer check to >= Gen 11 platforms.
> > > > > >>>>
> > > > > >>>> v4:
> > > > > >>>> - Remove existing heuristic that checks just for size. (Ville)
> > > > > >>>> - Return early if we find space to map at-least two objects.
> > > > > >>>> (Tvrtko)
> > > > > >>>> - Slightly update the commit message.
> > > > > >>>>
> > > > > >>>> v5: (Tvrtko)
> > > > > >>>> - Rename the function to indicate that the object may be too big to
> > > > > >>>>     map into the aperture.
> > > > > >>>> - Account for guard pages while calculating the total size required
> > > > > >>>>     for the object.
> > > > > >>>> - Do not subject all objects to the heuristic check and instead
> > > > > >>>>     consider objects only of a certain size.
> > > > > >>>> - Do the hole walk using the rbtree.
> > > > > >>>> - Preserve the existing PIN_NONBLOCK logic.
> > > > > >>>> - Drop the PIN_MAPPABLE check while pinning the VMA.
> > > > > >>>>
> > > > > >>>> v6: (Tvrtko)
> > > > > >>>> - Return 0 on success and the specific error code on failure to
> > > > > >>>>     preserve the existing behavior.
> > > > > >>>>
> > > > > >>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> > > > > >>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> > > > > >>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > > > > >>>> Cc: Manasi Navare <manasi.d.navare@intel.com>
> > > > > >>>> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> > > > > >>>> ---
> > > > > >>>>    drivers/gpu/drm/i915/i915_gem.c | 120
> ++++++++++++++++++++++++-----
> > > ---
> > > > > >>>>    1 file changed, 90 insertions(+), 30 deletions(-)
> > > > > >>>>
> > > > > >>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> > > > > >>>> b/drivers/gpu/drm/i915/i915_gem.c index e3a2c2a0e156..39f0d17550c3
> > > > > >>>> 100644
> > > > > >>>> --- a/drivers/gpu/drm/i915/i915_gem.c
> > > > > >>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
> > > > > >>>> @@ -46,6 +46,7 @@
> > > > > >>>>    #include "gem/i915_gem_mman.h"
> > > > > >>>>    #include "gem/i915_gem_region.h"
> > > > > >>>>    #include "gem/i915_gem_userptr.h"
> > > > > >>>> +#include "gem/i915_gem_tiling.h"
> > > > > >>>>    #include "gt/intel_engine_user.h"
> > > > > >>>>    #include "gt/intel_gt.h"
> > > > > >>>>    #include "gt/intel_gt_pm.h"
> > > > > >>>> @@ -876,6 +877,92 @@ static void discard_ggtt_vma(struct i915_vma
> *vma)
> > > > > >>>>    	spin_unlock(&obj->vma.lock);
> > > > > >>>>    }
> > > > > >>>>
> > > > > >>>> +static int
> > > > > >>>> +i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj,
> > > > > >>>> +				 u64 alignment, u64 flags)
> > > > > >>>> +{
> > > > > >>>> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> > > > > >>>> +	struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
> > > > > >>>> +	struct drm_mm_node *hole;
> > > > > >>>> +	u64 hole_start, hole_end, start, end;
> > > > > >>>> +	u64 fence_size, fence_alignment;
> > > > > >>>> +	unsigned int count = 0;
> > > > > >>>> +
> > > > > >>>> +	/*
> > > > > >>>> +	 * If the required space is larger than the available
> > > > > >>>> +	 * aperture, we will not able to find a slot for the
> > > > > >>>> +	 * object and unbinding the object now will be in
> > > > > >>>> +	 * vain. Worse, doing so may cause us to ping-pong
> > > > > >>>> +	 * the object in and out of the Global GTT and
> > > > > >>>> +	 * waste a lot of cycles under the mutex.
> > > > > >>>> +	 */
> > > > > >>>> +	if (obj->base.size > ggtt->mappable_end)
> > > > > >>>> +		return -E2BIG;
> > > > > >>>> +
> > > > > >>>> +	/*
> > > > > >>>> +	 * If NONBLOCK is set the caller is optimistically
> > > > > >>>> +	 * trying to cache the full object within the mappable
> > > > > >>>> +	 * aperture, and *must* have a fallback in place for
> > > > > >>>> +	 * situations where we cannot bind the object. We
> > > > > >>>> +	 * can be a little more lax here and use the fallback
> > > > > >>>> +	 * more often to avoid costly migrations of ourselves
> > > > > >>>> +	 * and other objects within the aperture.
> > > > > >>>> +	 */
> > > > > >>>> +	if (!(flags & PIN_NONBLOCK))
> > > > > >>>> +		return 0;
> > > > > >>>> +
> > > > > >>>> +	/*
> > > > > >>>> +	 * We only consider objects whose size is at-least a quarter of
> > > > > >>>> +	 * the aperture to be too big and subject them to the new
> > > > > >>>> +	 * heuristic below.
> > > > > >>>> +	 */
> > > > > >>>> +	if (obj->base.size < ggtt->mappable_end / 4)
> > > > > >>>> +		return 0;
> > > > > >>>
> > > > > >>> That seems a fairly arbitrary thing to put here. Maybe something the
> > > > > >>> caller should check/specify?
> > > > > >>
> > > > > >> I have no strong opinion on this one. In my mind I categorised it
> > > > > >> under "is it a large framebuffer" heuristics. Previously it was less
> > > > > >> than one half of aperture always okay, now one quarter, plus 2x hole
> > > > > >> check if larger. Both are heuristics. I even mentioned earlier if 2x
> > > > > >> should be an input parameter as well, but again, given it's not an
> > > > > >> exported function couldn't really justify it.
> > > > > >
> > > > > > Is there any point in even having this extra check? If we don't think
> > > > > > checking this is worth the hassle then why call the function at all?
> > > > >
> > > > > The "/4" one? It was my suggestion to avoid the hole search if we can know based
> on
> > > size
> > > > > it cannot be a frame buffer that would be affected by the ping-ping problem.
> Granted
> > > that
> > > > > was before the rbtree hole search, when it was traversing the un-ordered linked list
> of
> > > > > holes. What is the correct size threshold I don't know.
> > > > >
> > > > > >>>> +
> > > > > >>>> +	if (HAS_GMCH(i915) || DISPLAY_VER(i915) < 11 ||
> > > > > >>>> +	    !i915_gem_object_is_framebuffer(obj))
> > > > > >>>> +		return 0;
> > > > > >>>
> > > > > >>> None of that seems appropriate for a generic gem function with this
> > > > > >>> name.
> > > > > >>
> > > > > >> It's not exported though, maybe remove i915_gem prefix to avoid any
> > > > > >> ideas of it being generic?
> > > > > >
> > > > > > These checks don't even seem to doing anything useful. HAS_GMCH should
> > > > > > already be covered by always setting PIN_MAPPABLE and hence
> O_NONBLOCK
> > > > [Kasireddy, Vivek] I can drop the HAS_GMCH(i915) check given that it is
> redundant.
> > > >
> > > > > > is never even tried, the pre-icl vs. icl+ check should not exist at
> > > > [Kasireddy, Vivek] My aim was to narrow down the list of situations in which the
> > > > ping-pong problem becomes more pronounced and may lead to performance
> > > > issues. Therefore, I added the DISPLAY_VER(i915) check since 8K/bigjoiner is
> > > > feasible only on those newer platforms.
> > >
> > > Like I said before bigjoiner is irrelevant. The only thing that
> > > matters is the size of the mapping vs. mappable aperture size.
> > [Kasireddy, Vivek] Ok, got it.
> >
> > >
> > > >
> > > > > > all IMO, and if this is only called for framebuffers then why does the
> > > > > > code pretend that is not the case?
> > > > [Kasireddy, Vivek] Oh, I added the i915_gem_object_is_framebuffer() check after I
> > > > found that there are other callers (for example, reloc_iomap() in
> i915_gem_execbuffer.c)
> > > > of i915_gem_object_ggtt_pin_ww() that may not be working on an fb.
> > > >
> > > > Also, I figured size < ggtt->mappable_end / 4 or a similar check is needed as we do
> > > > not want to subject all FBs through this performance critical path.
> > >
> > > Why not?
> > [Kasireddy, Vivek] Oh, I just thought that it makes sense to avoid the expensive hole
> search
> > for smaller FBs that are unlikely to exhaust the mappable aperture space. And, I also
> wanted
> > to preserve the behavior prior to this patch. However, I guess I could drop this check as
> well
> > given that the hole search via rbtree traversal may not be that bad in terms of time.
> >
> > What about the i915_gem_object_is_framebuffer() check? Should I keep it given the
> smaller
> > size and transient nature of batchbuffers that go through this path?
> 
> Hmm. I guess if we don't have any real idea on the cost of this then
> maybe we should not do it there, for the moment at least. I'm just
> pondering if this whole thing should just be a seaprate function
> each current caller of i915_gem_object_ggtt_pin_ww() should call
> explicitly... But maybe we want framebuffers in partcular to undergo
> the same restrictions always whether it's execbuffer or display
> activity that's trying to pin them.
> 
> However, now that I think about this I'm wondering if we should do
> all this stuff a bit later. I mean why are we even bothering with
> these checks if the vma is already bound and not misplaced?
[Kasireddy, Vivek] As I explain in the commit message, Weston flips
between two 8K FBs: FB1 and FB2 where
FB1 is mappable/fenceable and therefore not misplaced.
FB2 is NOT mappable and hence identified as misplaced.

And, when i915_gem_object_pin_to_display_plane() is called to pin
these FBs -- as part of pageflips -- this is what it does:
vma = ERR_PTR(-ENOSPC);
        if ((flags & PIN_MAPPABLE) == 0 &&
            (!view || view->type == I915_GGTT_VIEW_NORMAL))
                vma = i915_gem_object_ggtt_pin_ww(obj, ww, view, 0, alignment,
                                                  flags | PIN_MAPPABLE |
                                                  PIN_NONBLOCK);
        if (IS_ERR(vma) && vma != ERR_PTR(-EDEADLK))
                vma = i915_gem_object_ggtt_pin_ww(obj, ww, view, 0,
                                                  alignment, flags);

FB1 gets pinned in the first call to i915_gem_object_ggtt_pin_ww() (which
has PIN_MAPPABLE) and FB2 gets pinned in the second -- which does 
not have PIN_MAPPABLE as the first call fails because there is no 
space left in the aperture. However, during the next pageflips, FB1 
does not cause any issues but FB2 gets identified as misplaced
(because it fails the check
(flags & PIN_MAPPABLE && !i915_vma_is_map_and_fenceable(vma))
in the first call resulting in a costly unbind/eviction. It subsequently gets
pinned outside of mappable aperture in the second call though.

My solution -- with this patch -- is to just find out if there is enough space
in the mappable aperture for both FB1 and FB2 when either of them are getting
pinned -- with PIN_MAPPABLE. Neither of them are pinned into the aperture
if there is no room for both. One could argue that since there is room for FB1
in the aperture, we should let it be but I am not sure if this inconsistency
(i.e no FB2 but only FB1 in aperture) is OK with userspace apps.

> the vma_is_misplaced() case already has the same mappable_size/2
> check so that should probably be replaced with this thing, and then
[Kasireddy, Vivek] Right, this check seems redundant and can be removed.

> additionally we could do something like this:
> 
>   if (vma_is_miplaced()) {
>   	...
>   	if (flags & PIN_NONBLOCK)
>   		already have a check here;
> + } else if (!vma_is_bound()) {
> + 	if (flags & PIN_NONBLOCK)
> + 		check here too;
[Kasireddy, Vivek] Since it is the same check, I think doing it once
before if (i915_vma_misplaced()) should be ok as well.

Thanks,
Vivek

> + }
> 
> --
> Ville Syrjälä
> Intel

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2022-02-10  1:42 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-02  1:11 [Intel-gfx] [PATCH 1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation Vivek Kasireddy
2022-02-02  1:11 ` [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v5) Vivek Kasireddy
2022-02-02 13:19   ` Tvrtko Ursulin
2022-02-04  1:22     ` [Intel-gfx] [PATCH 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v6) Vivek Kasireddy
2022-02-07 10:49       ` Tvrtko Ursulin
2022-02-07 10:58       ` Ville Syrjälä
2022-02-07 11:47         ` Tvrtko Ursulin
2022-02-07 13:24           ` Ville Syrjälä
2022-02-07 13:36             ` Tvrtko Ursulin
2022-02-08  5:10               ` Kasireddy, Vivek
2022-02-08  5:43                 ` Ville Syrjälä
2022-02-09  1:47                   ` Kasireddy, Vivek
2022-02-09  7:42                     ` Ville Syrjälä
2022-02-10  1:41                       ` Kasireddy, Vivek
2022-02-02  1:43 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation Patchwork
2022-02-02  1:45 ` [Intel-gfx] ✗ Fi.CI.SPARSE: " Patchwork
2022-02-02  2:09 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
2022-02-04  1:45 ` [Intel-gfx] ✗ Fi.CI.BUILD: failure for series starting with [1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation (rev2) Patchwork
2022-02-07 10:51   ` Tvrtko Ursulin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.