* [PATCH 1/3] drm/i915/lrc: allocate separate page for HWSP
From: Chris Wilson @ 2017-07-12 22:57 UTC
  To: intel-gfx

From: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

On gen8+ we're currently using the PPHWSP of the kernel ctx as the
global HWSP. However, when the kernel ctx gets submitted (e.g. from
__intel_autoenable_gt_powersave) the HW will use that page as both
HWSP and PPHWSP. This causes a conflict in the register arena of the
HWSP, i.e. offsets below 0x80. We don't currently utilize this arena,
but in the following patches we will take advantage of the cached
register state for handling execlists' context-status interrupt.

To avoid the conflict, instead of re-using the PPHWSP of the kernel
ctx we can allocate a separate page for the HWSP, as is already done
on pre-execlists platforms.
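
As a rough sketch of the aliasing (the define names below are
illustrative only, not actual driver symbols; the offsets are the ones
used by this series):

	/* one 4KiB page doubling as global HWSP and kernel-ctx PPHWSP */
	#define HWSP_REG_ARENA_SIZE	0x80	/* cached register state sits below this byte offset */
	#define HWSP_CSB_MIRROR_DW	0x10	/* dword offset of the CSB mirror (used in patch 2) */
	#define HWSP_CSB_WRITE_PTR_DW	0x1f	/* dword offset of the CSB write-pointer mirror (patch 3) */

When the same page is also the kernel ctx's PPHWSP, the HW's
per-process status writes land in that same low arena and clobber it.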

v2: Add a use-case for the register arena.

Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1499357440-34688-1-git-send-email-daniele.ceraolospurio@intel.com
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_engine_cs.c  | 126 +++++++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/intel_lrc.c        |  34 +--------
 drivers/gpu/drm/i915/intel_ringbuffer.c | 125 +------------------------------
 3 files changed, 127 insertions(+), 158 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 24db316e0fd1..cba120f3d89d 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -445,6 +445,114 @@ static void intel_engine_cleanup_scratch(struct intel_engine_cs *engine)
 	i915_vma_unpin_and_release(&engine->scratch);
 }
 
+static void cleanup_phys_status_page(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+
+	if (!dev_priv->status_page_dmah)
+		return;
+
+	drm_pci_free(&dev_priv->drm, dev_priv->status_page_dmah);
+	engine->status_page.page_addr = NULL;
+}
+
+static void cleanup_status_page(struct intel_engine_cs *engine)
+{
+	struct i915_vma *vma;
+	struct drm_i915_gem_object *obj;
+
+	vma = fetch_and_zero(&engine->status_page.vma);
+	if (!vma)
+		return;
+
+	obj = vma->obj;
+
+	i915_vma_unpin(vma);
+	i915_vma_close(vma);
+
+	i915_gem_object_unpin_map(obj);
+	__i915_gem_object_release_unless_active(obj);
+}
+
+static int init_status_page(struct intel_engine_cs *engine)
+{
+	struct drm_i915_gem_object *obj;
+	struct i915_vma *vma;
+	unsigned int flags;
+	void *vaddr;
+	int ret;
+
+	obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE);
+	if (IS_ERR(obj)) {
+		DRM_ERROR("Failed to allocate status page\n");
+		return PTR_ERR(obj);
+	}
+
+	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
+	if (ret)
+		goto err;
+
+	vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL);
+	if (IS_ERR(vma)) {
+		ret = PTR_ERR(vma);
+		goto err;
+	}
+
+	flags = PIN_GLOBAL;
+	if (!HAS_LLC(engine->i915))
+		/* On g33, we cannot place HWS above 256MiB, so
+		 * restrict its pinning to the low mappable arena.
+		 * Though this restriction is not documented for
+		 * gen4, gen5, or byt, they also behave similarly
+		 * and hang if the HWS is placed at the top of the
+		 * GTT. To generalise, it appears that all !llc
+		 * platforms have issues with us placing the HWS
+		 * above the mappable region (even though we never
+		 * actualy map it).
+		 */
+		flags |= PIN_MAPPABLE;
+	ret = i915_vma_pin(vma, 0, 4096, flags);
+	if (ret)
+		goto err;
+
+	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
+	if (IS_ERR(vaddr)) {
+		ret = PTR_ERR(vaddr);
+		goto err_unpin;
+	}
+
+	engine->status_page.vma = vma;
+	engine->status_page.ggtt_offset = i915_ggtt_offset(vma);
+	engine->status_page.page_addr = memset(vaddr, 0, PAGE_SIZE);
+
+	DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n",
+			 engine->name, i915_ggtt_offset(vma));
+	return 0;
+
+err_unpin:
+	i915_vma_unpin(vma);
+err:
+	i915_gem_object_put(obj);
+	return ret;
+}
+
+static int init_phys_status_page(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+
+	GEM_BUG_ON(engine->id != RCS);
+
+	dev_priv->status_page_dmah =
+		drm_pci_alloc(&dev_priv->drm, PAGE_SIZE, PAGE_SIZE);
+	if (!dev_priv->status_page_dmah)
+		return -ENOMEM;
+
+	engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr;
+	memset(engine->status_page.page_addr, 0, PAGE_SIZE);
+
+	return 0;
+}
+
 /**
  * intel_engines_init_common - initialize cengine state which might require hw access
  * @engine: Engine to initialize.
@@ -480,10 +588,21 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
 
 	ret = i915_gem_render_state_init(engine);
 	if (ret)
-		goto err_unpin;
+		goto err_breadcrumbs;
+
+	if (HWS_NEEDS_PHYSICAL(engine->i915))
+		ret = init_phys_status_page(engine);
+	else
+		ret = init_status_page(engine);
+	if (ret)
+		goto err_rs_fini;
 
 	return 0;
 
+err_rs_fini:
+	i915_gem_render_state_fini(engine);
+err_breadcrumbs:
+	intel_engine_fini_breadcrumbs(engine);
 err_unpin:
 	engine->context_unpin(engine, engine->i915->kernel_context);
 	return ret;
@@ -500,6 +619,11 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
 {
 	intel_engine_cleanup_scratch(engine);
 
+	if (HWS_NEEDS_PHYSICAL(engine->i915))
+		cleanup_phys_status_page(engine);
+	else
+		cleanup_status_page(engine);
+
 	i915_gem_render_state_fini(engine);
 	intel_engine_fini_breadcrumbs(engine);
 	intel_engine_cleanup_cmd_parser(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 699868d81de8..9d231d0e427d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1641,11 +1641,6 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
 	if (engine->cleanup)
 		engine->cleanup(engine);
 
-	if (engine->status_page.vma) {
-		i915_gem_object_unpin_map(engine->status_page.vma->obj);
-		engine->status_page.vma = NULL;
-	}
-
 	intel_engine_cleanup_common(engine);
 
 	lrc_destroy_wa_ctx(engine);
@@ -1692,24 +1687,6 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
 }
 
-static int
-lrc_setup_hws(struct intel_engine_cs *engine, struct i915_vma *vma)
-{
-	const int hws_offset = LRC_PPHWSP_PN * PAGE_SIZE;
-	void *hws;
-
-	/* The HWSP is part of the default context object in LRC mode. */
-	hws = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
-	if (IS_ERR(hws))
-		return PTR_ERR(hws);
-
-	engine->status_page.page_addr = hws + hws_offset;
-	engine->status_page.ggtt_offset = i915_ggtt_offset(vma) + hws_offset;
-	engine->status_page.vma = vma;
-
-	return 0;
-}
-
 static void
 logical_ring_setup(struct intel_engine_cs *engine)
 {
@@ -1742,23 +1719,14 @@ logical_ring_setup(struct intel_engine_cs *engine)
 	logical_ring_default_irqs(engine);
 }
 
-static int
-logical_ring_init(struct intel_engine_cs *engine)
+static int logical_ring_init(struct intel_engine_cs *engine)
 {
-	struct i915_gem_context *dctx = engine->i915->kernel_context;
 	int ret;
 
 	ret = intel_engine_init_common(engine);
 	if (ret)
 		goto error;
 
-	/* And setup the hardware status page. */
-	ret = lrc_setup_hws(engine, dctx->engine[engine->id].state);
-	if (ret) {
-		DRM_ERROR("Failed to set up hws %s: %d\n", engine->name, ret);
-		goto error;
-	}
-
 	return 0;
 
 error:
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 5224b7abb8a3..b2580539051e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1174,113 +1174,7 @@ i915_emit_bb_start(struct drm_i915_gem_request *req,
 	return 0;
 }
 
-static void cleanup_phys_status_page(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-
-	if (!dev_priv->status_page_dmah)
-		return;
-
-	drm_pci_free(&dev_priv->drm, dev_priv->status_page_dmah);
-	engine->status_page.page_addr = NULL;
-}
-
-static void cleanup_status_page(struct intel_engine_cs *engine)
-{
-	struct i915_vma *vma;
-	struct drm_i915_gem_object *obj;
-
-	vma = fetch_and_zero(&engine->status_page.vma);
-	if (!vma)
-		return;
-
-	obj = vma->obj;
-
-	i915_vma_unpin(vma);
-	i915_vma_close(vma);
-
-	i915_gem_object_unpin_map(obj);
-	__i915_gem_object_release_unless_active(obj);
-}
 
-static int init_status_page(struct intel_engine_cs *engine)
-{
-	struct drm_i915_gem_object *obj;
-	struct i915_vma *vma;
-	unsigned int flags;
-	void *vaddr;
-	int ret;
-
-	obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE);
-	if (IS_ERR(obj)) {
-		DRM_ERROR("Failed to allocate status page\n");
-		return PTR_ERR(obj);
-	}
-
-	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
-	if (ret)
-		goto err;
-
-	vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL);
-	if (IS_ERR(vma)) {
-		ret = PTR_ERR(vma);
-		goto err;
-	}
-
-	flags = PIN_GLOBAL;
-	if (!HAS_LLC(engine->i915))
-		/* On g33, we cannot place HWS above 256MiB, so
-		 * restrict its pinning to the low mappable arena.
-		 * Though this restriction is not documented for
-		 * gen4, gen5, or byt, they also behave similarly
-		 * and hang if the HWS is placed at the top of the
-		 * GTT. To generalise, it appears that all !llc
-		 * platforms have issues with us placing the HWS
-		 * above the mappable region (even though we never
-		 * actualy map it).
-		 */
-		flags |= PIN_MAPPABLE;
-	ret = i915_vma_pin(vma, 0, 4096, flags);
-	if (ret)
-		goto err;
-
-	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
-	if (IS_ERR(vaddr)) {
-		ret = PTR_ERR(vaddr);
-		goto err_unpin;
-	}
-
-	engine->status_page.vma = vma;
-	engine->status_page.ggtt_offset = i915_ggtt_offset(vma);
-	engine->status_page.page_addr = memset(vaddr, 0, PAGE_SIZE);
-
-	DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n",
-			 engine->name, i915_ggtt_offset(vma));
-	return 0;
-
-err_unpin:
-	i915_vma_unpin(vma);
-err:
-	i915_gem_object_put(obj);
-	return ret;
-}
-
-static int init_phys_status_page(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-
-	GEM_BUG_ON(engine->id != RCS);
-
-	dev_priv->status_page_dmah =
-		drm_pci_alloc(&dev_priv->drm, PAGE_SIZE, PAGE_SIZE);
-	if (!dev_priv->status_page_dmah)
-		return -ENOMEM;
-
-	engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr;
-	memset(engine->status_page.page_addr, 0, PAGE_SIZE);
-
-	return 0;
-}
 
 int intel_ring_pin(struct intel_ring *ring,
 		   struct drm_i915_private *i915,
@@ -1567,17 +1461,10 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 	if (err)
 		goto err;
 
-	if (HWS_NEEDS_PHYSICAL(engine->i915))
-		err = init_phys_status_page(engine);
-	else
-		err = init_status_page(engine);
-	if (err)
-		goto err;
-
 	ring = intel_engine_create_ring(engine, 32 * PAGE_SIZE);
 	if (IS_ERR(ring)) {
 		err = PTR_ERR(ring);
-		goto err_hws;
+		goto err;
 	}
 
 	/* Ring wraparound at offset 0 sometimes hangs. No idea why. */
@@ -1592,11 +1479,6 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 
 err_ring:
 	intel_ring_free(ring);
-err_hws:
-	if (HWS_NEEDS_PHYSICAL(engine->i915))
-		cleanup_phys_status_page(engine);
-	else
-		cleanup_status_page(engine);
 err:
 	intel_engine_cleanup_common(engine);
 	return err;
@@ -1615,11 +1497,6 @@ void intel_engine_cleanup(struct intel_engine_cs *engine)
 	if (engine->cleanup)
 		engine->cleanup(engine);
 
-	if (HWS_NEEDS_PHYSICAL(dev_priv))
-		cleanup_phys_status_page(engine);
-	else
-		cleanup_status_page(engine);
-
 	intel_engine_cleanup_common(engine);
 
 	dev_priv->engine[engine->id] = NULL;
-- 
2.13.2


* [PATCH 2/3] drm/i915/execlists: Read the context-status buffer from the HWSP
From: Chris Wilson @ 2017-07-12 22:58 UTC
  To: intel-gfx; +Cc: Mika Kuoppala

The engine provides a mirror of the CSB in the HWSP. If we use
cacheable reads from the HWSP, we can shave off a few mmio reads per
context-switch interrupt (which are quite frequent!). Just removing a
couple of mmio reads is not enough to actually reduce any latency, but
it does give a small reduction in overall cpu usage.
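
In outline (condensed from the diff below), the conversion swaps an
uncached register read for a plain load from the coherent status page:

	/* cacheable mirror of the CSB, starting at dword 0x10 of the HWSP */
	u32 *buf = &engine->status_page.page_addr[0x10];
	status = buf[2 * head];		/* plain cacheable load */
	/* was: an uncached readl() of RING_CONTEXT_STATUS_BUF_LO */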

Much appreciation to Ben for dropping the bombshell that the CSB was in
the HWSP, and to Michel for digging out the details.

Suggested-by: Ben Widawsky <benjamin.widawsky@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 9d231d0e427d..e413465a552b 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -547,8 +547,8 @@ static void intel_lrc_irq_handler(unsigned long data)
 	while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
 		u32 __iomem *csb_mmio =
 			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
-		u32 __iomem *buf =
-			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
+		/* The HWSP contains a (cacheable) mirror of the CSB */
+		u32 *buf = &engine->status_page.page_addr[0x10];
 		unsigned int head, tail;
 
 		/* The write will be ordered by the uncached read (itself
@@ -590,13 +590,12 @@ static void intel_lrc_irq_handler(unsigned long data)
 			 * status notifier.
 			 */
 
-			status = readl(buf + 2 * head);
+			status = buf[2 * head];
 			if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
 				continue;
 
 			/* Check the context/desc id for this event matches */
-			GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
-					 port->context_id);
+			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
 
 			rq = port_unpack(port, &count);
 			GEM_BUG_ON(count == 0);
-- 
2.13.2


* [PATCH 3/3] drm/i915/execlists: Read the context-status HEAD from the HWSP
From: Chris Wilson @ 2017-07-12 22:58 UTC
  To: intel-gfx; +Cc: Mika Kuoppala

The engine also provides a mirror of the CSB write pointer in the
HWSP, but not of our read pointer. To take advantage of this we need to
remember where we read up to on the last interrupt and continue off from
there. This poses a problem following a reset, as we don't know where
the hw will start writing from, and due to the use of power contexts we
cannot perform that query during the reset itself. So we continue the
current modus operandi of delaying the first read of the context-status
read/write pointers until after the first interrupt. With this we should
now have eliminated all uncached mmio reads in handling the
context-status interrupt, though we still have the uncached mmio writes
for submitting new work, and many uncached mmio reads in the global
interrupt handler itself. Still a step in the right direction towards
reducing our resubmit latency, although it appears lost in the noise!
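
In outline, the bookkeeping this adds (condensed from the diff below):

	if (unlikely(engine->csb_head == -1)) {	/* first irq after a reset */
		head = readl(csb_mmio);		/* one uncached read to resync */
		tail = GEN8_CSB_WRITE_PTR(head);
		head = GEN8_CSB_READ_PTR(head);
		engine->csb_head = head;
	} else {
		head = engine->csb_head;	/* remembered from the last irq */
		tail = buf[0xf];		/* HWSP mirror of the write pointer */
	}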

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c        | 20 +++++++++++++++-----
 drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e413465a552b..db750abb905e 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -562,9 +562,15 @@ static void intel_lrc_irq_handler(unsigned long data)
 		 * is set and we do a new loop.
 		 */
 		__clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-		head = readl(csb_mmio);
-		tail = GEN8_CSB_WRITE_PTR(head);
-		head = GEN8_CSB_READ_PTR(head);
+		if (unlikely(engine->csb_head == -1)) { /* following a reset */
+			head = readl(csb_mmio);
+			tail = GEN8_CSB_WRITE_PTR(head);
+			head = GEN8_CSB_READ_PTR(head);
+			engine->csb_head = head;
+		} else {
+			head = engine->csb_head;
+			tail = buf[0xf];
+		}
 		while (head != tail) {
 			struct drm_i915_gem_request *rq;
 			unsigned int status;
@@ -618,8 +624,11 @@ static void intel_lrc_irq_handler(unsigned long data)
 				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
 		}
 
-		writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
-		       csb_mmio);
+		if (head != engine->csb_head) {
+			engine->csb_head = head;
+			writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
+			       csb_mmio);
+		}
 	}
 
 	if (execlists_elsp_ready(engine))
@@ -1246,6 +1255,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 
 	/* After a GPU reset, we may have requests to replay */
 	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
+	engine->csb_head = -1;
 
 	submit = false;
 	for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index d33c93444c0d..56751413e40c 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -391,6 +391,7 @@ struct intel_engine_cs {
 	struct rb_root execlist_queue;
 	struct rb_node *execlist_first;
 	unsigned int fw_domains;
+	unsigned int csb_head;
 
 	/* Contexts are pinned whilst they are active on the GPU. The last
 	 * context executed remains active whilst the GPU is idle - the
-- 
2.13.2


* ✓ Fi.CI.BAT: success for series starting with [1/3] drm/i915/lrc: allocate separate page for HWSP
From: Patchwork @ 2017-07-12 23:18 UTC
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/3] drm/i915/lrc: allocate separate page for HWSP
URL   : https://patchwork.freedesktop.org/series/27207/
State : success

== Summary ==

Series 27207v1 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/27207/revisions/1/mbox/

Test gem_exec_flush:
        Subgroup basic-batch-kernel-default-uc:
                pass       -> FAIL       (fi-snb-2600) fdo#100007
Test gem_exec_suspend:
        Subgroup basic-s4-devices:
                pass       -> DMESG-WARN (fi-kbl-7560u) fdo#100125 +1
Test kms_pipe_crc_basic:
        Subgroup hang-read-crc-pipe-a:
                pass       -> DMESG-WARN (fi-pnv-d510) fdo#101597
        Subgroup suspend-read-crc-pipe-b:
                dmesg-warn -> PASS       (fi-byt-j1900) fdo#101705

fdo#100007 https://bugs.freedesktop.org/show_bug.cgi?id=100007
fdo#100125 https://bugs.freedesktop.org/show_bug.cgi?id=100125
fdo#101597 https://bugs.freedesktop.org/show_bug.cgi?id=101597
fdo#101705 https://bugs.freedesktop.org/show_bug.cgi?id=101705

fi-bdw-5557u     total:279  pass:268  dwarn:0   dfail:0   fail:0   skip:11  time:445s
fi-bdw-gvtdvm    total:279  pass:265  dwarn:0   dfail:0   fail:0   skip:14  time:430s
fi-blb-e6850     total:279  pass:224  dwarn:1   dfail:0   fail:0   skip:54  time:357s
fi-bsw-n3050     total:279  pass:243  dwarn:0   dfail:0   fail:0   skip:36  time:524s
fi-bxt-j4205     total:279  pass:260  dwarn:0   dfail:0   fail:0   skip:19  time:510s
fi-byt-j1900     total:279  pass:255  dwarn:0   dfail:0   fail:0   skip:24  time:489s
fi-byt-n2820     total:279  pass:250  dwarn:1   dfail:0   fail:0   skip:28  time:483s
fi-glk-2a        total:279  pass:260  dwarn:0   dfail:0   fail:0   skip:19  time:591s
fi-hsw-4770      total:279  pass:263  dwarn:0   dfail:0   fail:0   skip:16  time:433s
fi-hsw-4770r     total:279  pass:263  dwarn:0   dfail:0   fail:0   skip:16  time:412s
fi-ilk-650       total:279  pass:229  dwarn:0   dfail:0   fail:0   skip:50  time:422s
fi-ivb-3520m     total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:510s
fi-ivb-3770      total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:464s
fi-kbl-7500u     total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:464s
fi-kbl-7560u     total:279  pass:268  dwarn:1   dfail:0   fail:0   skip:10  time:569s
fi-kbl-r         total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:588s
fi-pnv-d510      total:279  pass:221  dwarn:3   dfail:0   fail:0   skip:55  time:562s
fi-skl-6260u     total:279  pass:269  dwarn:0   dfail:0   fail:0   skip:10  time:457s
fi-skl-6700hq    total:279  pass:262  dwarn:0   dfail:0   fail:0   skip:17  time:583s
fi-skl-6700k     total:279  pass:257  dwarn:4   dfail:0   fail:0   skip:18  time:465s
fi-skl-6770hq    total:279  pass:269  dwarn:0   dfail:0   fail:0   skip:10  time:486s
fi-skl-gvtdvm    total:279  pass:266  dwarn:0   dfail:0   fail:0   skip:13  time:432s
fi-skl-x1585l    total:279  pass:269  dwarn:0   dfail:0   fail:0   skip:10  time:486s
fi-snb-2520m     total:279  pass:251  dwarn:0   dfail:0   fail:0   skip:28  time:542s
fi-snb-2600      total:279  pass:249  dwarn:0   dfail:0   fail:1   skip:29  time:407s

8ad9e19aafea47c272163c2cbf554e06ff7f9857 drm-tip: 2017y-07m-11d-19h-08m-20s UTC integration manifest
bc682b0 drm/i915/execlists: Read the context-status HEAD from the HWSP
591aae0 drm/i915/execlists: Read the context-status buffer from the HWSP
7847371 drm/i915/lrc: allocate separate page for HWSP

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_5177/

* Re: [PATCH 3/3] drm/i915/execlists: Read the context-status HEAD from the HWSP
From: Daniele Ceraolo Spurio @ 2017-07-12 23:41 UTC
  To: Chris Wilson, intel-gfx; +Cc: Mika Kuoppala



On 12/07/17 15:58, Chris Wilson wrote:
> The engine also provides a mirror of the CSB write pointer in the
> HWSP, but not of our read pointer. To take advantage of this we need to
> remember where we read up to on the last interrupt and continue off from
> there. This poses a problem following a reset, as we don't know where
> the hw will start writing from, and due to the use of power contexts we
> cannot perform that query during the reset itself. So we continue the
> current modus operandi of delaying the first read of the context-status
> read/write pointers until after the first interrupt. With this we should
> now have eliminated all uncached mmio reads in handling the
> context-status interrupt, though we still have the uncached mmio writes
> for submitting new work, and many uncached mmio reads in the global
> interrupt handler itself. Still a step in the right direction towards
> reducing our resubmit latency, although it appears lost in the noise!
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>   drivers/gpu/drm/i915/intel_lrc.c        | 20 +++++++++++++++-----
>   drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
>   2 files changed, 16 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index e413465a552b..db750abb905e 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -562,9 +562,15 @@ static void intel_lrc_irq_handler(unsigned long data)
>   		 * is set and we do a new loop.
>   		 */
>   		__clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> -		head = readl(csb_mmio);
> -		tail = GEN8_CSB_WRITE_PTR(head);
> -		head = GEN8_CSB_READ_PTR(head);
> +		if (unlikely(engine->csb_head == -1)) { /* following a reset */
> +			head = readl(csb_mmio);
> +			tail = GEN8_CSB_WRITE_PTR(head);
> +			head = GEN8_CSB_READ_PTR(head);
> +			engine->csb_head = head;
> +		} else {
> +			head = engine->csb_head;
> +			tail = buf[0xf];

In CNL the tail moves to offset 0x2f of the HWSP (i.e. buf[0x1f]); it
might be worth considering that immediately since CNL support is being merged.
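
Something along these lines, perhaps (a hypothetical helper, offsets
taken from this discussion; not part of the patch):

	/* dword offset of the CSB write-pointer mirror in the HWSP:
	 * 0x1f on gen8..gen9, reportedly 0x2f on CNL (gen10)
	 */
	static inline int hwsp_csb_write_ptr_dw(const struct drm_i915_private *i915)
	{
		return INTEL_GEN(i915) >= 10 ? 0x2f : 0x1f;
	}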

-Daniele


* Re: [PATCH 1/3] drm/i915/lrc: allocate separate page for HWSP
From: Michel Thierry @ 2017-07-12 23:51 UTC
  To: Chris Wilson, intel-gfx

On 7/12/2017 3:57 PM, Chris Wilson wrote:
> From: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> 
> On gen8+ we're currently using the PPHWSP of the kernel ctx as the
> global HWSP. However, when the kernel ctx gets submitted (e.g. from
> __intel_autoenable_gt_powersave) the HW will use that page as both
> HWSP and PPHWSP. This causes a conflict in the register arena of the
> HWSP, i.e. offsets below 0x80. We don't currently utilize this arena,
> but in the following patches we will take advantage of the cached
> register state for handling execlists' context-status interrupt.
> 
> To avoid the conflict, instead of re-using the PPHWSP of the kernel
> ctx we can allocate a separate page for the HWSP, as is already done
> on pre-execlists platforms.
> 

I'm sure you haven't forgotten, but anyway a reminder for others; this 
change depends on "drm/i915/guc: Don't make assumptions while getting 
the lrca offset" [https://patchwork.freedesktop.org/patch/166519/], or 
guc loading will break.

> v2: Add a use-case for the register arena.
> 

s/arena/area/ ?

I imagine this refers to the commit msg only, right? (I couldn't find 
this difference compared to the original patch... the only small changes 
I noticed were in intel_engine_init_common and logical_ring_init).

Apart from those 2 things, it looks good to me.

Reviewed-by: Michel Thierry <michel.thierry@intel.com>

> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Link: http://patchwork.freedesktop.org/patch/msgid/1499357440-34688-1-git-send-email-daniele.ceraolospurio@intel.com
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/intel_engine_cs.c  | 126 +++++++++++++++++++++++++++++++-
>   drivers/gpu/drm/i915/intel_lrc.c        |  34 +--------
>   drivers/gpu/drm/i915/intel_ringbuffer.c | 125 +------------------------------
>   3 files changed, 127 insertions(+), 158 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 24db316e0fd1..cba120f3d89d 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -445,6 +445,114 @@ static void intel_engine_cleanup_scratch(struct intel_engine_cs *engine)
>   	i915_vma_unpin_and_release(&engine->scratch);
>   }
>   
> +static void cleanup_phys_status_page(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_private *dev_priv = engine->i915;
> +
> +	if (!dev_priv->status_page_dmah)
> +		return;
> +
> +	drm_pci_free(&dev_priv->drm, dev_priv->status_page_dmah);
> +	engine->status_page.page_addr = NULL;
> +}
> +
> +static void cleanup_status_page(struct intel_engine_cs *engine)
> +{
> +	struct i915_vma *vma;
> +	struct drm_i915_gem_object *obj;
> +
> +	vma = fetch_and_zero(&engine->status_page.vma);
> +	if (!vma)
> +		return;
> +
> +	obj = vma->obj;
> +
> +	i915_vma_unpin(vma);
> +	i915_vma_close(vma);
> +
> +	i915_gem_object_unpin_map(obj);
> +	__i915_gem_object_release_unless_active(obj);
> +}
> +
> +static int init_status_page(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_gem_object *obj;
> +	struct i915_vma *vma;
> +	unsigned int flags;
> +	void *vaddr;
> +	int ret;
> +
> +	obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE);
> +	if (IS_ERR(obj)) {
> +		DRM_ERROR("Failed to allocate status page\n");
> +		return PTR_ERR(obj);
> +	}
> +
> +	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
> +	if (ret)
> +		goto err;
> +
> +	vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL);
> +	if (IS_ERR(vma)) {
> +		ret = PTR_ERR(vma);
> +		goto err;
> +	}
> +
> +	flags = PIN_GLOBAL;
> +	if (!HAS_LLC(engine->i915))
> +		/* On g33, we cannot place HWS above 256MiB, so
> +		 * restrict its pinning to the low mappable arena.
> +		 * Though this restriction is not documented for
> +		 * gen4, gen5, or byt, they also behave similarly
> +		 * and hang if the HWS is placed at the top of the
> +		 * GTT. To generalise, it appears that all !llc
> +		 * platforms have issues with us placing the HWS
> +		 * above the mappable region (even though we never
> +		 * actualy map it).
> +		 */
> +		flags |= PIN_MAPPABLE;
> +	ret = i915_vma_pin(vma, 0, 4096, flags);
> +	if (ret)
> +		goto err;
> +
> +	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
> +	if (IS_ERR(vaddr)) {
> +		ret = PTR_ERR(vaddr);
> +		goto err_unpin;
> +	}
> +
> +	engine->status_page.vma = vma;
> +	engine->status_page.ggtt_offset = i915_ggtt_offset(vma);
> +	engine->status_page.page_addr = memset(vaddr, 0, PAGE_SIZE);
> +
> +	DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n",
> +			 engine->name, i915_ggtt_offset(vma));
> +	return 0;
> +
> +err_unpin:
> +	i915_vma_unpin(vma);
> +err:
> +	i915_gem_object_put(obj);
> +	return ret;
> +}
> +
> +static int init_phys_status_page(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_private *dev_priv = engine->i915;
> +
> +	GEM_BUG_ON(engine->id != RCS);
> +
> +	dev_priv->status_page_dmah =
> +		drm_pci_alloc(&dev_priv->drm, PAGE_SIZE, PAGE_SIZE);
> +	if (!dev_priv->status_page_dmah)
> +		return -ENOMEM;
> +
> +	engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr;
> +	memset(engine->status_page.page_addr, 0, PAGE_SIZE);
> +
> +	return 0;
> +}
> +
>   /**
>    * intel_engines_init_common - initialize cengine state which might require hw access
>    * @engine: Engine to initialize.
> @@ -480,10 +588,21 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
>   
>   	ret = i915_gem_render_state_init(engine);
>   	if (ret)
> -		goto err_unpin;
> +		goto err_breadcrumbs;
> +
> +	if (HWS_NEEDS_PHYSICAL(engine->i915))
> +		ret = init_phys_status_page(engine);
> +	else
> +		ret = init_status_page(engine);
> +	if (ret)
> +		goto err_rs_fini;
>   
>   	return 0;
>   
> +err_rs_fini:
> +	i915_gem_render_state_fini(engine);
> +err_breadcrumbs:
> +	intel_engine_fini_breadcrumbs(engine);

Good catch from the original RFC.

> [...]

* Re: [PATCH 3/3] drm/i915/execlists: Read the context-status HEAD from the HWSP
From: Michel Thierry @ 2017-07-13  0:38 UTC
  To: Daniele Ceraolo Spurio, Chris Wilson, intel-gfx; +Cc: Kuoppala, Mika

On 7/12/2017 4:41 PM, Daniele Ceraolo Spurio wrote:
> 
> 
> On 12/07/17 15:58, Chris Wilson wrote:
>> The engine also provides a mirror of the CSB write pointer in the
>> HWSP, but not of our read pointer. To take advantage of this we need to
>> remember where we read up to on the last interrupt and continue off from
>> there. This poses a problem following a reset, as we don't know where
>> the hw will start writing from, and due to the use of power contexts we
>> cannot perform that query during the reset itself. So we continue the
>> current modus operandi of delaying the first read of the context-status
>> read/write pointers until after the first interrupt. With this we should
>> now have eliminated all uncached mmio reads in handling the
>> context-status interrupt, though we still have the uncached mmio writes
>> for submitting new work, and many uncached mmio reads in the global
>> interrupt handler itself. Still a step in the right direction towards
>> reducing our resubmit latency, although it appears lost in the noise!
>>
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Michel Thierry <michel.thierry@intel.com>
>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
>> ---
>>    drivers/gpu/drm/i915/intel_lrc.c        | 20 +++++++++++++++-----
>>    drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
>>    2 files changed, 16 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index e413465a552b..db750abb905e 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -562,9 +562,15 @@ static void intel_lrc_irq_handler(unsigned long data)
>>                 * is set and we do a new loop.
>>                 */
>>                __clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
>> -             head = readl(csb_mmio);
>> -             tail = GEN8_CSB_WRITE_PTR(head);
>> -             head = GEN8_CSB_READ_PTR(head);
>> +             if (unlikely(engine->csb_head == -1)) { /* following a reset */
>> +                     head = readl(csb_mmio);
>> +                     tail = GEN8_CSB_WRITE_PTR(head);
>> +                     head = GEN8_CSB_READ_PTR(head);
>> +                     engine->csb_head = head;
>> +             } else {
>> +                     head = engine->csb_head;
>> +                     tail = buf[0xf];
> 
> In CNL the tail moves to offset 0x2f of the HWSP (i.e. buf[0x1f]); it
> might be worth considering that immediately since CNL support is being merged.
> 
> -Daniele
> 

Also right now it is implicitly doing csb_head_dword - csb_start_dword 
(0x1f - 0x10). I would vote to have them as defines.
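
E.g. (a sketch only; names here follow the existing HWSP convention but
are not in the patch):

	#define I915_GEM_HWS_CSB_START	0x10
	#define I915_GEM_HWS_CSB_WRITE	0x1f

	tail = buf[I915_GEM_HWS_CSB_WRITE - I915_GEM_HWS_CSB_START];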

And my only other comment: shouldn't the same changes also apply to the
i915_engine_info debugfs? For example: https://hastebin.com/tahurezeri

-Michel


* Re: [PATCH 2/3] drm/i915/execlists: Read the context-status buffer from the HWSP
From: Michel Thierry @ 2017-07-13  0:40 UTC
  To: Chris Wilson, intel-gfx; +Cc: Mika Kuoppala

On 7/12/2017 3:58 PM, Chris Wilson wrote:
> The engine provides a mirror of the CSB in the HWSP. If we use
> cacheable reads from the HWSP, we can shave off a few mmio reads per
> context-switch interrupt (which are quite frequent!). Just removing a
> couple of mmio reads is not enough to actually reduce any latency, but
> it does give a small reduction in overall cpu usage.
> 
> Much appreciation to Ben for dropping the bombshell that the CSB was in
> the HWSP, and to Michel for digging out the details.
> 
> Suggested-by: Ben Widawsky <benjamin.widawsky@intel.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>   drivers/gpu/drm/i915/intel_lrc.c | 9 ++++-----
>   1 file changed, 4 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 9d231d0e427d..e413465a552b 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -547,8 +547,8 @@ static void intel_lrc_irq_handler(unsigned long data)
>   	while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
>   		u32 __iomem *csb_mmio =
>   			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
> -		u32 __iomem *buf =
> -			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
> +		/* The HWSP contains a (cacheable) mirror of the CSB */
> +		u32 *buf = &engine->status_page.page_addr[0x10];

I would add the dword offset as a define (and next to the other ones we 
already have for the HWSP). It'll also help the next patch, which reads 
the head, i.e.:

@@ -500,4 +500,5 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
   *
   * The area from dword 0x30 to 0x3ff is available for driver usage.
   */
+#define I915_GEM_HWS_CSB_START         0x10
  #define I915_GEM_HWS_INDEX             0x30
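
With a define like that, the read in this patch would become (sketch):

	u32 *buf = &engine->status_page.page_addr[I915_GEM_HWS_CSB_START];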

And surprisingly, there's already an old comment about these dwords' 
reserved meaning (so they have just been reused);

  * The following dwords have a reserved meaning:
  * 0x00: ISR copy, updated when an ISR bit not set in the HWSTAM
...
<< * 0x10-0x1b: Context status DWords (GM45)  >>
<< * 0x1f: Last written status offset. (GM45) >>
    * 0x20-0x2f: Reserved (Gen6+)

But yes, I can confirm it works in skl too.

Acked-by: Michel Thierry <michel.thierry@intel.com>


* Re: [PATCH 3/3] drm/i915/execlists: Read the context-status HEAD from the HWSP
From: Chris Wilson @ 2017-07-13  8:00 UTC
  To: Michel Thierry, Daniele Ceraolo Spurio, intel-gfx; +Cc: Kuoppala, Mika

Quoting Michel Thierry (2017-07-13 01:38:18)
> On 7/12/2017 4:41 PM, Daniele Ceraolo Spurio wrote:
> > 
> > 
> > On 12/07/17 15:58, Chris Wilson wrote:
> >> The engine also provides a mirror of the CSB write pointer in the
> >> HWSP, but not of our read pointer. To take advantage of this we need to
> >> remember where we read up to on the last interrupt and continue off from
> >> there. This poses a problem following a reset, as we don't know where
> >> the hw will start writing from, and due to the use of power contexts we
> >> cannot perform that query during the reset itself. So we continue the
> >> current modus operandi of delaying the first read of the context-status
> >> read/write pointers until after the first interrupt. With this we should
> >> now have eliminated all uncached mmio reads in handling the
> >> context-status interrupt, though we still have the uncached mmio writes
> >> for submitting new work, and many uncached mmio reads in the global
> >> interrupt handler itself. Still a step in the right direction towards
> >> reducing our resubmit latency, although it appears lost in the noise!
> >>
> >> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >> Cc: Michel Thierry <michel.thierry@intel.com>
> >> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> >> ---
> >>    drivers/gpu/drm/i915/intel_lrc.c        | 20 +++++++++++++++-----
> >>    drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
> >>    2 files changed, 16 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> >> index e413465a552b..db750abb905e 100644
> >> --- a/drivers/gpu/drm/i915/intel_lrc.c
> >> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> >> @@ -562,9 +562,15 @@ static void intel_lrc_irq_handler(unsigned long data)
> >>                 * is set and we do a new loop.
> >>                 */
> >>                __clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> >> -             head = readl(csb_mmio);
> >> -             tail = GEN8_CSB_WRITE_PTR(head);
> >> -             head = GEN8_CSB_READ_PTR(head);
> >> +             if (unlikely(engine->csb_head == -1)) { /* following a reset */
> >> +                     head = readl(csb_mmio);
> >> +                     tail = GEN8_CSB_WRITE_PTR(head);
> >> +                     head = GEN8_CSB_READ_PTR(head);
> >> +                     engine->csb_head = head;
> >> +             } else {
> >> +                     head = engine->csb_head;
> >> +                     tail = buf[0xf];
> > 
> > In CNL the tail moves to offset 0x2f of the HWSP (i.e. buf[0x1f]); it
> > might be worth considering that immediately since CNL support is being merged.
> > 
> > -Daniele
> > 
> 
> Also right now it is implicitly doing csb_head_dword - csb_start_dword 
> (0x1f - 0x10). I would vote to have them as defines.
> 
> And my only other comment: shouldn't the same changes also apply to the
> i915_engine_info debugfs? For example: https://hastebin.com/tahurezeri

Both. You only start to look in debugfs when you suspect something is
going wrong, so you definitely want the confirmation of checking the
mmio against the hwsp.
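
Something like the following, say (a sketch, not the actual
i915_engine_info code; the output format is made up):

	u32 ptr = I915_READ(RING_CONTEXT_STATUS_PTR(engine));
	u32 *hws = engine->status_page.page_addr;

	seq_printf(m, "\tCSB: mmio rd/wr=%d/%d, hwsp wr=%d, driver rd=%d\n",
		   GEN8_CSB_READ_PTR(ptr), GEN8_CSB_WRITE_PTR(ptr),
		   hws[0x1f], engine->csb_head);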
-Chris

* Re: [PATCH 1/3] drm/i915/lrc: allocate separate page for HWSP
From: Chris Wilson @ 2017-07-13  8:33 UTC
  To: Michel Thierry, intel-gfx

Quoting Michel Thierry (2017-07-13 00:51:11)
> On 7/12/2017 3:57 PM, Chris Wilson wrote:
> > From: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> > 
> > On gen8+ we're currently using the PPHWSP of the kernel ctx as the
> > global HWSP. However, when the kernel ctx gets submitted (e.g. from
> > __intel_autoenable_gt_powersave) the HW will use that page as both
> > HWSP and PPHWSP. This causes a conflict in the register arena of the
> > HWSP, i.e. offsets below 0x80. We don't currently utilize this arena,
> > but in the following patches we will take advantage of the cached
> > register state for handling execlists' context-status interrupt.
> > 
> > To avoid the conflict, instead of re-using the PPHWSP of the kernel
> > ctx we can allocate a separate page for the HWSP, as is already done
> > on pre-execlists platforms.
> > 
> 
> I'm sure you haven't forgotten, but anyway a reminder for others; this 
> change depends on "drm/i915/guc: Don't make assumptions while getting 
> the lrca offset" [https://patchwork.freedesktop.org/patch/166519/], or 
> guc loading will break.
> 
> > v2: Add a use-case for the register arena.
> > 
> 
> s/arena/area/ ?

I like arena when I want the impression of an enclosed space, and area
for an open space. Here the registers are a reserved block inside the
hwsp, which I felt acted like an enclosure.
-Chris

Thread overview: 10 messages
2017-07-12 22:57 [PATCH 1/3] drm/i915/lrc: allocate separate page for HWSP Chris Wilson
2017-07-12 22:58 ` [PATCH 2/3] drm/i915/execlists: Read the context-status buffer from the HWSP Chris Wilson
2017-07-13  0:40   ` Michel Thierry
2017-07-12 22:58 ` [PATCH 3/3] drm/i915/execlists: Read the context-status HEAD " Chris Wilson
2017-07-12 23:41   ` Daniele Ceraolo Spurio
2017-07-13  0:38     ` Michel Thierry
2017-07-13  8:00       ` Chris Wilson
2017-07-12 23:18 ` ✓ Fi.CI.BAT: success for series starting with [1/3] drm/i915/lrc: allocate separate page for HWSP Patchwork
2017-07-12 23:51 ` [PATCH 1/3] " Michel Thierry
2017-07-13  8:33   ` Chris Wilson
